Transforms RandomForest estimators' non-consecutive labels to consecutive labels where appropriate #4780

Merged · 13 commits · Sep 29, 2022
6 changes: 3 additions & 3 deletions python/cuml/ensemble/randomforest_common.pyx
@@ -36,6 +36,7 @@ from cuml.ensemble.randomforest_shared import treelite_serialize, \
 from cuml.ensemble.randomforest_shared cimport *
 from cuml.common import input_to_cuml_array
 from cuml.common.array_descriptor import CumlArrayDescriptor
+from cuml.prims.label.classlabels import make_monotonic


 class BaseRandomForestModel(Base):
@@ -285,9 +286,8 @@ class BaseRandomForestModel(Base):
             self.num_classes = len(self.classes_)
             for i in range(self.num_classes):
                 if i not in self.classes_:
-                    raise ValueError("The labels need "
-                                     "to be consecutive values from "
-                                     "0 to the number of unique label values")
+                    y_m, _ = make_monotonic(y_m)
+                    break
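The net effect of the new branch: if the class labels are not already the consecutive integers 0..n_classes-1, remap them before training instead of raising. A rough NumPy sketch of what the make_monotonic prim computes (the real prim operates on device arrays; the sketch name is mine):

```python
import numpy as np

def make_monotonic_sketch(y):
    # Map arbitrary label values to consecutive integers 0..n_classes-1,
    # preserving the sort order of the original label values.
    classes = np.unique(y)                        # e.g. [-5, 1, 7]
    return np.searchsorted(classes, y), classes   # labels now in {0, 1, 2}
```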

Member:

I wonder if the logic for the loop might be better placed in make_monotonic, perhaps with a parameter like make_monotonic(array, check_already_monotonic=True), so that every use of the prim is cleaner? What do you think?

Member:

I'd vote we make that change in RAFT, plumb raft::label::make_monotonic through to Python (rapidsai/raft#640), and then make a follow-up PR to use that in cuML and remove this prim.

Context: #4478 (comment)

What do you think? With that said, we could just do both :)

Contributor Author:

I did not modify the loop because I wanted to make the fewest possible changes to the code. However, we can use the check_labels primitive to see whether the labels are already monotonic. I can make that change for a cleaner implementation.

Member:

@beckernick I am on board with that idea.
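For illustration, the API suggested above might look roughly like the following (hypothetical signature and parameter name from the discussion, with NumPy standing in for the device prims):

```python
import numpy as np

def make_monotonic(labels, check_already_monotonic=True):
    # Hypothetical variant of the prim: skip the remap when the labels
    # are already the consecutive integers 0..n_classes-1.
    classes = np.unique(labels)
    if check_already_monotonic and np.array_equal(
            classes, np.arange(len(classes))):
        return labels, classes  # already monotonic; nothing to do
    return np.searchsorted(classes, labels), classes
```

With that in place, the fit-time loop above would collapse to a single unconditional y_m, _ = make_monotonic(y_m) call.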

         else:
             y_m, _, _, y_dtype = \
6 changes: 6 additions & 0 deletions python/cuml/ensemble/randomforestclassifier.pyx
@@ -46,6 +46,7 @@ from libc.stdint cimport uintptr_t, uint64_t
 from libc.stdlib cimport calloc, malloc, free

 from numba import cuda
+from cuml.prims.label.classlabels import check_labels, invert_labels

 from raft.common.handle cimport handle_t
 cimport cuml.common.cuda
@@ -431,6 +432,8 @@ class RandomForestClassifier(BaseRandomForestModel,

         X_m, y_m, max_feature_val = self._dataset_setup_for_fit(X, y,
                                                                  convert_dtype)
+        # Track the labels to see if update is necessary
+        self.update_labels = not check_labels(y_m, self.classes_)
         cdef uintptr_t X_ptr, y_ptr

         X_ptr = X_m.ptr
@@ -611,6 +614,9 @@
             fil_sparse_format=fil_sparse_format,
             predict_proba=False)

+        if self.update_labels:
+            preds = preds.to_output().astype(self.classes_.dtype)
+            preds = invert_labels(preds, self.classes_)
         return preds

     @insert_into_docstring(parameters=[('dense', '(n_samples, n_features)')],
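Together with the fit-side check_labels call, the two changes form a round trip: labels are remapped to consecutive integers before training and inverted back to the user's original values at predict time. A minimal sketch of the invariant, again with NumPy stand-ins for the device prims:

```python
import numpy as np

classes = np.array([-5, 1, 7])       # user-supplied, non-consecutive labels
y = np.array([7, -5, -5, 1])

mono = np.searchsorted(classes, y)   # make_monotonic: [2, 0, 0, 1]
restored = classes[mono]             # invert_labels:  [7, -5, -5, 1]
assert np.array_equal(restored, y)
```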
2 changes: 1 addition & 1 deletion python/cuml/tests/test_nearest_neighbors.py
@@ -1,4 +1,4 @@

#
# Copyright (c) 2019-2022, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
61 changes: 61 additions & 0 deletions python/cuml/tests/test_random_forest.py
@@ -307,6 +307,67 @@ def test_rf_classification(small_clf, datatype, max_samples, max_features):
     assert fil_acc >= (cuml_acc - 0.07)  # to be changed to 0.02. see issue #3910: https://github.com/rapidsai/cuml/issues/3910 # noqa


@pytest.mark.parametrize(
"max_samples", [unit_param(1.0), quality_param(0.90), stress_param(0.95)]
)
@pytest.mark.parametrize("datatype", [np.float32, np.float64])
@pytest.mark.parametrize("max_features", [1.0, "auto", "log2", "sqrt"])
@pytest.mark.parametrize("b", [0, 5, -5, 10])
@pytest.mark.parametrize("a", [1, 2, 3])
Member:

I don't think we need this full matrix of tests for the monotonic case; one combination for each datatype would be enough?

Contributor Author:

Sure, I will fix a single value for a, b, and max_features.
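A reduced parametrization along those lines might look like this (values are illustrative, not from the PR):

```python
@pytest.mark.parametrize("datatype", [np.float32, np.float64])
@pytest.mark.parametrize("a,b", [(2, -5)])
def test_rf_classification_unorder(small_clf, datatype, a, b):
    ...  # body as below, with max_samples and max_features fixed at 1.0
```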

def test_rf_classification_unorder(
        small_clf, datatype, max_samples, max_features, a, b):
    use_handle = True

    X, y = small_clf
    X = X.astype(datatype)
    y = y.astype(np.int32)
    # affine transformation
    y = a * y + b
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0
    )
    # Create a handle for the cuml model
    handle, stream = get_handle(use_handle, n_streams=1)

    # Initialize, fit and predict using cuML's
    # random forest classification model
    cuml_model = curfc(
        max_features=max_features,
        max_samples=max_samples,
        n_bins=16,
        split_criterion=0,
        min_samples_leaf=2,
        random_state=123,
        n_streams=1,
        n_estimators=40,
        handle=handle,
        max_leaves=-1,
        max_depth=16,
    )
    cuml_model.fit(X_train, y_train)

    fil_preds = cuml_model.predict(
        X_test, predict_model="GPU", threshold=0.5, algo="auto"
    )
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    fil_preds = np.reshape(fil_preds, np.shape(cu_preds))
    cuml_acc = accuracy_score(y_test, cu_preds)
    fil_acc = accuracy_score(y_test, fil_preds)
    if X.shape[0] < 500000:
        sk_model = skrfc(
            n_estimators=40,
            max_depth=16,
            min_samples_split=2,
            max_features=max_features,
            random_state=10,
        )
        sk_model.fit(X_train, y_train)
        sk_preds = sk_model.predict(X_test)
        sk_acc = accuracy_score(y_test, sk_preds)
        assert fil_acc >= (sk_acc - 0.07)
    assert fil_acc >= (cuml_acc - 0.07)  # to be changed to 0.02. see issue #3910: https://github.com/rapidsai/cuml/issues/3910 # noqa
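For reference, a minimal end-user view of the behavior this test exercises (illustrative, not part of the PR; assumes a CUDA-capable environment with cuml installed):

```python
import numpy as np
from cuml.ensemble import RandomForestClassifier

X = np.random.rand(200, 4).astype(np.float32)
y = (np.random.randint(0, 2, 200) * 3 + 5).astype(np.int32)  # labels {5, 8}

clf = RandomForestClassifier(n_estimators=10).fit(X, y)
preds = clf.predict(X)
# Predictions come back in the original label space, not remapped 0/1
assert set(np.unique(preds)).issubset({5, 8})
```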


@pytest.mark.parametrize(
"max_samples", [unit_param(1.0), quality_param(0.90), stress_param(0.95)]
)