Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Transforms RandomForest estimators' non-consecutive labels to consecutive labels where appropriate #4780

Merged
merged 13 commits into from
Sep 29, 2022
12 changes: 9 additions & 3 deletions python/cuml/ensemble/randomforest_common.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -285,9 +285,15 @@ class BaseRandomForestModel(Base):
self.num_classes = len(self.classes_)
for i in range(self.num_classes):
if i not in self.classes_:
raise ValueError("The labels need "
"to be consecutive values from "
"0 to the number of unique label values")
self.classes_unorder = cp.unique(y_m).tolist()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should be reusing existing primitives where at all possible and using the make_monotonic primitive to do this. That allows us to optimize this specific operation once and have it benefit all uses.

table = {val: i for i, val in enumerate((self.classes_unorder))}
y_m, _, _, _ = input_to_cuml_array(
cp.asarray([table[v] for v in (y_m).to_output('cupy').tolist()]),
check_dtype=np.int32,
convert_to_dtype=(np.int32 if convert_dtype
else None),
check_rows=self.n_rows, check_cols=1)
break

else:
y_m, _, _, y_dtype = \
Expand Down
60 changes: 60 additions & 0 deletions python/cuml/tests/test_random_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,66 @@ def test_rf_classification(small_clf, datatype, max_samples, max_features):
assert fil_acc >= (cuml_acc - 0.07) # to be changed to 0.02. see issue #3910: https://github.com/rapidsai/cuml/issues/3910 # noqa


@pytest.mark.parametrize(
"max_samples", [unit_param(1.0), quality_param(0.90), stress_param(0.95)]
)
@pytest.mark.parametrize("datatype", [np.float32, np.float64])
@pytest.mark.parametrize("max_features", [1.0, "auto", "log2", "sqrt"])
@pytest.mark.parametrize("b", [0, 5, -5, 10])
@pytest.mark.parametrize("a", [1, 2, 3])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we need this full matrix of tests for the monotonic case; wouldn't one combination for each datatype be enough?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, I will fix one value for a, b and max_features.

def test_rf_classification_unorder(
    small_clf, datatype, max_samples, max_features, a, b
):
    """Check RF classification when labels are not consecutive from zero.

    The labels unpacked from ``small_clf`` are mapped through the affine
    transformation ``a * y + b`` so that they are no longer guaranteed to be
    the consecutive values ``0..n_classes-1``.  The transformation is applied
    to the full label vector *before* the train/test split so that the
    training labels, the test labels and the predictions all live in the
    same label space; otherwise the accuracy comparisons below would be
    comparing unrelated label values.

    Parameters
    ----------
    small_clf : tuple
        ``(X, y)`` pair; both are converted with ``astype`` below.
    datatype : dtype
        dtype the feature matrix is cast to.
    max_samples, max_features
        Forwarded to the cuML model (``max_features`` also to ``skrfc``).
    a, b : int
        Scale and offset of the affine label transformation.
    """
    use_handle = True

    X, y = small_clf
    X = X.astype(datatype)
    y = y.astype(np.int32)
    # affine transformation, applied to every label (train AND test)
    y = a * y + b
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0
    )
    # Create a handle for the cuml model
    handle, stream = get_handle(use_handle, n_streams=1)

    # Initialize, fit and predict using cuML's
    # random forest classification model
    cuml_model = curfc(
        max_features=max_features,
        max_samples=max_samples,
        n_bins=16,
        split_criterion=0,
        min_samples_leaf=2,
        random_state=123,
        n_streams=1,
        n_estimators=40,
        handle=handle,
        max_leaves=-1,
        max_depth=16,
    )
    cuml_model.fit(X_train, y_train)

    fil_preds = cuml_model.predict(
        X_test, predict_model="GPU", threshold=0.5, algo="auto"
    )
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    # Give both prediction arrays the same shape before scoring them.
    fil_preds = np.reshape(fil_preds, np.shape(cu_preds))
    cuml_acc = accuracy_score(y_test, cu_preds)
    fil_acc = accuracy_score(y_test, fil_preds)
    if X.shape[0] < 500000:
        # Only build the ``skrfc`` reference model for smaller datasets.
        sk_model = skrfc(
            n_estimators=40,
            max_depth=16,
            min_samples_split=2,
            max_features=max_features,
            random_state=10,
        )
        sk_model.fit(X_train, y_train)
        sk_preds = sk_model.predict(X_test)
        sk_acc = accuracy_score(y_test, sk_preds)
        assert fil_acc >= (sk_acc - 0.07)
    # Tolerance to be changed to 0.02; see issue #3910:
    # https://github.com/rapidsai/cuml/issues/3910
    assert fil_acc >= (cuml_acc - 0.07)


@pytest.mark.parametrize(
"max_samples", [unit_param(1.0), quality_param(0.90), stress_param(0.95)]
)
Expand Down