Update scikit-learn to 1.4 #5851

Merged: 11 commits, May 29, 2024
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -63,7 +63,7 @@ dependencies:
- recommonmark
- rmm==24.6.*
- scikit-build-core>=0.7.0
- scikit-learn==1.2
- scikit-learn==1.5
- scipy>=1.8.0
- seaborn
- sphinx-copybutton
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-122_arch-x86_64.yaml
@@ -59,7 +59,7 @@ dependencies:
- recommonmark
- rmm==24.6.*
- scikit-build-core>=0.7.0
- scikit-learn==1.2
- scikit-learn==1.5
- scipy>=1.8.0
- seaborn
- sphinx-copybutton
2 changes: 1 addition & 1 deletion dependencies.yaml
@@ -356,7 +356,7 @@ dependencies:
# https://github.com/pydata/pydata-sphinx-theme/issues/1539
- pydata-sphinx-theme!=0.14.2
- recommonmark
- &scikit_learn scikit-learn==1.2
- &scikit_learn scikit-learn==1.5
- sphinx<6
- sphinx-copybutton
- sphinx-markdown-tables
31 changes: 21 additions & 10 deletions python/cuml/_thirdparty/sklearn/preprocessing/_data.py
@@ -14,6 +14,19 @@
# This code is under BSD 3 clause license.
# Authors mentioned above do not endorse or promote this production.

# Copyright (c) 2020-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ....internals.memory_utils import using_output_type
from ....internals import _deprecate_pos_args
@@ -32,6 +45,7 @@
from ..utils.extmath import _incremental_mean_and_var
from ..utils.extmath import row_norms
from ....thirdparty_adapters import check_array
from sklearn.utils._indexing import resample
from cuml.internals.mixins import AllowNaNTagMixin, SparseInputTagMixin, \
StatelessTagMixin
from ..utils.skl_dependencies import BaseEstimator, TransformerMixin
@@ -2284,17 +2298,14 @@ def _dense_fit(self, X, random_state):
n_samples, n_features = X.shape
references = np.asnumpy(self.references_ * 100)

self.quantiles_ = []
for col in X.T:
if self.subsample < n_samples:
subsample_idx = random_state.choice(n_samples,
size=self.subsample,
replace=False)
col = col.take(subsample_idx)
self.quantiles_.append(
cpu_np.nanpercentile(np.asnumpy(col), references)
X = np.asnumpy(X)
if self.subsample is not None and self.subsample < n_samples:
# Take a subsample of `X`
X = resample(
X, replace=False, n_samples=self.subsample, random_state=random_state
)
self.quantiles_ = cpu_np.transpose(self.quantiles_)

self.quantiles_ = cpu_np.nanpercentile(X, references, axis=0)
# Due to floating-point precision error in `np.nanpercentile`,
# make sure that quantiles are monotonically increasing.
# Upstream issue in numpy:
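The `_dense_fit` change above replaces the per-column `nanpercentile` loop with a single subsampled, vectorized call. A minimal NumPy sketch of the same idea, on made-up data and using the public `sklearn.utils.resample` rather than the private `sklearn.utils._indexing` path imported in the diff:

```python
# Sketch only: random toy data, public resample import.
import numpy as np
from sklearn.utils import resample

rng = np.random.RandomState(42)
X = rng.standard_normal((10_000, 3))
subsample = 1_000
references = np.linspace(0, 100, num=5)  # percentile grid, like self.references_ * 100

# Subsample the rows without replacement, then compute all column
# quantiles in one vectorized call instead of looping over columns.
if subsample < X.shape[0]:
    X = resample(X, replace=False, n_samples=subsample, random_state=rng)
quantiles = np.nanpercentile(X, references, axis=0)  # shape: (n_quantiles, n_features)
print(quantiles.shape)
```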
@@ -10,6 +10,20 @@
# This code is under BSD 3 clause license.
# Authors mentioned above do not endorse or promote this production.

# Copyright (c) 2020-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from ....internals import _deprecate_pos_args
from ....internals.memory_utils import using_output_type
@@ -240,7 +254,7 @@ def fit(self, X, y=None) -> "KBinsDiscretizer":
if 'onehot' in self.encode:
self._encoder = OneHotEncoder(
categories=np.array([np.arange(i) for i in self.n_bins_]),
sparse=self.encode == 'onehot', output_type='cupy')
sparse_output=self.encode == 'onehot', output_type='cupy')
# Fit the OneHotEncoder with toy datasets
# so that it's ready for use after the KBinsDiscretizer is fitted
self._encoder.fit(np.zeros((1, len(self.n_bins_)), dtype=int))
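The `sparse` → `sparse_output` rename mirrors scikit-learn, where `OneHotEncoder(sparse=...)` was removed in 1.4. A hedged sketch using scikit-learn's own encoder (cuML's `OneHotEncoder` is assumed to accept the same keyword after this change):

```python
import numpy as np
from sklearn.preprocessing import OneHotEncoder

X = np.array([[0], [1], [2]])

# Before: OneHotEncoder(sparse=False)  -- removed in scikit-learn 1.4
enc = OneHotEncoder(sparse_output=False)
print(enc.fit_transform(X))
```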
53 changes: 44 additions & 9 deletions python/cuml/cluster/agglomerative.pyx
@@ -1,5 +1,5 @@
#
# Copyright (c) 2019-2022, NVIDIA CORPORATION.
# Copyright (c) 2019-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -16,6 +16,8 @@

# distutils: language = c++

import warnings

from libc.stdint cimport uintptr_t

from cuml.internals.safe_imports import cpu_only_import
@@ -103,6 +105,17 @@ class AgglomerativeClustering(Base, ClusterMixin, CMajorInputTagMixin):
Metric used to compute the linkage. Can be "euclidean", "l1",
"l2", "manhattan", or "cosine". If connectivity is "knn" only
"euclidean" is accepted.

.. deprecated:: 24.06
`affinity` was deprecated in version 24.06 and will be removed in
25.08. Use `metric` instead.

metric : str, default=None
Metric used to compute the linkage. Can be "euclidean", "l1",
"l2", "manhattan", or "cosine". If set to `None` then "euclidean"
is used. If connectivity is "knn" only "euclidean" is accepted.
.. versionadded:: 24.06

linkage : {"single"}, default="single"
Which linkage criterion to use. The linkage criterion determines
which distance to use between sets of observations. The algorithm
@@ -136,9 +149,9 @@ class AgglomerativeClustering(Base, ClusterMixin, CMajorInputTagMixin):
labels_ = CumlArrayDescriptor()
children_ = CumlArrayDescriptor()

def __init__(self, *, n_clusters=2, affinity="euclidean", linkage="single",
handle=None, verbose=False, connectivity='knn',
n_neighbors=10, output_type=None):
def __init__(self, *, n_clusters=2, affinity="deprecated", metric=None,
linkage="single", handle=None, verbose=False,
connectivity='knn', n_neighbors=10, output_type=None):

super().__init__(handle=handle,
verbose=verbose,
@@ -159,11 +172,12 @@ class AgglomerativeClustering(Base, ClusterMixin, CMajorInputTagMixin):
raise ValueError("'n_neighbors' must be a positive number "
"between 2 and 1023")

if affinity not in _metrics_mapping:
raise ValueError("'affinity' %s is not supported." % affinity)
if metric is not None and metric not in _metrics_mapping:
raise ValueError("Metric '%s' is not supported." % affinity)

self.n_clusters = n_clusters
self.affinity = affinity
self.metric = metric
self.linkage = linkage
self.n_neighbors = n_neighbors
self.connectivity = connectivity
@@ -178,6 +192,26 @@
"""
Fit the hierarchical clustering from features.
"""
if self.affinity != "deprecated":
if self.metric is not None:
raise ValueError(
"Both `affinity` and `metric` attributes were set. Attribute"
" `affinity` was deprecated in version 24.06 and will be removed in"
" 25.08. To avoid this error, only set the `metric` attribute."
)
warnings.warn(
(
"Attribute `affinity` was deprecated in version 24.06 and will be"
" removed in 25.08. Use `metric` instead."
),
FutureWarning,
)
metric_name = self.affinity
else:
if self.metric is None:
metric_name = "euclidean"
else:
metric_name = self.metric

X_m, n_rows, n_cols, self.dtype = \
input_to_cuml_array(X, order='C',
@@ -209,10 +243,10 @@ class AgglomerativeClustering(Base, ClusterMixin, CMajorInputTagMixin):
linkage_output.labels = <int*>labels_ptr

cdef DistanceType metric
if self.affinity in _metrics_mapping:
metric = _metrics_mapping[self.affinity]
if metric_name in _metrics_mapping:
metric = _metrics_mapping[metric_name]
else:
raise ValueError("'affinity' %s not supported." % self.affinity)
raise ValueError("Metric '%s' not supported." % metric_name)

if self.connectivity == 'knn':
single_linkage_neighbors(
@@ -249,6 +283,7 @@ class AgglomerativeClustering(Base, ClusterMixin, CMajorInputTagMixin):
return super().get_param_names() + [
"n_clusters",
"affinity",
"metric",
"linkage",
"connectivity",
"n_neighbors"
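A migration sketch for the `affinity` → `metric` change above; it assumes a CUDA-capable environment with cuML installed, and the data is purely illustrative:

```python
import cupy as cp
from cuml.cluster import AgglomerativeClustering

X = cp.random.random((100, 8), dtype=cp.float32)

# Old spelling, still accepted in 24.06 but emits a FutureWarning:
#   AgglomerativeClustering(n_clusters=3, affinity="euclidean").fit(X)

# New spelling:
model = AgglomerativeClustering(n_clusters=3, metric="euclidean")
labels = model.fit(X).labels_
```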
18 changes: 16 additions & 2 deletions python/cuml/ensemble/randomforest_common.pyx
@@ -1,5 +1,5 @@
#
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -68,7 +68,7 @@ class BaseRandomForestModel(Base):
classes_ = CumlArrayDescriptor()

def __init__(self, *, split_criterion, n_streams=4, n_estimators=100,
max_depth=16, handle=None, max_features='auto', n_bins=128,
max_depth=16, handle=None, max_features='sqrt', n_bins=128,
bootstrap=True,
verbose=False, min_samples_leaf=1, min_samples_split=2,
max_samples=1.0, max_leaves=-1, accuracy_metric=None,
@@ -166,8 +166,22 @@ class BaseRandomForestModel(Base):
return math.log2(self.n_cols)/self.n_cols
elif self.max_features == 'auto':
if self.RF_type == CLASSIFICATION:
warnings.warn(
"`max_features='auto'` has been deprecated in 24.06 "
"and will be removed in 25.08. To keep the past behaviour "
"and silence this warning, explicitly set "
"`max_features='sqrt'`.",
FutureWarning
)
return 1/np.sqrt(self.n_cols)
else:
warnings.warn(
"`max_features='auto'` has been deprecated in 24.06 "
"and will be removed in 25.08. To keep the past behaviour "
"and silence this warning, explicitly set "
"`max_features=1.0`.",
FutureWarning
)
return 1.0
else:
raise ValueError(
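To illustrate the deprecation path added above: `max_features='auto'` still works but now emits a `FutureWarning` pointing at the explicit replacement. A hedged sketch on toy data (GPU environment assumed; the warning is expected to surface during `fit`, once the number of columns is known):

```python
import warnings
import cupy as cp
from cuml.ensemble import RandomForestClassifier

X = cp.random.random((200, 10), dtype=cp.float32)
y = (X[:, 0] > 0.5).astype(cp.int32)

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # Deprecated spelling; equivalent to max_features='sqrt' for classifiers.
    RandomForestClassifier(max_features="auto", n_estimators=5).fit(X, y)

print([w.category.__name__ for w in caught])  # expected to include 'FutureWarning'
```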
7 changes: 5 additions & 2 deletions python/cuml/ensemble/randomforestclassifier.pyx
@@ -172,15 +172,18 @@ class RandomForestClassifier(BaseRandomForestModel,
max_leaves : int (default = -1)
Maximum leaf nodes per tree. Soft constraint. Unlimited if ``-1``.
max_features : int, float, or string (default = 'auto')
max_features : int, float, or string (default = 'sqrt')
Ratio of number of features (columns) to consider per node
split.\n
* If type ``int`` then ``max_features`` is the absolute count of
features to be used
* If type ``float`` then ``max_features`` is used as a fraction.
* If ``'auto'`` then ``max_features=1/sqrt(n_features)``.
* If ``'sqrt'`` then ``max_features=1/sqrt(n_features)``.
* If ``'log2'`` then ``max_features=log2(n_features)/n_features``.

.. versionchanged:: 24.06
The default of `max_features` changed from `"auto"` to `"sqrt"`.

n_bins : int (default = 128)
Maximum number of bins used by the split algorithm per feature.
For large problems, particularly those with highly-skewed input data,
10 changes: 7 additions & 3 deletions python/cuml/ensemble/randomforestregressor.pyx
@@ -1,5 +1,5 @@
#
# Copyright (c) 2019-2023, NVIDIA CORPORATION.
# Copyright (c) 2019-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -165,18 +165,22 @@ class RandomForestRegressor(BaseRandomForestModel,
is not supported.\n
.. note:: This default differs from scikit-learn's
random forest, which defaults to unlimited depth.

max_leaves : int (default = -1)
Maximum leaf nodes per tree. Soft constraint. Unlimited if ``-1``.
max_features : int, float, or string (default = 'auto')
max_features : int, float, or string (default = 1.0)
Ratio of number of features (columns) to consider
per node split.\n
* If type ``int`` then ``max_features`` is the absolute count of
features to be used.
* If type ``float`` then ``max_features`` is used as a fraction.
* If ``'auto'`` then ``max_features=1.0``.
* If ``'sqrt'`` then ``max_features=1/sqrt(n_features)``.
* If ``'log2'`` then ``max_features=log2(n_features)/n_features``.

.. versionchanged:: 24.06
The default of `max_features` changed from `"auto"` to 1.0.

n_bins : int (default = 128)
Maximum number of bins used by the split algorithm per feature.
For large problems, particularly those with highly-skewed input data,
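Taken together, the classifier and regressor now follow the scikit-learn 1.1+ style defaults. A constructor-only sketch spelling the new defaults out explicitly (nothing is fitted here; a cuML-enabled environment is assumed):

```python
from cuml.ensemble import RandomForestClassifier, RandomForestRegressor

# New defaults, written out explicitly:
clf = RandomForestClassifier(max_features="sqrt")  # 1/sqrt(n_features) per split
reg = RandomForestRegressor(max_features=1.0)      # all features per split
```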
8 changes: 6 additions & 2 deletions python/cuml/experimental/linear_model/lars.pyx
@@ -1,5 +1,5 @@
#
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -85,11 +85,15 @@ class Lars(Base, RegressorMixin):
fit_intercept : boolean (default = True)
If True, Lars tries to correct for the global mean of y.
If False, the model expects that you have centered the data.
normalize : boolean (default = True)
normalize : boolean (default = False)
This parameter is ignored when `fit_intercept` is set to False.
If True, the predictors in X will be normalized by removing their mean
and dividing by their variance. If False, then the solver expects that
the data is already normalized.

.. versionchanged:: 24.06
The default of `normalize` changed from `True` to `False`.

copy_X : boolean (default = True)
The solver permutes the columns of X. Set `copy_X` to True to prevent
changing the input data.
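A hedged usage sketch of the new `normalize` default for the experimental `Lars` estimator (toy data; a cuML-enabled GPU environment and the import path implied by the file above are assumed):

```python
import cupy as cp
from cuml.experimental.linear_model import Lars

X = cp.random.random((50, 4), dtype=cp.float32)
y = X @ cp.asarray([1.0, -2.0, 0.5, 0.3], dtype=cp.float32)

lars_new = Lars().fit(X, y)                # normalize=False is now the default
lars_old = Lars(normalize=True).fit(X, y)  # restores the pre-24.06 behaviour
```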
19 changes: 15 additions & 4 deletions python/cuml/linear_model/logistic_regression.pyx
@@ -1,5 +1,5 @@
#
# Copyright (c) 2019-2022, NVIDIA CORPORATION.
# Copyright (c) 2019-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -16,6 +16,8 @@

# distutils: language = c++

import warnings

from cuml.internals.safe_imports import cpu_only_import
from cuml.internals.safe_imports import gpu_only_import
import pprint
Expand All @@ -36,7 +38,7 @@ cp = gpu_only_import('cupy')
np = cpu_only_import('numpy')


supported_penalties = ["l1", "l2", "none", "elasticnet"]
supported_penalties = ["l1", "l2", None, "none", "elasticnet"]

supported_solvers = ["qn"]

Expand Down Expand Up @@ -210,15 +212,24 @@ class LogisticRegression(UniversalBase,
output_type=output_type)

if penalty not in supported_penalties:
raise ValueError("`penalty` " + str(penalty) + "not supported.")
raise ValueError("`penalty` " + str(penalty) + " not supported.")

if solver not in supported_solvers:
raise ValueError("Only quasi-newton `qn` solver is "
" supported, not %s" % solver)
self.solver = solver

self.C = C

if penalty == "none":
warnings.warn(
"The 'none' option was deprecated in version 24.06, and will "
"be removed in 25.08. Use None instead.",
FutureWarning
)
penalty = None
self.penalty = penalty

self.tol = tol
self.fit_intercept = fit_intercept
self.max_iter = max_iter
Expand Down Expand Up @@ -452,7 +463,7 @@ class LogisticRegression(UniversalBase,
return proba

def _get_qn_params(self):
if self.penalty == "none":
if self.penalty is None:
l1_strength = 0.0
l2_strength = 0.0

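Finally, a migration sketch for the `penalty` change: the string `"none"` now warns, and `None` is the supported spelling for an unregularized fit (illustrative data; a cuML-enabled environment assumed):

```python
import cupy as cp
from cuml.linear_model import LogisticRegression

X = cp.random.random((100, 5), dtype=cp.float32)
y = (X[:, 0] > 0.5).astype(cp.float32)

# Old: LogisticRegression(penalty="none")  -> FutureWarning in 24.06
clf = LogisticRegression(penalty=None)     # unregularized, new spelling
clf.fit(X, y)
```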