Merge pull request #2958 from rapidsai/branch-0.16

[gpuCI] Auto-merge branch-0.16 to branch-0.17 [skip ci]
rapidsai · Oct 12, 2020 · 81984fb · 81984fb
2 parents c1da169 + 3ea117d
commit 81984fb
Show file tree

Hide file tree

Showing 52 changed files with 1,674 additions and 785 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -60,6 +60,7 @@
 - PR #2910: Adding Support for CuPy 8.x
 - PR #2914: Add tests for XGBoost multi-class models in FIL
 - PR #2930: Pin libfaiss to <=1.6.3
+- PR #2928: Updating Estimators Derived from Base for Consistency
 
 ## Bug Fixes
 - PR #2885: Changing test target for NVTX wrapper test

diff --git a/docs/source/api.rst b/docs/source/api.rst
@@ -5,12 +5,16 @@ cuML API Reference
 Module Configuration
 ====================
 
+.. _output-data-type-configuration:
+
 Output Data Type Configuration
 ------------------------------
 
  .. automethod:: cuml.common.memory_utils.set_global_output_type
  .. automethod:: cuml.common.memory_utils.using_output_type
 
+.. _verbosity-levels:
+
 Verbosity Levels
 ----------------
 
@@ -471,3 +475,6 @@ Dask Base Classes and Mixins
 
 .. autoclass:: cuml.dask.common.base.DelayedInverseTransformMixin
    :members:
+
+.. autoclass:: cuml.experimental.decomposition.IncrementalPCA
+   :members:
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -71,7 +71,7 @@
 
 # General information about the project.
 project = 'cuml'
-copyright = '2019, nvidia'
+copyright = '2020, nvidia'
 author = 'nvidia'
 
 # The version info for the project you're documenting, acts as replacement for

diff --git a/python/cuml/cluster/dbscan.pyx b/python/cuml/cluster/dbscan.pyx
@@ -125,12 +125,18 @@ class DBSCAN(Base):
         The maximum distance between 2 points such that they reside in the same
         neighborhood.
     handle : cuml.Handle
-        If it is None, a new one is created just for this class
+        Specifies the cuml.handle that holds internal CUDA state for
+        computations in this model. Most importantly, this specifies the CUDA
+        stream that will be used for the model's computations, so users can
+        run different models concurrently in different streams by creating
+        handles in several streams.
+        If it is None, a new one is created.
     min_samples : int (default = 5)
         The number of samples in a neighborhood such that this group can be
         considered as an important core point (including the point itself).
-    verbose : int or boolean (default = False)
-        Logging level
+    verbose : int or boolean, default=False
+        Sets logging level. It must be one of `cuml.common.logger.level_*`.
+        See :ref:`verbosity-levels` for more info.
     max_mbytes_per_batch : (optional) int64
         Calculate batch size using no more than this number of megabytes for
         the pairwise distance computation. This enables the trade-off between
@@ -141,13 +147,11 @@ class DBSCAN(Base):
         Note: this option does not set the maximum total memory used in the
         DBSCAN computation and so this value will not be able to be set to
         the total memory available on the device.
-    output_type : (optional) {'input', 'cudf', 'cupy', 'numpy'} default = None
-        Use it to control output type of the results and attributes.
-        If None it'll inherit the output type set at the
-        module level, cuml.output_type. If that has not been changed, by
-        default the estimator will mirror the type of the data used for each
-        fit or predict call.
-        If set, the estimator will override the global option for its behavior.
+    output_type : {'input', 'cudf', 'cupy', 'numpy', 'numba'}, default=None
+        Variable to control output type of the results and attributes of
+        the estimator. If None, it'll inherit the output type set at the
+        module level, `cuml.global_output_type`.
+        See :ref:`output-data-type-configuration` for more info.
     calc_core_sample_indices : (optional) boolean (default = True)
         Indicates whether the indices of the core samples should be calculated.
         If the attribute `core_sample_indices_` will not be used, setting this
@@ -336,4 +340,9 @@ class DBSCAN(Base):
         return self.labels_
 
     def get_param_names(self):
-        return ["eps", "min_samples"]
+        return super().get_param_names() + [
+            "eps",
+            "min_samples",
+            "max_mbytes_per_batch",
+            "calc_core_sample_indices",
+        ]
diff --git a/python/cuml/cluster/kmeans.pyx b/python/cuml/cluster/kmeans.pyx
@@ -177,15 +177,21 @@ class KMeans(Base):
     Parameters
     ----------
     handle : cuml.Handle
-        If it is None, a new one is created just for this class.
+        Specifies the cuml.handle that holds internal CUDA state for
+        computations in this model. Most importantly, this specifies the CUDA
+        stream that will be used for the model's computations, so users can
+        run different models concurrently in different streams by creating
+        handles in several streams.
+        If it is None, a new one is created.
     n_clusters : int (default = 8)
         The number of centroids or clusters you want.
     max_iter : int (default = 300)
         The more iterations of EM, the more accurate, but slower.
     tol : float64 (default = 1e-4)
         Stopping criterion when centroid means do not change much.
-    verbose : int or boolean (default = False)
-        Logging level.
+    verbose : int or boolean, default=False
+        Sets logging level. It must be one of `cuml.common.logger.level_*`.
+        See :ref:`verbosity-levels` for more info.
     random_state : int (default = 1)
         If you want results to be the same when you restart Python, select a
         state.
@@ -218,6 +224,11 @@ class KMeans(Base):
         pairwise distance computation is max_samples_per_batch * n_clusters.
         It might become necessary to lower this number when n_clusters
         becomes prohibitively large.
+    output_type : {'input', 'cudf', 'cupy', 'numpy', 'numba'}, default=None
+        Variable to control output type of the results and attributes of
+        the estimator. If None, it'll inherit the output type set at the
+        module level, `cuml.global_output_type`.
+        See :ref:`output-data-type-configuration` for more info.
 
     Attributes
     ----------
@@ -607,6 +618,7 @@ class KMeans(Base):
         return self.fit(X).transform(X, convert_dtype=convert_dtype)
 
     def get_param_names(self):
-        return ['n_init', 'oversampling_factor', 'max_samples_per_batch',
+        return super().get_param_names() + \
+            ['n_init', 'oversampling_factor', 'max_samples_per_batch',
                 'init', 'max_iter', 'n_clusters', 'random_state',
                 'tol']
diff --git a/python/cuml/common/base.pyx b/python/cuml/common/base.pyx
@@ -108,20 +108,19 @@ class Base:
         stream that will be used for the model's computations, so users can
         run different models concurrently in different streams by creating
         handles in several streams.
-        If it is None, a new one is created just for this class.
-    verbose : int or boolean (default = False)
+        If it is None, a new one is created.
+    verbose : int or boolean, default=False
         Sets logging level. It must be one of `cuml.common.logger.level_*`.
-    output_type : {'input', 'cudf', 'cupy', 'numpy'}, optional
+        See :ref:`verbosity-levels` for more info.
+    output_type : {'input', 'cudf', 'cupy', 'numpy', 'numba'}, default=None
         Variable to control output type of the results and attributes of
-        the estimators. If None, it'll inherit the output type set at the
-        module level, cuml.output_type. If set, the estimator will override
-        the global option for its behavior.
+        the estimator. If None, it'll inherit the output type set at the
+        module level, `cuml.global_output_type`.
+        See :ref:`output-data-type-configuration` for more info.
 
     Examples
     --------
 
-
-
     .. code-block:: python
 
         from cuml import Base
@@ -181,8 +180,8 @@ class Base:
         else:
             self.verbose = verbose
 
-        self.output_type = cuml.global_output_type if output_type is None \
-            else _check_output_type_str(output_type)
+        self.output_type = _check_output_type_str(
+            cuml.global_output_type if output_type is None else output_type)
 
         self._mirror_input = True if self.output_type == 'input' else False
 
@@ -217,7 +216,7 @@ class Base:
         extra set of parameters that it in-turn owns. This is to simplify the
         implementation of `get_params` and `set_params` methods.
         """
-        return []
+        return ["handle", "verbose", "output_type"]
 
     def get_params(self, deep=True):
         """
@@ -442,11 +441,17 @@ def _input_to_type(input):
 def _check_output_type_str(output_str):
     if isinstance(output_str, str):
         output_type = output_str.lower()
-        if output_type in ['numpy', 'cupy', 'cudf', 'numba']:
+        if output_type in ['numpy', 'cupy', 'cudf', 'numba', 'input']:
             return output_str
         else:
-            raise ValueError("output_type must be one of " +
-                             "'numpy', 'cupy', 'cudf' or 'numba'")
+            raise ValueError(("output_type must be one of "
+                              "'numpy', 'cupy', 'cudf', 'numba', or 'input'."
+                              " Got: '{}'"
+                              ).format(output_str))
+    else:
+        raise ValueError(("output_type must be a string"
+                          " Got: '{}'"
+                          ).format(type(output_str)))
 
 
 def _input_target_to_dtype(target):

diff --git a/python/cuml/dask/cluster/kmeans.py b/python/cuml/dask/cluster/kmeans.py
@@ -48,15 +48,21 @@ class KMeans(BaseEstimator, DelayedPredictionMixin, DelayedTransformMixin):
     ----------
 
     handle : cuml.Handle
-        If it is None, a new one is created just for this class.
+        Specifies the cuml.handle that holds internal CUDA state for
+        computations in this model. Most importantly, this specifies the CUDA
+        stream that will be used for the model's computations, so users can
+        run different models concurrently in different streams by creating
+        handles in several streams.
+        If it is None, a new one is created.
     n_clusters : int (default = 8)
         The number of centroids or clusters you want.
     max_iter : int (default = 300)
         The more iterations of EM, the more accurate, but slower.
     tol : float (default = 1e-4)
         Stopping criterion when centroid means do not change much.
-    verbose : int or boolean (default = False)
-        Logging level for printing diagnostic information
+    verbose : int or boolean, default=False
+        Sets logging level. It must be one of `cuml.common.logger.level_*`.
+        See :ref:`verbosity-levels` for more info.
     random_state : int (default = 1)
         If you want results to be the same when you restart Python,
         select a state.

diff --git a/python/cuml/dask/decomposition/pca.py b/python/cuml/dask/decomposition/pca.py
@@ -98,15 +98,21 @@ class PCA(BaseDecomposition,
     Parameters
     ----------
     handle : cuml.Handle
-        If it is None, a new one is created just for this class
+        Specifies the cuml.handle that holds internal CUDA state for
+        computations in this model. Most importantly, this specifies the CUDA
+        stream that will be used for the model's computations, so users can
+        run different models concurrently in different streams by creating
+        handles in several streams.
+        If it is None, a new one is created.
     n_components : int (default = 1)
         The number of top K singular vectors / values you want.
         Must be <= number(columns).
     svd_solver : 'full', 'jacobi', or 'tsqr'
         'full': run exact full SVD and select the components by postprocessing
         'jacobi': iteratively compute SVD of the covariance matrix
-    verbose : int or boolean (default = False)
-        Logging level
+    verbose : int or boolean, default=False
+        Sets logging level. It must be one of `cuml.common.logger.level_*`.
+        See :ref:`verbosity-levels` for more info.
     whiten : boolean (default = False)
         If True, de-correlates the components. This is done by dividing them by
         the corresponding singular values then multiplying by sqrt(n_samples).

diff --git a/python/cuml/dask/decomposition/tsvd.py b/python/cuml/dask/decomposition/tsvd.py
@@ -88,15 +88,21 @@ class TruncatedSVD(BaseDecomposition,
     Parameters
     ----------
     handle : cuml.Handle
-        If it is None, a new one is created just for this class
+        Specifies the cuml.handle that holds internal CUDA state for
+        computations in this model. Most importantly, this specifies the CUDA
+        stream that will be used for the model's computations, so users can
+        run different models concurrently in different streams by creating
+        handles in several streams.
+        If it is None, a new one is created.
     n_components : int (default = 1)
         The number of top K singular vectors / values you want.
         Must be <= number(columns).
     svd_solver : 'full'
         Only Full algorithm is supported since it's significantly faster on GPU
         than the other solvers including randomized SVD.
-    verbose : int or boolean (default = False)
-        Logging level
+    verbose : int or boolean, default=False
+        Sets logging level. It must be one of `cuml.common.logger.level_*`.
+        See :ref:`verbosity-levels` for more info.
 
     Attributes
     ----------

diff --git a/python/cuml/dask/ensemble/randomforestclassifier.py b/python/cuml/dask/ensemble/randomforestclassifier.py
@@ -67,7 +67,12 @@ class RandomForestClassifier(BaseRandomForestModel, DelayedPredictionMixin,
     n_estimators : int (default = 10)
                    total number of trees in the forest (not per-worker)
     handle : cuml.Handle
-        If it is None, a new one is created just for this class.
+        Specifies the cuml.handle that holds internal CUDA state for
+        computations in this model. Most importantly, this specifies the CUDA
+        stream that will be used for the model's computations, so users can
+        run different models concurrently in different streams by creating
+        handles in several streams.
+        If it is None, a new one is created.
     split_criterion : The criterion used to split nodes.
         0 for GINI, 1 for ENTROPY, 4 for CRITERION_END.
         2 and 3 not valid for classification

diff --git a/python/cuml/dask/ensemble/randomforestregressor.py b/python/cuml/dask/ensemble/randomforestregressor.py
@@ -62,7 +62,12 @@ class RandomForestRegressor(BaseRandomForestModel, DelayedPredictionMixin,
     n_estimators : int (default = 10)
         total number of trees in the forest (not per-worker)
     handle : cuml.Handle
-        If it is None, a new one is created just for this class.
+        Specifies the cuml.handle that holds internal CUDA state for
+        computations in this model. Most importantly, this specifies the CUDA
+        stream that will be used for the model's computations, so users can
+        run different models concurrently in different streams by creating
+        handles in several streams.
+        If it is None, a new one is created.
     split_algo : int (default = 1)
         0 for HIST, 1 for GLOBAL_QUANTILE
         The type of algorithm to be used to create the trees.

diff --git a/python/cuml/dask/linear_model/elastic_net.py b/python/cuml/dask/linear_model/elastic_net.py
@@ -63,14 +63,17 @@ class ElasticNet(BaseEstimator):
         This (setting to ‘random’) often leads to significantly faster
         convergence especially when tol is higher than 1e-4.
     handle : cuml.Handle
-        If it is None, a new one is created just for this class.
-    output_type : (optional) {'input', 'cudf', 'cupy', 'numpy'} default = None
-        Use it to control output type of the results and attributes.
-        If None it'll inherit the output type set at the
-        module level, cuml.output_type. If that has not been changed, by
-        default the estimator will mirror the type of the data used for each
-        fit or predict call.
-        If set, the estimator will override the global option for its behavior.
+        Specifies the cuml.handle that holds internal CUDA state for
+        computations in this model. Most importantly, this specifies the CUDA
+        stream that will be used for the model's computations, so users can
+        run different models concurrently in different streams by creating
+        handles in several streams.
+        If it is None, a new one is created.
+    output_type : {'input', 'cudf', 'cupy', 'numpy', 'numba'}, default=None
+        Variable to control output type of the results and attributes of
+        the estimator. If None, it'll inherit the output type set at the
+        module level, `cuml.global_output_type`.
+        See :ref:`output-data-type-configuration` for more info.
 
     Attributes
     -----------

diff --git a/python/cuml/decomposition/pca.pyx b/python/cuml/decomposition/pca.pyx
@@ -209,7 +209,12 @@ class PCA(Base):
         If True, then copies data then removes mean from data. False might
         cause data to be overwritten with its mean centered version.
     handle : cuml.Handle
-        If it is None, a new one is created just for this class
+        Specifies the cuml.handle that holds internal CUDA state for
+        computations in this model. Most importantly, this specifies the CUDA
+        stream that will be used for the model's computations, so users can
+        run different models concurrently in different streams by creating
+        handles in several streams.
+        If it is None, a new one is created.
     iterated_power : int (default = 15)
         Used in Jacobi solver. The more iterations, the more accurate, but
         slower.
@@ -230,15 +235,20 @@ class PCA(Base):
     tol : float (default = 1e-7)
         Used if algorithm = "jacobi". Smaller tolerance can increase accuracy,
         but will slow down the algorithm's convergence.
-    verbose : int or boolean (default = False)
-        Logging level
+    verbose : int or boolean, default=False
+        Sets logging level. It must be one of `cuml.common.logger.level_*`.
+        See :ref:`verbosity-levels` for more info.
     whiten : boolean (default = False)
         If True, de-correlates the components. This is done by dividing them by
         the corresponding singular values then multiplying by sqrt(n_samples).
         Whitening allows each component to have unit variance and removes
         multi-collinearity. It might be beneficial for downstream
         tasks like LinearRegression where correlated features cause problems.
-
+    output_type : {'input', 'cudf', 'cupy', 'numpy', 'numba'}, default=None
+        Variable to control output type of the results and attributes of
+        the estimator. If None, it'll inherit the output type set at the
+        module level, `cuml.global_output_type`.
+        See :ref:`output-data-type-configuration` for more info.
 
     Attributes
     ----------
@@ -739,8 +749,9 @@ class PCA(Base):
         return t_input_data.to_output(out_type)
 
     def get_param_names(self):
-        return ["copy", "iterated_power", "n_components", "svd_solver", "tol",
-                "whiten"]
+        return super().get_param_names() + \
+            ["copy", "iterated_power", "n_components", "svd_solver", "tol",
+                "whiten", "random_state"]
 
     def __getstate__(self):
         state = self.__dict__.copy()