diff --git a/CHANGELOG.md b/CHANGELOG.md index 1b3830765d..06bde24b16 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -61,6 +61,7 @@ - PR #2914: Add tests for XGBoost multi-class models in FIL - PR #2930: Pin libfaiss to <=1.6.3 - PR #2928: Updating Estimators Derived from Base for Consistency +- PR #2942: Adding `cuml.experimental` to the Docs ## Bug Fixes - PR #2885: Changing test target for NVTX wrapper test diff --git a/docs/source/api.rst b/docs/source/api.rst index 3294c6f12a..10f266b661 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -476,5 +476,26 @@ Dask Base Classes and Mixins .. autoclass:: cuml.dask.common.base.DelayedInverseTransformMixin :members: +Experimental +============ + +.. warning:: The `cuml.experimental` module contains features that are still + under development. It is not recommended to depend on features in this + module as they may change in future releases. + +.. note:: Due to the nature of this module, it is not imported by default by + the root `cuml` package. Each `experimental` submodule must be imported + separately. + +Decomposition +------------- .. autoclass:: cuml.experimental.decomposition.IncrementalPCA - :members: \ No newline at end of file + :members: + +Preprocessing +------------- +.. automodule:: cuml.experimental.preprocessing + :members: Binarizer, KBinsDiscretizer, MaxAbsScaler, MinMaxScaler, + Normalizer, RobustScaler, SimpleImputer, StandardScaler, + add_dummy_feature, binarize, minmax_scale, normalize, + PolynomialFeatures, robust_scale, scale diff --git a/python/cuml/_thirdparty/sklearn/README.md b/python/cuml/_thirdparty/sklearn/README.md index 098c32ef30..38332cdcc1 100644 --- a/python/cuml/_thirdparty/sklearn/README.md +++ b/python/cuml/_thirdparty/sklearn/README.md @@ -1,6 +1,6 @@ # GPU accelerated Scikit-Learn preprocessing -This directory contains code originating from the Scikit-Learn library. The Scikit-Learn license applies accordingly (see `/thirdparty/LICENSES/LICENSE.scikit_learn`). +This directory contains code originating from the Scikit-Learn library. The Scikit-Learn license applies accordingly (see `/thirdparty/LICENSES/LICENSE.scikit_learn`). Original authors mentioned in the code do not endorse or promote this derived work. This work is dedicated to providing GPU accelerated tools for preprocessing. The Scikit-Learn code is slightly modified to make it possible to take common inputs used throughout cuML such as Numpy and Cupy arrays, Pandas and cuDF dataframes and compute the results on GPU. diff --git a/python/cuml/_thirdparty/sklearn/exceptions.py b/python/cuml/_thirdparty/sklearn/exceptions.py index 7140b98e53..c874389392 100644 --- a/python/cuml/_thirdparty/sklearn/exceptions.py +++ b/python/cuml/_thirdparty/sklearn/exceptions.py @@ -1,3 +1,8 @@ +# This code originates from the Scikit-Learn library; +# it has since been modified to allow GPU acceleration. +# This code is under the BSD 3-Clause license. + + """ The :mod:`sklearn.exceptions` module includes all custom warnings and error classes used across scikit-learn. diff --git a/python/cuml/_thirdparty/sklearn/preprocessing/__init__.py b/python/cuml/_thirdparty/sklearn/preprocessing/__init__.py index d34dd7a862..ea17a3f197 100644 --- a/python/cuml/_thirdparty/sklearn/preprocessing/__init__.py +++ b/python/cuml/_thirdparty/sklearn/preprocessing/__init__.py @@ -1,3 +1,8 @@ +# This code originates from the Scikit-Learn library; +# it has since been modified to allow GPU acceleration. +# This code is under the BSD 3-Clause license.
+ + """ The :mod:`sklearn.preprocessing` module includes scaling, centering, normalization, binarization methods. diff --git a/python/cuml/_thirdparty/sklearn/preprocessing/_data.py b/python/cuml/_thirdparty/sklearn/preprocessing/_data.py index 440167b674..0c7a9dbbf9 100644 --- a/python/cuml/_thirdparty/sklearn/preprocessing/_data.py +++ b/python/cuml/_thirdparty/sklearn/preprocessing/_data.py @@ -1,4 +1,5 @@ -# Authors: Alexandre Gramfort +# Original authors from Scikit-Learn: +# Alexandre Gramfort # Mathieu Blondel # Olivier Grisel # Andreas Mueller @@ -8,6 +9,12 @@ # License: BSD 3 clause +# This code originates from the Scikit-Learn library; +# it has since been modified to allow GPU acceleration. +# This code is under the BSD 3-Clause license. +# Authors mentioned above do not endorse or promote this derived work. + + from itertools import chain, combinations import numbers import warnings @@ -103,22 +110,21 @@ def scale(X, *, axis=0, with_mean=True, with_std=True, copy=True): unit standard deviation). copy : boolean, optional, default True - set to False to perform inplace row normalization and avoid a - copy (if the input is already a numpy array or a scipy.sparse - CSC matrix and if axis is 1). + Whether a forced copy will be triggered. If copy=False, a copy might + be triggered by a conversion. Notes ----- - This implementation will refuse to center scipy.sparse matrices + This implementation will refuse to center sparse matrices since it would make them non-sparse and would potentially crash the program with memory exhaustion problems. Instead the caller is expected to either set explicitly `with_mean=False` (in that case, only variance scaling will be - performed on the features of the CSC matrix) or to call `X.toarray()` + performed on the features of the sparse matrix) or to densify the matrix if he/she expects the materialized dense array to fit in memory. - To avoid memory copy the caller should pass a CSC matrix. + For optimal processing the caller should pass a CSC matrix. NaNs are treated as missing values: disregarded to compute the statistics, and maintained during the data transformation. @@ -218,8 +224,8 @@ class MinMaxScaler(TransformerMixin, BaseEstimator): Desired range of transformed data. copy : bool, default=True - Set to False to perform inplace row normalization and avoid a - copy (if the input is already a numpy array). + Whether a forced copy will be triggered. If copy=False, a copy might + be triggered by a conversion. Attributes ---------- @@ -455,8 +461,8 @@ def minmax_scale(X, feature_range=(0, 1), *, axis=0, copy=True): otherwise (if 1) scale each sample. copy : bool, default=True - Set to False to perform inplace scaling and avoid a copy (if the input - is already a numpy array). + Whether a forced copy will be triggered. If copy=False, a copy might + be triggered by a conversion. See also -------- @@ -521,10 +527,8 @@ class StandardScaler(TransformerMixin, BaseEstimator): Parameters ---------- copy : boolean, optional, default True - If False, try to avoid a copy and do inplace scaling instead. - This is not guaranteed to always work inplace; e.g. if the data is - not a NumPy array or scipy.sparse CSR matrix, a copy may still be - returned. + Whether a forced copy will be triggered. If copy=False, a copy might + be triggered by a conversion. with_mean : boolean, True by default If True, center the data before scaling.
@@ -541,7 +545,7 @@ class StandardScaler(TransformerMixin, BaseEstimator): ---------- scale_ : ndarray or None, shape (n_features,) Per feature relative scaling of the data. This is calculated using - `np.sqrt(var_)`. Equal to ``None`` when ``with_std=False``. + `sqrt(var_)`. Equal to ``None`` when ``with_std=False``. mean_ : ndarray or None, shape (n_features,) The mean value for each feature in the training set. @@ -754,10 +758,11 @@ def transform(self, X, copy=None): Parameters ---------- - X : array-like, shape [n_samples, n_features] + X : {array-like, sparse matrix}, shape [n_samples, n_features] The data used to scale along the features axis. copy : bool, optional (default: None) - Copy the input X or not. + Whether a forced copy will be triggered. If copy=False, + a copy might be triggered by a conversion. """ check_is_fitted(self) @@ -790,14 +795,15 @@ def inverse_transform(self, X, copy=None): Parameters ---------- - X : array-like, shape [n_samples, n_features] + X : {array-like, sparse matrix}, shape [n_samples, n_features] The data used to scale along the features axis. copy : bool, optional (default: None) - Copy the input X or not. + Whether a forced copy will be triggered. If copy=False, + a copy might be triggered by a conversion. Returns ------- - X_tr : array-like, shape [n_samples, n_features] + X_tr : {array-like, sparse matrix}, shape [n_samples, n_features] Transformed array. """ check_is_fitted(self) @@ -837,7 +843,6 @@ def _more_tags(self): return {'allow_nan': True} -@check_cupy8() class MaxAbsScaler(TransformerMixin, BaseEstimator): """Scale each feature by its maximum absolute value. @@ -851,8 +856,8 @@ class MaxAbsScaler(TransformerMixin, BaseEstimator): Parameters ---------- copy : boolean, optional, default is True - Set to False to perform inplace scaling and avoid a copy (if the input - is already a numpy array). + Whether a forced copy will be triggered. If copy=False, a copy might + be triggered by a conversion. Attributes ---------- @@ -890,6 +895,7 @@ class MaxAbsScaler(TransformerMixin, BaseEstimator): transform. """ + @check_cupy8() @_deprecate_positional_args def __init__(self, *, copy=True): self.copy = copy @@ -1028,7 +1034,7 @@ def maxabs_scale(X, *, axis=0, copy=True): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) The data. axis : int (0 by default) @@ -1036,8 +1042,8 @@ def maxabs_scale(X, *, axis=0, copy=True): otherwise (if 1) scale each sample. copy : boolean, optional, default is True - Set to False to perform inplace scaling and avoid a copy (if the input - is already a numpy array). + Whether a forced copy will be triggered. If copy=False, a copy might + be triggered by a conversion. See also -------- @@ -1074,43 +1080,42 @@ def maxabs_scale(X, *, axis=0, copy=True): class RobustScaler(TransformerMixin, BaseEstimator): """Scale features using statistics that are robust to outliers. - This Scaler removes the median and scales the data according to - the quantile range (defaults to IQR: Interquartile Range). - The IQR is the range between the 1st quartile (25th quantile) - and the 3rd quartile (75th quantile). + This Scaler removes the median and scales the data according to the + quantile range (defaults to IQR: Interquartile Range). The IQR is the range + between the 1st quartile (25th quantile) and the 3rd quartile (75th + quantile). 
- Centering and scaling happen independently on each feature by - computing the relevant statistics on the samples in the training - set. Median and interquartile range are then stored to be used on - later data using the ``transform`` method. + Centering and scaling happen independently on each feature by computing the + relevant statistics on the samples in the training set. Median and + interquartile range are then stored to be used on later data using the + ``transform`` method. - Standardization of a dataset is a common requirement for many - machine learning estimators. Typically this is done by removing the mean - and scaling to unit variance. However, outliers can often influence the - sample mean / variance in a negative way. In such cases, the median and - the interquartile range often give better results. + Standardization of a dataset is a common requirement for many machine + learning estimators. Typically this is done by removing the mean and + scaling to unit variance. However, outliers can often influence the sample + mean / variance in a negative way. In such cases, the median and the + interquartile range often give better results. Parameters ---------- - with_centering : boolean, True by default + + with_centering : boolean, default=True If True, center the data before scaling. This will cause ``transform`` to raise an exception when attempted on sparse matrices, because centering them entails building a dense matrix which in common use cases is likely to be too large to fit in memory. - with_scaling : boolean, True by default + with_scaling : boolean, default=True If True, scale the data to interquartile range. quantile_range : tuple (q_min, q_max), 0.0 < q_min < q_max < 100.0 Default: (25.0, 75.0) = (1st quantile, 3rd quantile) = IQR Quantile range used to calculate ``scale_``. - copy : boolean, optional, default is True - If False, try to avoid a copy and do inplace scaling instead. - This is not guaranteed to always work inplace; e.g. if the data is - not a NumPy array or scipy.sparse CSR matrix, a copy may still be - returned. + copy : boolean, optional, default=True + Whether a forced copy will be triggered. If copy=False, a copy might + be triggered by a conversion. Attributes ---------- @@ -1120,9 +1125,6 @@ class RobustScaler(TransformerMixin, BaseEstimator): scale_ : array of floats The (scaled) interquartile range for each feature in the training set. - .. versionadded:: 0.17 - *scale_* attribute. - Examples -------- >>> from cuml.preprocessing import RobustScaler @@ -1139,14 +1141,12 @@ class RobustScaler(TransformerMixin, BaseEstimator): See also -------- + robust_scale: Equivalent function without the estimator API. - :class:`cuml.decomposition.PCA` - Further removes the linear correlation across features with - 'whiten=True'. + cuml.decomposition.PCA: Further removes the linear correlation across + features with ``whiten=True``. - https://en.wikipedia.org/wiki/Median - https://en.wikipedia.org/wiki/Interquartile_range """ @_deprecate_positional_args def __init__(self, *, with_centering=True, with_scaling=True, @@ -1161,7 +1161,7 @@ def fit(self, X, y=None): Parameters ---------- - X : array-like, shape [n_samples, n_features] + X : {array-like, CSC matrix}, shape [n_samples, n_features] The data used to compute the median and quantiles used for later scaling along the features axis. 
""" @@ -1247,7 +1247,7 @@ def inverse_transform(self, X): Parameters ---------- - X : array-like + X : {array-like, sparse matrix} The data used to scale along the specified axis. """ check_is_fitted(self) @@ -1274,16 +1274,15 @@ def _more_tags(self): @_deprecate_positional_args def robust_scale(X, *, axis=0, with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0), copy=True): - """Standardize a dataset along any axis + """ + Standardize a dataset along any axis Center to the median and component wise scale according to the interquartile range. - Read more in the :ref:`User Guide `. - Parameters ---------- - X : array-like + X : {array-like, sparse matrix} The data to center and scale. axis : int (0 by default) @@ -1302,22 +1301,19 @@ def robust_scale(X, *, axis=0, with_centering=True, with_scaling=True, Default: (25.0, 75.0) = (1st quantile, 3rd quantile) = IQR Quantile range used to calculate ``scale_``. - .. versionadded:: 0.18 - copy : boolean, optional, default is True - set to False to perform inplace row normalization and avoid a - copy (if the input is already a numpy array or a scipy.sparse - CSR matrix and if axis is 1). + Whether a forced copy will be triggered. If copy=False, a copy might + be triggered by a conversion. Notes ----- - This implementation will refuse to center scipy.sparse matrices + This implementation will refuse to center sparse matrices since it would make them non-sparse and would potentially crash the program with memory exhaustion problems. Instead the caller is expected to either set explicitly `with_centering=False` (in that case, only variance scaling will be - performed on the features of the CSR matrix) or to call `X.toarray()` + performed on the features of the CSR matrix) or to densify the matrix if he/she expects the materialized dense array to fit in memory. To avoid memory copy the caller should pass a CSR matrix. @@ -1325,6 +1321,7 @@ def robust_scale(X, *, axis=0, with_centering=True, with_scaling=True, See also -------- RobustScaler: Performs centering and scaling using the ``Transformer`` API + """ output_type = get_input_type(X) X = check_array(X, accept_sparse=('csr', 'csc'), copy=False, @@ -1348,7 +1345,6 @@ def robust_scale(X, *, axis=0, with_centering=True, with_scaling=True, return to_output_type(X, output_type) -@check_cupy8() class PolynomialFeatures(TransformerMixin, BaseEstimator): """Generate polynomial and interaction features. @@ -1415,6 +1411,7 @@ class PolynomialFeatures(TransformerMixin, BaseEstimator): polynomially in the number of features of the input array, and exponentially in the degree. High degrees can cause overfitting. """ + @check_cupy8() @_deprecate_positional_args def __init__(self, degree=2, *, interaction_only=False, include_bias=True, order='C'): @@ -1424,6 +1421,7 @@ def __init__(self, degree=2, *, interaction_only=False, include_bias=True, self.order = order @staticmethod + @check_cupy8() def _combinations(n_features, degree, interaction_only, include_bias): comb = (combinations if interaction_only else combinations_w_r) start = int(not include_bias) @@ -1498,7 +1496,7 @@ def transform(self, X): Parameters ---------- - X : array-like or CSR/CSC sparse matrix, shape [n_samples, n_features] + X : {array-like, sparse matrix}, shape [n_samples, n_features] The data to transform, row by row. 
Prefer CSR over CSC for sparse input (for speed), but CSC is @@ -1516,7 +1514,7 @@ def transform(self, X): Returns ------- - XP : np.ndarray or CSR/CSC sparse matrix, shape [n_samples, NP] + XP : {array-like, sparse matrix}, shape [n_samples, NP] The matrix of features, where NP is the number of polynomial features generated from the combination of inputs. """ @@ -1633,8 +1631,8 @@ def normalize(X, norm='l2', *, axis=1, copy=True, return_norm=False): ---------- X : {array-like, sparse matrix}, shape [n_samples, n_features] The data to normalize, element by element. - scipy.sparse matrices should be in CSR format to avoid an - un-necessary copy. + Please provide a CSC matrix to normalize along axis 0; + conversely, provide a CSR matrix to normalize along axis 1. norm : 'l1', 'l2', or 'max', optional ('l2' by default) The norm to use to normalize each non zero sample (or each non-zero @@ -1645,9 +1643,8 @@ each sample, otherwise (if 0) normalize each feature. copy : boolean, optional, default True - set to False to perform inplace row normalization and avoid a - copy (if the input is already a numpy array or a scipy.sparse - CSR matrix and if axis is 1). + Whether a forced copy will be triggered. If copy=False, a copy might + be triggered by a conversion. return_norm : boolean, default False whether to return the computed norms @@ -1719,7 +1716,6 @@ def normalize(X, norm='l2', *, axis=1, copy=True, return_norm=False): return X -@check_cupy8() class Normalizer(TransformerMixin, BaseEstimator): """Normalize samples individually to unit norm. Each sample (i.e. each row of the data matrix) with at least one non zero component is rescaled independently of other samples so that its norm (l1, l2 or inf) equals one. This transformer is able to work both with dense numpy arrays and - scipy.sparse matrix (use CSR format if you want to avoid the burden of - a copy / conversion). + sparse matrices. Scaling inputs to unit norms is a common operation for text classification or clustering for instance. For instance the dot @@ -1745,9 +1740,8 @@ class Normalizer(TransformerMixin, BaseEstimator): values. copy : boolean, optional, default True - set to False to perform inplace row normalization and avoid a - copy (if the input is already a numpy array or a scipy.sparse - CSR matrix). + Whether a forced copy will be triggered. If copy=False, a copy might + be triggered by a conversion. Examples -------- @@ -1774,6 +1768,7 @@ class Normalizer(TransformerMixin, BaseEstimator): normalize: Equivalent function without the estimator API. """ + @check_cupy8() @_deprecate_positional_args def __init__(self, norm='l2', *, copy=True): self.norm = norm @@ -1787,7 +1782,7 @@ def fit(self, X, y=None): Parameters ---------- - X : array-like + X : {array-like, CSR matrix} """ self._validate_data(X, accept_sparse='csr') return self @@ -1797,11 +1792,11 @@ def transform(self, X, copy=None): Parameters ---------- - X : {array-like, sparse matrix}, shape [n_samples, n_features] - The data to normalize, row by row. scipy.sparse matrices should be - in CSR format to avoid an un-necessary copy. + X : {array-like, CSR matrix}, shape [n_samples, n_features] + The data to normalize, row by row. copy : bool, optional (default: None) - Copy the input X or not. + Whether a forced copy will be triggered. If copy=False, + a copy might be triggered by a conversion.
""" output_type = get_input_type(X) copy = copy if copy is not None else self.copy @@ -1815,23 +1810,20 @@ def _more_tags(self): @_deprecate_positional_args def binarize(X, *, threshold=0.0, copy=True): - """Boolean thresholding of array-like or scipy.sparse matrix + """Boolean thresholding of array-like or sparse matrix Parameters ---------- X : {array-like, sparse matrix}, shape [n_samples, n_features] The data to binarize, element by element. - scipy.sparse matrices should be in CSR or CSC format to avoid an - un-necessary copy. threshold : float, optional (0.0 by default) Feature values below or equal to this are replaced by 0, above it by 1. Threshold may not be less than 0 for operations on sparse matrices. copy : boolean, optional, default True - set to False to perform inplace binarization and avoid a copy - (if the input is already a numpy array or a scipy.sparse CSR / CSC - matrix and if axis is 1). + Whether a forced copy will be triggered. If copy=False, a copy might + be triggered by a conversion. See also -------- @@ -1878,8 +1870,8 @@ class Binarizer(TransformerMixin, BaseEstimator): Threshold may not be less than 0 for operations on sparse matrices. copy : boolean, optional, default True - set to False to perform inplace binarization and avoid a copy (if - the input is already a numpy array or a scipy.sparse CSR matrix). + Whether a forced copy will be triggered. If copy=False, a copy might + be triggered by a conversion. Examples -------- @@ -1921,7 +1913,7 @@ def fit(self, X, y=None): Parameters ---------- - X : array-like + X : {array-like, sparse matrix} """ self._validate_data(X, accept_sparse=['csr', 'csc']) return self @@ -1933,11 +1925,10 @@ def transform(self, X, copy=None): ---------- X : {array-like, sparse matrix}, shape [n_samples, n_features] The data to binarize, element by element. - scipy.sparse matrices should be in CSR format to avoid an - un-necessary copy. copy : bool - Copy the input X or not. + Whether a forced copy will be triggered. If copy=False, + a copy might be triggered by a conversion. """ copy = copy if copy is not None else self.copy return binarize(X, threshold=self.threshold, copy=copy) @@ -2024,7 +2015,8 @@ def transform(self, K, copy=True): Kernel matrix. copy : boolean, optional, default True - Set to False to perform inplace computation. + Whether a forced copy will be triggered. If copy=False, + a copy might be triggered by a conversion. Returns ------- @@ -2748,6 +2740,7 @@ class PowerTransformer(TransformerMixin, BaseEstimator): .. [2] G.E.P. Box and D.R. Cox, "An Analysis of Transformations", Journal of the Royal Statistical Society B, 26, 211-252 (1964). + """ @_deprecate_positional_args def __init__(self, method='yeo-johnson', *, standardize=True, copy=True): diff --git a/python/cuml/_thirdparty/sklearn/preprocessing/_discretization.py b/python/cuml/_thirdparty/sklearn/preprocessing/_discretization.py index 4dae6a7424..66ce22b82d 100644 --- a/python/cuml/_thirdparty/sklearn/preprocessing/_discretization.py +++ b/python/cuml/_thirdparty/sklearn/preprocessing/_discretization.py @@ -1,11 +1,18 @@ # -*- coding: utf-8 -*- -# Author: Henry Lin +# Original authors from Scikit-Learn: +# Henry Lin # Tom Dupré la Tour # License: BSD +# This code originates from the Scikit-Learn library; +# it has since been modified to allow GPU acceleration. +# This code is under the BSD 3-Clause license. +# Authors mentioned above do not endorse or promote this derived work.
+ + import numbers import cupy as np import numpy as cpu_np @@ -37,15 +44,10 @@ def digitize(x, bins): return out -@check_cupy8() class KBinsDiscretizer(TransformerMixin, BaseEstimator): """ Bin continuous data into intervals. - Read more in the :ref:`User Guide `. - - .. versionadded:: 0.20 - Parameters ---------- n_bins : int or array-like, shape (n_features,) (default=5) @@ -88,7 +90,7 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator): See Also -------- - sklearn.preprocessing.Binarizer : Class used to bin values as ``0`` or + cuml.preprocessing.Binarizer : Class used to bin values as ``0`` or ``1`` based on a parameter ``threshold``. Notes @@ -138,6 +140,7 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator): """ + @check_cupy8() @_deprecate_positional_args def __init__(self, n_bins=5, *, encode='onehot', strategy='quantile'): self.n_bins = n_bins diff --git a/python/cuml/_thirdparty/sklearn/preprocessing/_encoders.py b/python/cuml/_thirdparty/sklearn/preprocessing/_encoders.py index 3b0e43c151..bcfb00ffd8 100644 --- a/python/cuml/_thirdparty/sklearn/preprocessing/_encoders.py +++ b/python/cuml/_thirdparty/sklearn/preprocessing/_encoders.py @@ -1,7 +1,15 @@ -# Authors: Andreas Mueller +# Original authors from Sckit-Learn: +# Andreas Mueller # Joris Van den Bossche # License: BSD 3 clause + +# This code originates from the Scikit-Learn library, +# it was since modified to allow GPU acceleration. +# This code is under BSD 3 clause license. +# Authors mentioned above do not endorse or promote this production. + + import numpy as np from scipy import sparse diff --git a/python/cuml/_thirdparty/sklearn/preprocessing/_function_transformer.py b/python/cuml/_thirdparty/sklearn/preprocessing/_function_transformer.py index 9dd8e3abe8..563ea952ca 100644 --- a/python/cuml/_thirdparty/sklearn/preprocessing/_function_transformer.py +++ b/python/cuml/_thirdparty/sklearn/preprocessing/_function_transformer.py @@ -1,3 +1,8 @@ +# This code originates from the Scikit-Learn library, +# it was since modified to allow GPU acceleration. +# This code is under BSD 3 clause license. + + import warnings from ..utils.skl_dependencies import BaseEstimator, TransformerMixin diff --git a/python/cuml/_thirdparty/sklearn/preprocessing/_imputation.py b/python/cuml/_thirdparty/sklearn/preprocessing/_imputation.py index 0ddd39b369..d71ba989f8 100644 --- a/python/cuml/_thirdparty/sklearn/preprocessing/_imputation.py +++ b/python/cuml/_thirdparty/sklearn/preprocessing/_imputation.py @@ -1,7 +1,15 @@ -# Authors: Nicolas Tresegnie +# Original authors from Sckit-Learn: +# Nicolas Tresegnie # Sergey Feldman # License: BSD 3 clause + +# This code originates from the Scikit-Learn library, +# it was since modified to allow GPU acceleration. +# This code is under BSD 3 clause license. +# Authors mentioned above do not endorse or promote this production. + + import numbers import warnings @@ -136,7 +144,6 @@ def _more_tags(self): return {'allow_nan': is_scalar_nan(self.missing_values)} -@check_cupy8() class SimpleImputer(_BaseImputer): """Imputation transformer for completing missing values. @@ -160,8 +167,7 @@ class SimpleImputer(_BaseImputer): - If "constant", then replace missing values with fill_value. Can be used with strings or numeric data. - .. versionadded:: 0.20 - strategy="constant" for fixed value imputation. + strategy="constant" for fixed value imputation. 
fill_value : string or numerical value, default=None When strategy == "constant", fill_value is used to replace all @@ -220,6 +226,7 @@ class SimpleImputer(_BaseImputer): upon :meth:`transform` if strategy is not "constant". """ + @check_cupy8() @_deprecate_positional_args def __init__(self, *, missing_values=np.nan, strategy="mean", fill_value=None, verbose=0, copy=True, add_indicator=False): diff --git a/python/cuml/_thirdparty/sklearn/preprocessing/_label.py b/python/cuml/_thirdparty/sklearn/preprocessing/_label.py index 88fad3670c..360f566c10 100644 --- a/python/cuml/_thirdparty/sklearn/preprocessing/_label.py +++ b/python/cuml/_thirdparty/sklearn/preprocessing/_label.py @@ -1,4 +1,5 @@ -# Authors: Alexandre Gramfort +# Original authors from Scikit-Learn: +# Alexandre Gramfort # Mathieu Blondel # Olivier Grisel # Andreas Mueller @@ -6,6 +7,13 @@ # Hamzeh Alsalhi # License: BSD 3 clause + +# This code originates from the Scikit-Learn library; +# it has since been modified to allow GPU acceleration. +# This code is under the BSD 3-Clause license. +# Authors mentioned above do not endorse or promote this derived work. + + from collections import defaultdict import itertools import array diff --git a/python/cuml/_thirdparty/sklearn/utils/__init__.py b/python/cuml/_thirdparty/sklearn/utils/__init__.py index 0ec0a96c42..bcbed6b6cd 100644 --- a/python/cuml/_thirdparty/sklearn/utils/__init__.py +++ b/python/cuml/_thirdparty/sklearn/utils/__init__.py @@ -1,2 +1,7 @@ +# This code originates from the Scikit-Learn library; +# it has since been modified to allow GPU acceleration. +# This code is under the BSD 3-Clause license. + + from . import validation from . import extmath diff --git a/python/cuml/_thirdparty/sklearn/utils/extmath.py b/python/cuml/_thirdparty/sklearn/utils/extmath.py index b07006c951..af1b6a6d32 100644 --- a/python/cuml/_thirdparty/sklearn/utils/extmath.py +++ b/python/cuml/_thirdparty/sklearn/utils/extmath.py @@ -1,7 +1,8 @@ """ Extended math utilities. """ -# Authors: Gael Varoquaux +# Original authors from Scikit-Learn: +# Gael Varoquaux # Alexandre Gramfort # Alexandre T. Passos # Olivier Grisel @@ -12,6 +13,12 @@ # License: BSD 3 clause +# This code originates from the Scikit-Learn library; +# it has since been modified to allow GPU acceleration. +# This code is under the BSD 3-Clause license. +# Authors mentioned above do not endorse or promote this derived work. + + import cupy as np import cupyx from cupy import sparse diff --git a/python/cuml/_thirdparty/sklearn/utils/skl_dependencies.py b/python/cuml/_thirdparty/sklearn/utils/skl_dependencies.py index 7fb4fe6ba8..a0554c80cd 100644 --- a/python/cuml/_thirdparty/sklearn/utils/skl_dependencies.py +++ b/python/cuml/_thirdparty/sklearn/utils/skl_dependencies.py @@ -3,9 +3,17 @@ require a version-dependent import from the sklearn library """ -# Author: Gael Varoquaux +# Original authors from Scikit-Learn: +# Gael Varoquaux # License: BSD 3 clause + +# This code originates from the Scikit-Learn library; +# it has since been modified to allow GPU acceleration. +# This code is under the BSD 3-Clause license. +# Authors mentioned above do not endorse or promote this derived work.
+ + import warnings from collections import defaultdict import inspect diff --git a/python/cuml/_thirdparty/sklearn/utils/sparsefuncs.py b/python/cuml/_thirdparty/sklearn/utils/sparsefuncs.py index 4cbdd5f12a..ec6f2289c5 100644 --- a/python/cuml/_thirdparty/sklearn/utils/sparsefuncs.py +++ b/python/cuml/_thirdparty/sklearn/utils/sparsefuncs.py @@ -1,8 +1,17 @@ -# Authors: Manoj Kumar +# Original authors from Scikit-Learn: +# Manoj Kumar # Thomas Unterthiner # Giorgio Patrini # # License: BSD 3 clause + + +# This code originates from the Scikit-Learn library; +# it has since been modified to allow GPU acceleration. +# This code is under the BSD 3-Clause license. +# Authors mentioned above do not endorse or promote this derived work. + + from scipy import sparse as cpu_sp from cupy import sparse as gpu_sp import cupy as np diff --git a/python/cuml/_thirdparty/sklearn/utils/validation.py b/python/cuml/_thirdparty/sklearn/utils/validation.py index 146e8e7925..d288de7ae1 100644 --- a/python/cuml/_thirdparty/sklearn/utils/validation.py +++ b/python/cuml/_thirdparty/sklearn/utils/validation.py @@ -1,6 +1,7 @@ """Utilities for input validation""" -# Authors: Olivier Grisel +# Original authors from Scikit-Learn: +# Olivier Grisel # Gael Varoquaux # Andreas Mueller # Lars Buitinck @@ -9,6 +10,13 @@ # Sylvain Marie # License: BSD 3 clause + +# This code originates from the Scikit-Learn library; +# it has since been modified to allow GPU acceleration. +# This code is under the BSD 3-Clause license. +# Authors mentioned above do not endorse or promote this derived work. + + from functools import wraps import warnings import numbers diff --git a/python/cuml/common/import_utils.py b/python/cuml/common/import_utils.py index 8fd6346111..a6c642fefd 100644 --- a/python/cuml/common/import_utils.py +++ b/python/cuml/common/import_utils.py @@ -15,6 +15,7 @@ # +import inspect import numba from distutils.version import LooseVersion @@ -127,6 +128,11 @@ def check_cupy8(conf=None): """ def check_cupy8_dec(func): + + assert not inspect.isclass(func), \ + ("Do not use this decorator on classes. Instead decorate " + "__init__ and any static or class methods.") + @wraps(func) def inner(*args, **kwargs): import cupy as cp diff --git a/python/cuml/experimental/decomposition/incremental_pca.py b/python/cuml/experimental/decomposition/incremental_pca.py index 80664e3b2c..b302a8e499 100644 --- a/python/cuml/experimental/decomposition/incremental_pca.py +++ b/python/cuml/experimental/decomposition/incremental_pca.py @@ -457,11 +457,15 @@ def _validate_sparse_input(X): Parameters ---------- + X : scipy.sparse or cupyx.scipy.sparse object A sparse input + Returns ------- + X : The input converted to a cupyx.scipy.sparse.csr_matrix object + """ acceptable_dtypes = ('float32', 'float64') @@ -494,14 +498,18 @@ def _gen_batches(n, batch_size, min_batch_size=0): Parameters ---------- + n : int batch_size : int Number of element in each batch min_batch_size : int, default=0 Minimum batch size to produce. + Yields ------ + slice of batch_size elements + """ if not isinstance(batch_size, numbers.Integral): @@ -526,8 +534,10 @@ def _safe_accumulator_op(op, x, *args, **kwargs): This function provides numpy accumulator functions with a float64 dtype when used on a floating point input. This prevents accumulator overflow on smaller floating point dtypes.
+ Parameters ---------- + op : function A cupy accumulator function such as cp.mean or cp.sum x : cupy array @@ -537,9 +547,12 @@ def _safe_accumulator_op(op, x, *args, **kwargs): input x **kwargs : keyword arguments Keyword arguments passed to the accumulator function + Returns ------- + result : The output of the accumulator function passed to this function + """ if cp.issubdtype(x.dtype, cp.floating) and x.dtype.itemsize < 8: @@ -550,7 +563,8 @@ def _safe_accumulator_op(op, x, *args, **kwargs): def _incremental_mean_and_var(X, last_mean, last_variance, last_sample_count): - """Calculate mean update and a Youngs and Cramer variance update. + """ + Calculate mean update and a Youngs and Cramer variance update. last_mean and last_variance are statistics computed at the last step by the function. Both must be initialized to 0.0. In case no scaling is required last_variance can be None. The mean is always required and returned because @@ -558,27 +572,34 @@ def _incremental_mean_and_var(X, last_mean, last_variance, last_sample_count): number of samples encountered until now. From the paper "Algorithms for computing the sample variance: analysis and recommendations", by Chan, Golub, and LeVeque. + Parameters ---------- + X : array-like, shape (n_samples, n_features) Data to use for variance update last_mean : array-like, shape: (n_features,) last_variance : array-like, shape: (n_features,) last_sample_count : array-like, shape (n_features,) + Returns ------- + updated_mean : array, shape (n_features,) updated_variance : array, shape (n_features,) If None, only mean is computed updated_sample_count : array, shape (n_features,) + Notes ----- NaNs are ignored during the algorithm. + References ---------- T. Chan, G. Golub, R. LeVeque. Algorithms for computing the sample variance: recommendations, The American Statistician, Vol. 37, No. 3, pp. 242-247 + """ # old = stats until now @@ -616,11 +637,14 @@ def _incremental_mean_and_var(X, last_mean, last_variance, last_sample_count): def _svd_flip(u, v, u_based_decision=True): - """Sign correction to ensure deterministic output from SVD. + """ + Sign correction to ensure deterministic output from SVD. Adjusts the columns of u and the rows of v such that the loadings in the columns in u that are largest in absolute value are always positive. + Parameters ---------- + u : cupy.ndarray u and v are the output of `cupy.linalg.svd` v : cupy.ndarray @@ -629,9 +653,11 @@ def _svd_flip(u, v, u_based_decision=True): If True, use the columns of u as the basis for sign flipping. Otherwise, use the rows of v. The choice of which variable to base the decision on is generally algorithm dependent. + Returns ------- u_adjusted, v_adjusted : arrays with the same dimensions as the input. + """ if u_based_decision: # columns of u, rows of v diff --git a/python/cuml/experimental/hyperopt_utils/plotting_utils.py b/python/cuml/experimental/hyperopt_utils/plotting_utils.py index 035d42701a..887717ceb3 100644 --- a/python/cuml/experimental/hyperopt_utils/plotting_utils.py +++ b/python/cuml/experimental/hyperopt_utils/plotting_utils.py @@ -22,8 +22,8 @@ def plot_heatmap(df, col1, col2): """ - Generates a heatmap to highlight interactions - of two parameters specified in col1 and col2. + Generates a heatmap to highlight interactions of two parameters specified + in col1 and col2. 
Parameters ---------- @@ -32,9 +32,6 @@ col1 : string; Name of the first parameter col2: string; Name of the second parameter - Output - ---------- - A heatmap using seaborn """ max_scores = df.groupby([col1, col2]).max() max_scores = max_scores.unstack()[['mean_test_score']] @@ -43,8 +40,8 @@ def plot_search_results(res): """ - Plots by fixing all paramters except one parameter to - its best value using matplotlib. + Plots by fixing all parameters except one parameter to its best value using + matplotlib. Accepts results from grid or random search from dask-ml. @@ -52,9 +49,6 @@ ---------- res : results from Grid or Random Search - Output - ---------- - As many plots as the parameters that were tuned """ # Results from grid search results = res.cv_results_ diff --git a/python/cuml/experimental/preprocessing/__init__.py b/python/cuml/experimental/preprocessing/__init__.py index c7f2cbfaa7..ab3cdf8124 100644 --- a/python/cuml/experimental/preprocessing/__init__.py +++ b/python/cuml/experimental/preprocessing/__init__.py @@ -19,3 +19,23 @@ SimpleImputer, RobustScaler, KBinsDiscretizer from cuml._thirdparty.sklearn.preprocessing import scale, minmax_scale, \ normalize, add_dummy_feature, binarize, robust_scale + +__all__ = [ + # Classes + 'Binarizer', + 'KBinsDiscretizer', + 'MaxAbsScaler', + 'MinMaxScaler', + 'Normalizer', + 'PolynomialFeatures', + 'RobustScaler', + 'SimpleImputer', + 'StandardScaler', + # Functions + 'add_dummy_feature', + 'binarize', + 'minmax_scale', + 'normalize', + 'robust_scale', + 'scale', +]
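
Example (not part of the patch): a minimal usage sketch of the `cuml.experimental.preprocessing` module documented in `docs/source/api.rst` above. The input array and parameter values below are illustrative assumptions; the import path, the `StandardScaler` and `normalize` names, and the `copy` keyword semantics come from this changeset.

import cupy as cp

# The experimental submodule is not pulled in by `import cuml`;
# it has to be imported explicitly.
from cuml.experimental import preprocessing

# Hypothetical sample data kept on the GPU as a CuPy array.
X = cp.array([[1.0, -2.0], [3.0, 4.0], [5.0, -6.0]])

# copy=False does not guarantee in-place operation: a copy may still be
# triggered if the input requires a dtype or layout conversion.
scaler = preprocessing.StandardScaler(copy=False)
X_std = scaler.fit_transform(X)

# Row-wise L2 normalization through the function API.
X_norm = preprocessing.normalize(X, norm='l2', axis=1)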