MNT Make modules private in feature_extraction #15321

Merged
4 changes: 4 additions & 0 deletions .gitignore
@@ -130,6 +130,10 @@ sklearn/svm/libsvm.py
sklearn/svm/libsvm_sparse.py
sklearn/svm/liblinear.py

+sklearn/feature_extraction/dict_vectorizer.py
+sklearn/feature_extraction/hashing.py
+sklearn/feature_extraction/stop_words.py
+
sklearn/linear_model/base.py
sklearn/linear_model/bayes.py
sklearn/linear_model/cd_fast.py
7 changes: 7 additions & 0 deletions sklearn/_build_utils/deprecated_modules.py
@@ -86,6 +86,13 @@
     'set_verbosity_wrap'),
    ('_liblinear', 'sklearn.svm.liblinear', 'sklearn.svm', 'train_wrap'),

+    ('_dict_vectorizer', 'sklearn.feature_extraction.dict_vectorizer',
+     'sklearn.feature_extraction', 'DictVectorizer'),
+    ('_hashing', 'sklearn.feature_extraction.hashing',
+     'sklearn.feature_extraction', 'FeatureHasher'),
+    ('_stop_words', 'sklearn.feature_extraction.stop_words',
+     'sklearn.feature_extraction.text', 'ENGLISH_STOP_WORDS'),
+
    ('_base', 'sklearn.linear_model.base', 'sklearn.linear_model',
     'LinearRegression'),
    ('_cd_fast', 'sklearn.linear_model.cd_fast', 'sklearn.linear_model',
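Each tuple registers a shim for an old public module path: the new private module, the deprecated path, the path users should import from instead, and one public name used to check the re-export. The shims themselves are generated at build time, which is why the corresponding .py files are git-ignored above. A rough sketch of the pattern such a generated shim follows — illustrative only; the actual generated code, warning category, and message will differ:

    # sklearn/feature_extraction/dict_vectorizer.py -- illustrative shim,
    # not the real generated file.
    import warnings

    warnings.warn(
        "The sklearn.feature_extraction.dict_vectorizer module is deprecated; "
        "import DictVectorizer from sklearn.feature_extraction instead.",
        FutureWarning,
    )

    from ._dict_vectorizer import *  # noqa  re-export the private module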
4 changes: 2 additions & 2 deletions sklearn/feature_extraction/__init__.py
@@ -4,8 +4,8 @@
images.
"""

-from .dict_vectorizer import DictVectorizer
-from .hashing import FeatureHasher
+from ._dict_vectorizer import DictVectorizer
+from ._hashing import FeatureHasher
from .image import img_to_graph, grid_to_graph
from . import text

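Note the user-facing contract: the module files become private, but the public names are still re-exported from the package, so the supported imports are unchanged. For example, both of these continue to work on this branch:

    from sklearn.feature_extraction import DictVectorizer, FeatureHasher
    from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS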
@@ -10,7 +10,7 @@
from ..base import BaseEstimator, TransformerMixin

if not IS_PYPY:
-    from ._hashing import transform as _hashing_transform
+    from ._hashing_fast import transform as _hashing_transform
else:
    def _hashing_transform(*args, **kwargs):
        raise NotImplementedError(
50 changes: 45 additions & 5 deletions sklearn/feature_extraction/image.py
@@ -15,7 +15,7 @@
from scipy import sparse
from numpy.lib.stride_tricks import as_strided

-from ..utils import check_array, check_random_state
+from ..utils import check_array, check_random_state, deprecated
from ..base import BaseEstimator

__all__ = ['PatchExtractor',
@@ -241,7 +241,7 @@ def _compute_n_patches(i_h, i_w, p_h, p_w, max_patches=None):
    return all_patches


-def extract_patches(arr, patch_shape=8, extraction_step=1):
+def _extract_patches(arr, patch_shape=8, extraction_step=1):
    """Extracts patches of any n-dimensional array in place using strides.

    Given an n-dimensional array it will return a 2n-dimensional array with
@@ -298,6 +298,46 @@ def extract_patches(arr, patch_shape=8, extraction_step=1):
    patches = as_strided(arr, shape=shape, strides=strides)
    return patches

+@deprecated("The function feature_extraction.image.extract_patches has been "
+            "deprecated in 0.22 and will be removed in 0.24.")
+def extract_patches(arr, patch_shape=8, extraction_step=1):
+    """Extracts patches of any n-dimensional array in place using strides.
+
+    Given an n-dimensional array it will return a 2n-dimensional array with
+    the first n dimensions indexing patch position and the last n indexing
+    the patch content. This operation is immediate (O(1)). A reshape
+    performed on the first n dimensions will cause numpy to copy data, leading
+    to a list of extracted patches.
+
+    Read more in the :ref:`User Guide <image_feature_extraction>`.
+
+    Parameters
+    ----------
+    arr : ndarray
+        n-dimensional array of which patches are to be extracted
+
+    patch_shape : integer or tuple of length arr.ndim
+        Indicates the shape of the patches to be extracted. If an
+        integer is given, the shape will be a hypercube of
+        sidelength given by its value.
+
+    extraction_step : integer or tuple of length arr.ndim
+        Indicates step size at which extraction shall be performed.
+        If integer is given, then the step is uniform in all dimensions.
+
+
+    Returns
+    -------
+    patches : strided ndarray
+        2n-dimensional array indexing patches on first n dimensions and
+        containing patches on the last n dimensions. These dimensions
+        are fake, but this way no data is copied. A simple reshape invokes
+        a copying operation to obtain a list of patches:
+        result.reshape([-1] + list(patch_shape))
+    """
+    return _extract_patches(arr, patch_shape=patch_shape,
+                            extraction_step=extraction_step)
+

def extract_patches_2d(image, patch_size, max_patches=None, random_state=None):
    """Reshape a 2D image into a collection of patches
@@ -373,9 +413,9 @@ def extract_patches_2d(image, patch_size, max_patches=None, random_state=None):
    image = image.reshape((i_h, i_w, -1))
    n_colors = image.shape[-1]

-    extracted_patches = extract_patches(image,
-                                        patch_shape=(p_h, p_w, n_colors),
-                                        extraction_step=1)
+    extracted_patches = _extract_patches(image,
+                                         patch_shape=(p_h, p_w, n_colors),
+                                         extraction_step=1)

    n_patches = _compute_n_patches(i_h, i_w, p_h, p_w, max_patches)
    if max_patches:
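A quick way to see the resulting behavior, mirroring the test added below in test_image.py (a sketch assuming this branch is installed; the array and sizes are arbitrary):

    import numpy as np
    import pytest
    from sklearn.feature_extraction.image import extract_patches, _extract_patches

    arr = np.arange(64).reshape(8, 8)

    # The private helper works exactly as before, with no warning.
    patches = _extract_patches(arr, patch_shape=4, extraction_step=2)
    assert patches.shape == (3, 3, 4, 4)  # (8 - 4) // 2 + 1 = 3 positions per axis

    # The old public name still works, but now emits a DeprecationWarning.
    with pytest.warns(DeprecationWarning):
        extract_patches(arr, patch_shape=4, extraction_step=2)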
4 changes: 2 additions & 2 deletions sklearn/feature_extraction/setup.py
@@ -12,8 +12,8 @@ def configuration(parent_package='', top_path=None):
        libraries.append('m')

    if platform.python_implementation() != 'PyPy':
-        config.add_extension('_hashing',
-                             sources=['_hashing.pyx'],
+        config.add_extension('_hashing_fast',
+                             sources=['_hashing_fast.pyx'],
                             include_dirs=[numpy.get_include()],
                             libraries=libraries)
    config.add_subpackage("tests")
2 changes: 1 addition & 1 deletion sklearn/feature_extraction/tests/test_feature_hasher.py
@@ -48,7 +48,7 @@ def test_feature_hasher_strings():
def test_hashing_transform_seed():
    # check the influence of the seed when computing the hashes
    # import is here to avoid importing on pypy
-    from sklearn.feature_extraction._hashing import (
+    from sklearn.feature_extraction._hashing_fast import (
        transform as _hashing_transform)
    raw_X = [["foo", "bar", "baz", "foo".encode("ascii")],
             ["bar".encode("ascii"), "baz", "quux"]]
17 changes: 13 additions & 4 deletions sklearn/feature_extraction/tests/test_image.py
@@ -10,7 +10,8 @@

from sklearn.feature_extraction.image import (
    img_to_graph, grid_to_graph, extract_patches_2d,
-    reconstruct_from_patches_2d, PatchExtractor, extract_patches)
+    reconstruct_from_patches_2d, PatchExtractor, _extract_patches,
+    extract_patches)
from sklearn.utils.testing import ignore_warnings


@@ -303,8 +304,8 @@ def test_extract_patches_strided():
         last_patch) in zip(image_shapes, patch_sizes, patch_steps,
                            expected_views, last_patches):
        image = np.arange(np.prod(image_shape)).reshape(image_shape)
-        patches = extract_patches(image, patch_shape=patch_size,
-                                  extraction_step=patch_step)
+        patches = _extract_patches(image, patch_shape=patch_size,
+                                   extraction_step=patch_step)

        ndim = len(image_shape)

@@ -321,7 +322,7 @@ def test_extract_patches_square():
    i_h, i_w = face.shape
    p = 8
    expected_n_patches = ((i_h - p + 1), (i_w - p + 1))
-    patches = extract_patches(face, patch_shape=p)
+    patches = _extract_patches(face, patch_shape=p)
    assert patches.shape == (expected_n_patches[0],
                             expected_n_patches[1], p, p)

@@ -333,3 +334,11 @@ def test_width_patch():
    extract_patches_2d(x, (4, 1))
    with pytest.raises(ValueError):
        extract_patches_2d(x, (1, 4))
+
+
+# TODO: Remove in 0.24
+def test_extract_patches_deprecated():
+    msg = ("The function feature_extraction.image.extract_patches has been "
+           "deprecated in 0.22 and will be removed in 0.24.")
+    with pytest.warns(DeprecationWarning, match=msg):
+        extract_patches(downsampled_face)
12 changes: 12 additions & 0 deletions sklearn/feature_extraction/tests/test_text.py
@@ -14,6 +14,7 @@
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.feature_extraction.text import VectorizerMixin

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

@@ -1358,3 +1359,14 @@ def test_unused_parameters_warn(Vectorizer, stop_words,
    )
    with pytest.warns(UserWarning, match=msg):
        vect.fit(train_data)
+
+
+# TODO: Remove in 0.24
+def test_vectorizermixin_is_deprecated():
+    class MyVectorizer(VectorizerMixin):
+        pass
+
+    msg = ("VectorizerMixin is deprecated in version 0.22 and will be removed "
+           "in version 0.24.")
+    with pytest.warns(DeprecationWarning, match=msg):
+        MyVectorizer()
18 changes: 12 additions & 6 deletions sklearn/feature_extraction/text.py
@@ -27,10 +27,10 @@

from ..base import BaseEstimator, TransformerMixin
from ..preprocessing import normalize
-from .hashing import FeatureHasher
-from .stop_words import ENGLISH_STOP_WORDS
+from ._hashing import FeatureHasher
+from ._stop_words import ENGLISH_STOP_WORDS
from ..utils.validation import check_is_fitted, check_array, FLOAT_DTYPES
-from ..utils import _IS_32BIT
+from ..utils import _IS_32BIT, deprecated
from ..utils.fixes import _astype_copy_false
from ..exceptions import ChangedBehaviorWarning, NotFittedError

@@ -184,7 +184,7 @@ def _check_stop_list(stop):
    return frozenset(stop)


-class VectorizerMixin:
+class _VectorizerMixin:
    """Provides common code for text vectorizers (tokenization logic)."""

    _white_spaces = re.compile(r"\s\s+")
@@ -500,7 +500,13 @@ def _warn_for_unused_params(self):
                          " since 'analyzer' != 'word'")


-class HashingVectorizer(TransformerMixin, VectorizerMixin, BaseEstimator):
+@deprecated("VectorizerMixin is deprecated in version "
+            "0.22 and will be removed in version 0.24.")
+class VectorizerMixin(_VectorizerMixin):
+    pass
+
+
+class HashingVectorizer(TransformerMixin, _VectorizerMixin, BaseEstimator):
    """Convert a collection of text documents to a matrix of token occurrences

    It turns a collection of text documents into a scipy.sparse matrix holding
@@ -790,7 +796,7 @@ def _document_frequency(X):
    return np.diff(X.indptr)


-class CountVectorizer(VectorizerMixin, BaseEstimator):
+class CountVectorizer(_VectorizerMixin, BaseEstimator):
    """Convert a collection of text documents to a matrix of token counts

    This implementation produces a sparse representation of the counts using
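Unlike the module renames above, VectorizerMixin is kept importable through a deprecated subclass shim. Because the deprecated decorator wraps instantiation, defining a subclass of the old name stays silent and the warning only fires when an instance is created, which is exactly what the new test in test_text.py asserts. A short sketch of that behavior (assuming this branch is installed):

    import pytest
    from sklearn.feature_extraction.text import VectorizerMixin

    class MyVectorizer(VectorizerMixin):  # subclass definition: no warning
        pass

    with pytest.warns(DeprecationWarning, match="VectorizerMixin is deprecated"):
        MyVectorizer()  # instantiation: the warning fires here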
2 changes: 1 addition & 1 deletion sklearn/tests/test_common.py
@@ -191,7 +191,7 @@ def test_import_all_consistency():
        if ".tests." in modname:
            continue
        if IS_PYPY and ('_svmlight_format' in modname or
-                        'feature_extraction._hashing' in modname):
+                        'feature_extraction._hashing_fast' in modname):
            continue
        package = __import__(modname, fromlist="dummy")
        for name in getattr(package, '__all__', ()):
2 changes: 1 addition & 1 deletion sklearn/tests/test_docstring_parameters.py
@@ -132,7 +132,7 @@ def test_tabs():
                                                  prefix='sklearn.'):

        if IS_PYPY and ('_svmlight_format' in modname or
-                        'feature_extraction._hashing' in modname):
+                        'feature_extraction._hashing_fast' in modname):
            continue

        # because we don't import
2 changes: 1 addition & 1 deletion sklearn/utils/testing.py
@@ -512,7 +512,7 @@ def is_abstract(c):
        if ".tests." in modname or "externals" in modname:
            continue
        if IS_PYPY and ('_svmlight_format' in modname or
-                        'feature_extraction._hashing' in modname):
+                        'feature_extraction._hashing_fast' in modname):
            continue
        # Ignore deprecation warnings triggered at import time.
        with ignore_warnings(category=DeprecationWarning):