MNT Make modules private in feature_extraction #15321

Merged
4 changes: 4 additions & 0 deletions .gitignore
@@ -130,6 +130,10 @@ sklearn/svm/libsvm.py
sklearn/svm/libsvm_sparse.py
sklearn/svm/liblinear.py

+sklearn/feature_extraction/dict_vectorizer.py
+sklearn/feature_extraction/hashing.py
+sklearn/feature_extraction/stop_words.py
+
sklearn/linear_model/base.py
sklearn/linear_model/bayes.py
sklearn/linear_model/cd_fast.py
7 changes: 7 additions & 0 deletions sklearn/_build_utils/deprecated_modules.py
@@ -86,6 +86,13 @@
     'set_verbosity_wrap'),
    ('_liblinear', 'sklearn.svm.liblinear', 'sklearn.svm', 'train_wrap'),

+    ('_dict_vectorizer', 'sklearn.feature_extraction.dict_vectorizer',
+     'sklearn.feature_extraction', 'DictVectorizer'),
+    ('_hashing', 'sklearn.feature_extraction.hashing',
+     'sklearn.feature_extraction', 'FeatureHasher'),
+    ('_stop_words', 'sklearn.feature_extraction.stop_words',
+     'sklearn.feature_extraction.text', 'ENGLISH_STOP_WORDS'),
+
    ('_base', 'sklearn.linear_model.base', 'sklearn.linear_model',
     'LinearRegression'),
    ('_cd_fast', 'sklearn.linear_model.cd_fast', 'sklearn.linear_model',
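Each tuple registers a shim for an old public module path: the new private module, the deprecated path, the path users should import from instead, and one public name used to check the re-export. The shims themselves are generated at build time, which is why the corresponding .py files are git-ignored above. A rough sketch of the pattern such a generated shim follows — illustrative only; the actual generated code, warning category, and message will differ:

    # sklearn/feature_extraction/dict_vectorizer.py -- illustrative shim,
    # not the real generated file.
    import warnings

    warnings.warn(
        "The sklearn.feature_extraction.dict_vectorizer module is deprecated; "
        "import DictVectorizer from sklearn.feature_extraction instead.",
        FutureWarning,
    )

    from ._dict_vectorizer import *  # noqa  re-export the private module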
4 changes: 2 additions & 2 deletions sklearn/feature_extraction/__init__.py
@@ -4,8 +4,8 @@
images.
"""

-from .dict_vectorizer import DictVectorizer
-from .hashing import FeatureHasher
+from ._dict_vectorizer import DictVectorizer
+from ._hashing import FeatureHasher
from .image import img_to_graph, grid_to_graph
from . import text

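Note the user-facing contract: the module files become private, but the public names are still re-exported from the package, so the supported imports are unchanged. For example, both of these continue to work on this branch:

    from sklearn.feature_extraction import DictVectorizer, FeatureHasher
    from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS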
@@ -10,7 +10,7 @@
from ..base import BaseEstimator, TransformerMixin

if not IS_PYPY:
-    from ._hashing import transform as _hashing_transform
+    from ._hashing_fast import transform as _hashing_transform
else:
    def _hashing_transform(*args, **kwargs):
        raise NotImplementedError(
50 changes: 45 additions & 5 deletions sklearn/feature_extraction/image.py
@@ -15,7 +15,7 @@
from scipy import sparse
from numpy.lib.stride_tricks import as_strided

-from ..utils import check_array, check_random_state
+from ..utils import check_array, check_random_state, deprecated
from ..base import BaseEstimator

__all__ = ['PatchExtractor',
@@ -241,7 +241,7 @@ def _compute_n_patches(i_h, i_w, p_h, p_w, max_patches=None):
    return all_patches


-def extract_patches(arr, patch_shape=8, extraction_step=1):
+def _extract_patches(arr, patch_shape=8, extraction_step=1):
    """Extracts patches of any n-dimensional array in place using strides.

    Given an n-dimensional array it will return a 2n-dimensional array with
@@ -298,6 +298,46 @@ def extract_patches(arr, patch_shape=8, extraction_step=1):
    patches = as_strided(arr, shape=shape, strides=strides)
    return patches

+@deprecated("The function feature_extraction.image.extract_patches has been "
+            "deprecated in 0.22 and will be removed in 0.24.")
+def extract_patches(arr, patch_shape=8, extraction_step=1):
+    """Extracts patches of any n-dimensional array in place using strides.
+
+    Given an n-dimensional array it will return a 2n-dimensional array with
+    the first n dimensions indexing patch position and the last n indexing
+    the patch content. This operation is immediate (O(1)). A reshape
+    performed on the first n dimensions will cause numpy to copy data, leading
+    to a list of extracted patches.
+
+    Read more in the :ref:`User Guide <image_feature_extraction>`.
+
+    Parameters
+    ----------
+    arr : ndarray
+        n-dimensional array of which patches are to be extracted
+
+    patch_shape : integer or tuple of length arr.ndim
+        Indicates the shape of the patches to be extracted. If an
+        integer is given, the shape will be a hypercube of
+        sidelength given by its value.
+
+    extraction_step : integer or tuple of length arr.ndim
+        Indicates step size at which extraction shall be performed.
+        If integer is given, then the step is uniform in all dimensions.
+
+
+    Returns
+    -------
+    patches : strided ndarray
+        2n-dimensional array indexing patches on first n dimensions and
+        containing patches on the last n dimensions. These dimensions
+        are fake, but this way no data is copied. A simple reshape invokes
+        a copying operation to obtain a list of patches:
+        result.reshape([-1] + list(patch_shape))
+    """
+    return _extract_patches(arr, patch_shape=patch_shape,
+                            extraction_step=extraction_step)
+

def extract_patches_2d(image, patch_size, max_patches=None, random_state=None):
    """Reshape a 2D image into a collection of patches
@@ -373,9 +413,9 @@ def extract_patches_2d(image, patch_size, max_patches=None, random_state=None):
    image = image.reshape((i_h, i_w, -1))
    n_colors = image.shape[-1]

-    extracted_patches = extract_patches(image,
-                                        patch_shape=(p_h, p_w, n_colors),
-                                        extraction_step=1)
+    extracted_patches = _extract_patches(image,
+                                         patch_shape=(p_h, p_w, n_colors),
+                                         extraction_step=1)

    n_patches = _compute_n_patches(i_h, i_w, p_h, p_w, max_patches)
    if max_patches:
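A quick way to see the resulting behavior, mirroring the test added below in test_image.py (a sketch assuming this branch is installed; the array and sizes are arbitrary):

    import numpy as np
    import pytest
    from sklearn.feature_extraction.image import extract_patches, _extract_patches

    arr = np.arange(64).reshape(8, 8)

    # The private helper works exactly as before, with no warning.
    patches = _extract_patches(arr, patch_shape=4, extraction_step=2)
    assert patches.shape == (3, 3, 4, 4)  # (8 - 4) // 2 + 1 = 3 positions per axis

    # The old public name still works, but now emits a DeprecationWarning.
    with pytest.warns(DeprecationWarning):
        extract_patches(arr, patch_shape=4, extraction_step=2)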
4 changes: 2 additions & 2 deletions sklearn/feature_extraction/setup.py
@@ -12,8 +12,8 @@ def configuration(parent_package='', top_path=None):
        libraries.append('m')

    if platform.python_implementation() != 'PyPy':
-        config.add_extension('_hashing',
-                             sources=['_hashing.pyx'],
+        config.add_extension('_hashing_fast',
+                             sources=['_hashing_fast.pyx'],
                             include_dirs=[numpy.get_include()],
                             libraries=libraries)
    config.add_subpackage("tests")
2 changes: 1 addition & 1 deletion sklearn/feature_extraction/tests/test_feature_hasher.py
@@ -48,7 +48,7 @@ def test_feature_hasher_strings():
def test_hashing_transform_seed():
    # check the influence of the seed when computing the hashes
    # import is here to avoid importing on pypy
-    from sklearn.feature_extraction._hashing import (
+    from sklearn.feature_extraction._hashing_fast import (
        transform as _hashing_transform)
    raw_X = [["foo", "bar", "baz", "foo".encode("ascii")],
             ["bar".encode("ascii"), "baz", "quux"]]
17 changes: 13 additions & 4 deletions sklearn/feature_extraction/tests/test_image.py
@@ -10,7 +10,8 @@

from sklearn.feature_extraction.image import (
    img_to_graph, grid_to_graph, extract_patches_2d,
-    reconstruct_from_patches_2d, PatchExtractor, extract_patches)
+    reconstruct_from_patches_2d, PatchExtractor, _extract_patches,
+    extract_patches)
from sklearn.utils.testing import ignore_warnings


@@ -303,8 +304,8 @@ def test_extract_patches_strided():
         last_patch) in zip(image_shapes, patch_sizes, patch_steps,
                            expected_views, last_patches):
        image = np.arange(np.prod(image_shape)).reshape(image_shape)
-        patches = extract_patches(image, patch_shape=patch_size,
-                                  extraction_step=patch_step)
+        patches = _extract_patches(image, patch_shape=patch_size,
+                                   extraction_step=patch_step)

        ndim = len(image_shape)

@@ -321,7 +322,7 @@ def test_extract_patches_square():
    i_h, i_w = face.shape
    p = 8
    expected_n_patches = ((i_h - p + 1), (i_w - p + 1))
-    patches = extract_patches(face, patch_shape=p)
+    patches = _extract_patches(face, patch_shape=p)
    assert patches.shape == (expected_n_patches[0],
                             expected_n_patches[1], p, p)

@@ -333,3 +334,11 @@ def test_width_patch():
    extract_patches_2d(x, (4, 1))
    with pytest.raises(ValueError):
        extract_patches_2d(x, (1, 4))
+
+
+# TODO: Remove in 0.24
+def test_extract_patches_deprecated():
+    msg = ("The function feature_extraction.image.extract_patches has been "
+           "deprecated in 0.22 and will be removed in 0.24.")
+    with pytest.warns(DeprecationWarning, match=msg):
+        extract_patches(downsampled_face)
12 changes: 12 additions & 0 deletions sklearn/feature_extraction/tests/test_text.py
@@ -14,6 +14,7 @@
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.feature_extraction.text import VectorizerMixin

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

@@ -1358,3 +1359,14 @@ def test_unused_parameters_warn(Vectorizer, stop_words,
    )
    with pytest.warns(UserWarning, match=msg):
        vect.fit(train_data)
+
+
+# TODO: Remove in 0.24
+def test_vectorizermixin_is_deprecated():
+    class MyVectorizer(VectorizerMixin):
+        pass
+
+    msg = ("VectorizerMixin is deprecated in version 0.22 and will be removed "
+           "in version 0.24.")
+    with pytest.warns(DeprecationWarning, match=msg):
+        MyVectorizer()
18 changes: 12 additions & 6 deletions sklearn/feature_extraction/text.py
@@ -27,10 +27,10 @@

from ..base import BaseEstimator, TransformerMixin
from ..preprocessing import normalize
-from .hashing import FeatureHasher
-from .stop_words import ENGLISH_STOP_WORDS
+from ._hashing import FeatureHasher
+from ._stop_words import ENGLISH_STOP_WORDS
from ..utils.validation import check_is_fitted, check_array, FLOAT_DTYPES
-from ..utils import _IS_32BIT
+from ..utils import _IS_32BIT, deprecated
from ..utils.fixes import _astype_copy_false
from ..exceptions import ChangedBehaviorWarning, NotFittedError

@@ -184,7 +184,7 @@ def _check_stop_list(stop):
    return frozenset(stop)


-class VectorizerMixin:
+class _VectorizerMixin:
    """Provides common code for text vectorizers (tokenization logic)."""

    _white_spaces = re.compile(r"\s\s+")
@@ -500,7 +500,13 @@ def _warn_for_unused_params(self):
                          " since 'analyzer' != 'word'")


-class HashingVectorizer(TransformerMixin, VectorizerMixin, BaseEstimator):
+@deprecated("VectorizerMixin is deprecated in version "
+            "0.22 and will be removed in version 0.24.")
+class VectorizerMixin(_VectorizerMixin):
+    pass
+
+
+class HashingVectorizer(TransformerMixin, _VectorizerMixin, BaseEstimator):
    """Convert a collection of text documents to a matrix of token occurrences

    It turns a collection of text documents into a scipy.sparse matrix holding
@@ -790,7 +796,7 @@ def _document_frequency(X):
    return np.diff(X.indptr)


-class CountVectorizer(VectorizerMixin, BaseEstimator):
+class CountVectorizer(_VectorizerMixin, BaseEstimator):
    """Convert a collection of text documents to a matrix of token counts

    This implementation produces a sparse representation of the counts using
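Unlike the module renames above, VectorizerMixin is kept importable through a deprecated subclass shim. Because the deprecated decorator wraps instantiation, defining a subclass of the old name stays silent and the warning only fires when an instance is created, which is exactly what the new test in test_text.py asserts. A short sketch of that behavior (assuming this branch is installed):

    import pytest
    from sklearn.feature_extraction.text import VectorizerMixin

    class MyVectorizer(VectorizerMixin):  # subclass definition: no warning
        pass

    with pytest.warns(DeprecationWarning, match="VectorizerMixin is deprecated"):
        MyVectorizer()  # instantiation: the warning fires here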
2 changes: 1 addition & 1 deletion sklearn/tests/test_common.py
@@ -191,7 +191,7 @@ def test_import_all_consistency():
        if ".tests." in modname:
            continue
        if IS_PYPY and ('_svmlight_format' in modname or
-                        'feature_extraction._hashing' in modname):
+                        'feature_extraction._hashing_fast' in modname):
            continue
        package = __import__(modname, fromlist="dummy")
        for name in getattr(package, '__all__', ()):
2 changes: 1 addition & 1 deletion sklearn/tests/test_docstring_parameters.py
@@ -132,7 +132,7 @@ def test_tabs():
                                                  prefix='sklearn.'):

        if IS_PYPY and ('_svmlight_format' in modname or
-                        'feature_extraction._hashing' in modname):
+                        'feature_extraction._hashing_fast' in modname):
            continue

        # because we don't import
2 changes: 1 addition & 1 deletion sklearn/utils/testing.py
@@ -512,7 +512,7 @@ def is_abstract(c):
        if ".tests." in modname or "externals" in modname:
            continue
        if IS_PYPY and ('_svmlight_format' in modname or
-                        'feature_extraction._hashing' in modname):
+                        'feature_extraction._hashing_fast' in modname):
            continue
        # Ignore deprecation warnings triggered at import time.
        with ignore_warnings(category=DeprecationWarning):