Skip to content
Please note that GitHub no longer supports Internet Explorer.

We recommend upgrading to the latest Microsoft Edge, Google Chrome, or Firefox.

Learn more
Permalink
Browse files

FIX an issue w/ large sparse matrix indices in CountVectorizer (#11295)

  • Loading branch information
gvacaliuc authored and jnothman committed Jan 30, 2019
1 parent fdf2f38 commit 5fc5c6e62e163a20b890b64cb1efa8ed151bbc18
Showing with 45 additions and 8 deletions.
  1. +8 −0 doc/whats_new/v0.20.rst
  2. +31 −1 sklearn/feature_extraction/tests/test_text.py
  3. +6 −7 sklearn/feature_extraction/text.py
@@ -51,6 +51,14 @@ Changelog
combination with ``handle_unknown='ignore'``.
:issue:`12881` by `Joris Van den Bossche`_.

:mod:`sklearn.feature_extraction.text`
......................................

- |Fix| Fixed a bug in :class:`feature_extraction.text.CountVectorizer` which
would result in the sparse feature matrix having `indptr` and `indices`
arrays of mismatched integer precision for very large vocabularies.
:issue:`11295` by :user:`Gabriel Vacaliuc <gvacaliuc>`.

.. _changes_0_20_2:

Version 0.20.2
@@ -36,7 +36,8 @@
assert_warns_message, assert_raise_message,
clean_warning_registry, ignore_warnings,
SkipTest, assert_raises, assert_no_warnings,
fails_if_pypy, assert_allclose_dense_sparse)
fails_if_pypy, assert_allclose_dense_sparse,
skip_if_32bit)
from collections import defaultdict
from functools import partial
import pickle
@@ -1144,6 +1145,35 @@ def test_vectorizer_stop_words_inconsistent():
['hello world'])


@skip_if_32bit
def test_countvectorizer_sort_features_64bit_sparse_indices():
    """Check that ``_sort_features`` keeps 64-bit sparse indices.

    ``CountVectorizer._sort_features`` is given a CSR matrix whose index
    arrays are int64, and the ``indices`` array of the returned matrix must
    still be int64.

    This test is skipped on 32bit platforms, see:
    https://github.com/scikit-learn/scikit-learn/pull/11295
    for more details.
    """
    index_dtype = np.int64

    # Empty 5x5 CSR matrix; both index arrays are explicitly cast to int64.
    X = sparse.csr_matrix((5, 5), dtype=np.int64)
    X.indices = X.indices.astype(index_dtype)
    X.indptr = X.indptr.astype(index_dtype)

    vocabulary = {"scikit-learn": 0, "is": 1, "great!": 2}

    sorted_X = CountVectorizer()._sort_features(X, vocabulary)

    assert sorted_X.indices.dtype == index_dtype


@fails_if_pypy
@pytest.mark.parametrize('Estimator',
[CountVectorizer, TfidfVectorizer, HashingVectorizer])
@@ -31,6 +31,7 @@
from .stop_words import ENGLISH_STOP_WORDS
from ..utils.validation import check_is_fitted, check_array, FLOAT_DTYPES
from ..utils.fixes import sp_version
from ..utils import _IS_32BIT


__all__ = ['HashingVectorizer',
@@ -871,7 +872,7 @@ def _sort_features(self, X, vocabulary):
Returns a reordered matrix and modifies the vocabulary in place
"""
sorted_features = sorted(vocabulary.items())
map_index = np.empty(len(sorted_features), dtype=np.int32)
map_index = np.empty(len(sorted_features), dtype=X.indices.dtype)
for new_val, (term, old_val) in enumerate(sorted_features):
vocabulary[term] = new_val
map_index[old_val] = new_val
@@ -961,14 +962,12 @@ def _count_vocab(self, raw_documents, fixed_vocab):
" contain stop words")

if indptr[-1] > 2147483648: # = 2**31 (NOTE: the int32 maximum is 2**31 - 1)
if sp_version >= (0, 14):
indices_dtype = np.int64
else:
if _IS_32BIT:
raise ValueError(('sparse CSR array has {} non-zero '
'elements and requires 64 bit indexing, '
' which is unsupported with scipy {}. '
'Please upgrade to scipy >=0.14')
.format(indptr[-1], '.'.join(sp_version)))
'which is unsupported with 32 bit Python.')
.format(indptr[-1]))
indices_dtype = np.int64

else:
indices_dtype = np.int32

0 comments on commit 5fc5c6e

Please sign in to comment.
You can’t perform that action at this time.