# -*- coding: utf-8 -*-
# Authors: Olivier Grisel <olivier.grisel@ensta.org>
#          Mathieu Blondel <mathieu@mblondel.org>
#          Lars Buitinck
#          Robert Layton <robertlayton@gmail.com>
#          Jochen Wersdörfer <jochen@wersdoerfer.de>
#          Roman Sinayev <roman.sinayev@gmail.com>
#
# License: BSD 3 clause
"""
The :mod:`sklearn.feature_extraction.text` submodule gathers utilities to
build feature vectors from text documents.
"""
import array
from collections import defaultdict
from collections.abc import Mapping
from functools import partial
import numbers
from operator import itemgetter
import re
import unicodedata
import warnings

import numpy as np
import scipy.sparse as sp

from ..base import BaseEstimator, TransformerMixin
from ..preprocessing import normalize
from ._hashing import FeatureHasher
from ._stop_words import ENGLISH_STOP_WORDS
from ..utils.validation import check_is_fitted, check_array, FLOAT_DTYPES
from ..utils import _IS_32BIT, deprecated
from ..utils.fixes import _astype_copy_false
from ..exceptions import ChangedBehaviorWarning, NotFittedError

__all__ = ['HashingVectorizer',
           'CountVectorizer',
           'ENGLISH_STOP_WORDS',
           'TfidfTransformer',
           'TfidfVectorizer',
           'strip_accents_ascii',
           'strip_accents_unicode',
           'strip_tags']


def _preprocess(doc, accent_function=None, lower=False):
    """Chain together an optional series of text preprocessing steps to
    apply to a document.

    Parameters
    ----------
    doc: str
        The string to preprocess
    accent_function: callable
        Function for handling accented characters. Common strategies include
        normalizing and removing.
    lower: bool
        Whether to use str.lower to lowercase all of the text

    Returns
    -------
    doc: str
        preprocessed string
    """
    if lower:
        doc = doc.lower()
    if accent_function is not None:
        doc = accent_function(doc)
    return doc
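
# A small illustration of the chaining above (expected values reasoned from
# the code, not captured from a live session):
#
#     >>> _preprocess('Café Noir', accent_function=strip_accents_ascii,
#     ...             lower=True)
#     'cafe noir'
#
# Lowercasing runs first, then the accent handler.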


def _analyze(doc, analyzer=None, tokenizer=None, ngrams=None,
             preprocessor=None, decoder=None, stop_words=None):
    """Chain together an optional series of text processing steps to go from
    a single document to ngrams, with or without tokenizing or preprocessing.

    If analyzer is used, only the decoder argument is used, as the analyzer is
    intended to replace the preprocessor, tokenizer, and ngrams steps.

    Parameters
    ----------
    analyzer: callable
    tokenizer: callable
    ngrams: callable
    preprocessor: callable
    decoder: callable
    stop_words: list

    Returns
    -------
    ngrams: list
        A sequence of tokens, possibly with pairs, triples, etc.
    """
    if decoder is not None:
        doc = decoder(doc)
    if analyzer is not None:
        doc = analyzer(doc)
    else:
        if preprocessor is not None:
            doc = preprocessor(doc)
        if tokenizer is not None:
            doc = tokenizer(doc)
        if ngrams is not None:
            if stop_words is not None:
                doc = ngrams(doc, stop_words)
            else:
                doc = ngrams(doc)
    return doc


def strip_accents_unicode(s):
    """Transform accented unicode symbols into their simple counterpart

    Warning: the python-level loop and join operations make this
    implementation 20 times slower than the strip_accents_ascii basic
    normalization.

    Parameters
    ----------
    s : string
        The string to strip

    See Also
    --------
    strip_accents_ascii
        Remove accented characters for any unicode symbol that has a direct
        ASCII equivalent.
    """
    try:
        # If `s` is ASCII-compatible, then it does not contain any accented
        # characters and we can avoid an expensive list comprehension
        s.encode("ASCII", errors="strict")
        return s
    except UnicodeEncodeError:
        normalized = unicodedata.normalize('NFKD', s)
        return ''.join([c for c in normalized if not unicodedata.combining(c)])
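
# For instance (illustrative; the result follows from NFKD decomposition,
# which splits each accented letter into a base letter plus a combining mark):
#
#     >>> strip_accents_unicode('àéîõü')
#     'aeiou'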


def strip_accents_ascii(s):
    """Transform accented unicode symbols into ASCII or nothing

    Warning: this solution is only suited for languages that have a direct
    transliteration to ASCII symbols.

    Parameters
    ----------
    s : string
        The string to strip

    See Also
    --------
    strip_accents_unicode
        Remove accented characters for any unicode symbol.
    """
    nkfd_form = unicodedata.normalize('NFKD', s)
    return nkfd_form.encode('ASCII', 'ignore').decode('ASCII')
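
# For instance (illustrative): 'àéîõü' becomes 'aeiou', while a symbol with
# no ASCII decomposition is dropped entirely:
#
#     >>> strip_accents_ascii('ß')
#     ''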


def strip_tags(s):
    """Basic regexp based HTML / XML tag stripper function

    For serious HTML/XML preprocessing you should rather use an external
    library such as lxml or BeautifulSoup.

    Parameters
    ----------
    s : string
        The string to strip
    """
    return re.compile(r"<([^>]+)>", flags=re.UNICODE).sub(" ", s)
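
# Each tag is replaced by a single space, so stripped markup can leave runs
# of extra whitespace behind (illustrative):
#
#     >>> strip_tags('<p>Hello <b>world</b></p>')
#     ' Hello  world  '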


def _check_stop_list(stop):
    if stop == "english":
        return ENGLISH_STOP_WORDS
    elif isinstance(stop, str):
        raise ValueError("not a built-in stop list: %s" % stop)
    elif stop is None:
        return None
    else:  # assume it's a collection
        return frozenset(stop)


class _VectorizerMixin:
    """Provides common code for text vectorizers (tokenization logic)."""

    _white_spaces = re.compile(r"\s\s+")

    def decode(self, doc):
        """Decode the input into a string of unicode symbols.

        The decoding strategy depends on the vectorizer parameters.

        Parameters
        ----------
        doc : str
            The string to decode.

        Returns
        -------
        doc: str
            A string of unicode symbols.
        """
        if self.input == 'filename':
            with open(doc, 'rb') as fh:
                doc = fh.read()

        elif self.input == 'file':
            doc = doc.read()

        if isinstance(doc, bytes):
            doc = doc.decode(self.encoding, self.decode_error)

        if doc is np.nan:
            raise ValueError("np.nan is an invalid document, expected byte or "
                             "unicode string.")

        return doc

    def _word_ngrams(self, tokens, stop_words=None):
        """Turn tokens into a sequence of n-grams after stop words filtering"""
        # handle stop words
        if stop_words is not None:
            tokens = [w for w in tokens if w not in stop_words]

        # handle token n-grams
        min_n, max_n = self.ngram_range
        if max_n != 1:
            original_tokens = tokens
            if min_n == 1:
                # no need to do any slicing for unigrams
                # just iterate through the original tokens
                tokens = list(original_tokens)
                min_n += 1
            else:
                tokens = []

            n_original_tokens = len(original_tokens)

            # bind method outside of loop to reduce overhead
            tokens_append = tokens.append
            space_join = " ".join

            for n in range(min_n,
                           min(max_n + 1, n_original_tokens + 1)):
                for i in range(n_original_tokens - n + 1):
                    tokens_append(space_join(original_tokens[i: i + n]))

        return tokens
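
    # Illustrative behaviour (assuming ngram_range=(1, 2)): for the tokens
    # ['the', 'quick', 'fox'] this returns the unigrams followed by the
    # bigrams:
    #
    #     ['the', 'quick', 'fox', 'the quick', 'quick fox']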

    def _char_ngrams(self, text_document):
        """Tokenize text_document into a sequence of character n-grams"""
        # normalize white spaces
        text_document = self._white_spaces.sub(" ", text_document)

        text_len = len(text_document)
        min_n, max_n = self.ngram_range
        if min_n == 1:
            # no need to do any slicing for unigrams
            # iterate through the string
            ngrams = list(text_document)
            min_n += 1
        else:
            ngrams = []

        # bind method outside of loop to reduce overhead
        ngrams_append = ngrams.append

        for n in range(min_n, min(max_n + 1, text_len + 1)):
            for i in range(text_len - n + 1):
                ngrams_append(text_document[i: i + n])
        return ngrams
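
    # Illustrative behaviour (assuming ngram_range=(2, 3)): 'abc' yields
    # ['ab', 'bc', 'abc'] -- all contiguous slices of length 2, then 3.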

    def _char_wb_ngrams(self, text_document):
        """Whitespace sensitive char-n-gram tokenization.

        Tokenize text_document into a sequence of character n-grams
        operating only inside word boundaries. n-grams at the edges
        of words are padded with space."""
        # normalize white spaces
        text_document = self._white_spaces.sub(" ", text_document)

        min_n, max_n = self.ngram_range
        ngrams = []

        # bind method outside of loop to reduce overhead
        ngrams_append = ngrams.append

        for w in text_document.split():
            w = ' ' + w + ' '
            w_len = len(w)
            for n in range(min_n, max_n + 1):
                offset = 0
                ngrams_append(w[offset:offset + n])
                while offset + n < w_len:
                    offset += 1
                    ngrams_append(w[offset:offset + n])
                if offset == 0:  # count a short word (w_len < n) only once
                    break
        return ngrams
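
    # Illustrative behaviour (assuming ngram_range=(3, 3)): 'hi there' is
    # split into the padded words ' hi ' and ' there ', giving
    # [' hi', 'hi ', ' th', 'the', 'her', 'ere', 're '].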

    def build_preprocessor(self):
        """Return a function to preprocess the text before tokenization.

        Returns
        -------
        preprocessor: callable
            A function to preprocess the text before tokenization.
        """
        if self.preprocessor is not None:
            return self.preprocessor

        # accent stripping
        if not self.strip_accents:
            strip_accents = None
        elif callable(self.strip_accents):
            strip_accents = self.strip_accents
        elif self.strip_accents == 'ascii':
            strip_accents = strip_accents_ascii
        elif self.strip_accents == 'unicode':
            strip_accents = strip_accents_unicode
        else:
            raise ValueError('Invalid value for "strip_accents": %s' %
                             self.strip_accents)

        return partial(
            _preprocess, accent_function=strip_accents, lower=self.lowercase
        )

    def build_tokenizer(self):
        """Return a function that splits a string into a sequence of tokens.

        Returns
        -------
        tokenizer: callable
            A function to split a string into a sequence of tokens.
        """
        if self.tokenizer is not None:
            return self.tokenizer
        token_pattern = re.compile(self.token_pattern)
        return token_pattern.findall
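
    # With the default token_pattern r"(?u)\b\w\w+\b", the returned tokenizer
    # keeps only runs of two or more word characters (illustrative):
    #
    #     >>> re.compile(r"(?u)\b\w\w+\b").findall("I can't stop!")
    #     ['can', 'stop']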

    def get_stop_words(self):
        """Build or fetch the effective stop words list.

        Returns
        -------
        stop_words: list or None
            A list of stop words.
        """
        return _check_stop_list(self.stop_words)

    def _check_stop_words_consistency(self, stop_words, preprocess, tokenize):
        """Check if stop words are consistent

        Returns
        -------
        is_consistent : True if stop words are consistent with the preprocessor
                        and tokenizer, False if they are not, None if the check
                        was previously performed, "error" if it could not be
                        performed (e.g. because of the use of a custom
                        preprocessor / tokenizer)
        """
        if id(self.stop_words) == getattr(self, '_stop_words_id', None):
            # Stop words were previously validated
            return None

        # NB: stop_words is validated, unlike self.stop_words
        try:
            inconsistent = set()
            for w in stop_words or ():
                tokens = list(tokenize(preprocess(w)))
                for token in tokens:
                    if token not in stop_words:
                        inconsistent.add(token)
            self._stop_words_id = id(self.stop_words)

            if inconsistent:
                warnings.warn('Your stop_words may be inconsistent with '
                              'your preprocessing. Tokenizing the stop '
                              'words generated tokens %r not in '
                              'stop_words.' % sorted(inconsistent))
            return not inconsistent
        except Exception:
            # Failed to check stop words consistency (e.g. because a custom
            # preprocessor or tokenizer was used)
            self._stop_words_id = id(self.stop_words)
            return 'error'

    def _validate_custom_analyzer(self):
        # This is to check if the given custom analyzer expects file or a
        # filename instead of data.
        # Behavior changed in v0.21, function could be removed in v0.23
        import tempfile
        with tempfile.NamedTemporaryFile() as f:
            fname = f.name
        # now we're sure fname doesn't exist

        msg = ("Since v0.21, vectorizers pass the data to the custom analyzer "
               "and not the file names or the file objects. This warning "
               "will be removed in v0.23.")
        try:
            self.analyzer(fname)
        except FileNotFoundError:
            warnings.warn(msg, ChangedBehaviorWarning)
        except AttributeError as e:
            if str(e) == "'str' object has no attribute 'read'":
                warnings.warn(msg, ChangedBehaviorWarning)
        except Exception:
            pass

    def build_analyzer(self):
        """Return a callable that handles preprocessing, tokenization
        and n-grams generation.

        Returns
        -------
        analyzer: callable
            A function to handle preprocessing, tokenization
            and n-grams generation.
        """
        if callable(self.analyzer):
            if self.input in ['file', 'filename']:
                self._validate_custom_analyzer()
            return partial(
                _analyze, analyzer=self.analyzer, decoder=self.decode
            )

        preprocess = self.build_preprocessor()

        if self.analyzer == 'char':
            return partial(_analyze, ngrams=self._char_ngrams,
                           preprocessor=preprocess, decoder=self.decode)

        elif self.analyzer == 'char_wb':
            return partial(_analyze, ngrams=self._char_wb_ngrams,
                           preprocessor=preprocess, decoder=self.decode)

        elif self.analyzer == 'word':
            stop_words = self.get_stop_words()
            tokenize = self.build_tokenizer()
            self._check_stop_words_consistency(stop_words, preprocess,
                                               tokenize)
            return partial(_analyze, ngrams=self._word_ngrams,
                           tokenizer=tokenize, preprocessor=preprocess,
                           decoder=self.decode, stop_words=stop_words)

        else:
            raise ValueError('%s is not a valid tokenization scheme/analyzer' %
                             self.analyzer)

    def _validate_vocabulary(self):
        vocabulary = self.vocabulary
        if vocabulary is not None:
            if isinstance(vocabulary, set):
                vocabulary = sorted(vocabulary)
            if not isinstance(vocabulary, Mapping):
                vocab = {}
                for i, t in enumerate(vocabulary):
                    if vocab.setdefault(t, i) != i:
                        msg = "Duplicate term in vocabulary: %r" % t
                        raise ValueError(msg)
                vocabulary = vocab
            else:
                indices = set(vocabulary.values())
                if len(indices) != len(vocabulary):
                    raise ValueError("Vocabulary contains repeated indices.")
                for i in range(len(vocabulary)):
                    if i not in indices:
                        msg = ("Vocabulary of size %d doesn't contain index "
                               "%d." % (len(vocabulary), i))
                        raise ValueError(msg)
            if not vocabulary:
                raise ValueError("empty vocabulary passed to fit")
            self.fixed_vocabulary_ = True
            self.vocabulary_ = dict(vocabulary)
        else:
            self.fixed_vocabulary_ = False
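
    # Illustrative consequence of the checks above: a user-supplied mapping
    # must have contiguous indices starting at 0, so {'a': 0, 'b': 2} raises
    # "Vocabulary of size 2 doesn't contain index 1."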

    def _check_vocabulary(self):
        """Check if vocabulary is empty or missing (not fitted)"""
        if not hasattr(self, 'vocabulary_'):
            self._validate_vocabulary()
            if not self.fixed_vocabulary_:
                raise NotFittedError("Vocabulary not fitted or provided")

        if len(self.vocabulary_) == 0:
            raise ValueError("Vocabulary is empty")

    def _validate_params(self):
        """Check validity of ngram_range parameter"""
        min_n, max_n = self.ngram_range
        if min_n > max_n:
            raise ValueError(
                "Invalid value for ngram_range=%s "
                "lower boundary larger than the upper boundary."
                % str(self.ngram_range))

    def _warn_for_unused_params(self):

        if self.tokenizer is not None and self.token_pattern is not None:
            warnings.warn("The parameter 'token_pattern' will not be used"
                          " since 'tokenizer' is not None")

        if self.preprocessor is not None and callable(self.analyzer):
            warnings.warn("The parameter 'preprocessor' will not be used"
                          " since 'analyzer' is callable")

        if (self.ngram_range != (1, 1) and self.ngram_range is not None
                and callable(self.analyzer)):
            warnings.warn("The parameter 'ngram_range' will not be used"
                          " since 'analyzer' is callable")
        if self.analyzer != 'word' or callable(self.analyzer):
            if self.stop_words is not None:
                warnings.warn("The parameter 'stop_words' will not be used"
                              " since 'analyzer' != 'word'")
            if self.token_pattern is not None and \
               self.token_pattern != r"(?u)\b\w\w+\b":
                warnings.warn("The parameter 'token_pattern' will not be used"
                              " since 'analyzer' != 'word'")
            if self.tokenizer is not None:
                warnings.warn("The parameter 'tokenizer' will not be used"
                              " since 'analyzer' != 'word'")


@deprecated("VectorizerMixin is deprecated in version "
            "0.22 and will be removed in version 0.24.")
class VectorizerMixin(_VectorizerMixin):
    pass


class HashingVectorizer(TransformerMixin, _VectorizerMixin, BaseEstimator):
    """Convert a collection of text documents to a matrix of token occurrences

    It turns a collection of text documents into a scipy.sparse matrix holding
    token occurrence counts (or binary occurrence information), possibly
    normalized as token frequencies if norm='l1' or projected on the euclidean
    unit sphere if norm='l2'.

    This text vectorizer implementation uses the hashing trick to find the
    token string name to feature integer index mapping.

    This strategy has several advantages:

    - it is very low memory and scalable to large datasets as there is no
      need to store a vocabulary dictionary in memory

    - it is fast to pickle and un-pickle as it holds no state besides the
      constructor parameters

    - it can be used in a streaming (partial fit) or parallel pipeline as there
      is no state computed during fit.

    There are also a couple of cons (vs using a CountVectorizer with an
    in-memory vocabulary):

    - there is no way to compute the inverse transform (from feature indices to
      string feature names) which can be a problem when trying to introspect
      which features are most important to a model.

    - there can be collisions: distinct tokens can be mapped to the same
      feature index. However in practice this is rarely an issue if n_features
      is large enough (e.g. 2 ** 18 for text classification problems).

    - no IDF weighting as this would render the transformer stateful.

    The hash function employed is the signed 32-bit version of Murmurhash3.

    Read more in the :ref:`User Guide <text_feature_extraction>`.

    Parameters
    ----------
    input : string {'filename', 'file', 'content'}
        If 'filename', the sequence passed as an argument to fit is
        expected to be a list of filenames that need reading to fetch
        the raw content to analyze.

        If 'file', the sequence items must have a 'read' method (file-like
        object) that is called to fetch the bytes in memory.

        Otherwise the input is expected to be a sequence of items that
        can be of type string or byte.

    encoding : string, default='utf-8'
        If bytes or files are given to analyze, this encoding is used to
        decode.

    decode_error : {'strict', 'ignore', 'replace'}
        Instruction on what to do if a byte sequence is given to analyze that
        contains characters not of the given `encoding`. By default, it is
        'strict', meaning that a UnicodeDecodeError will be raised. Other
        values are 'ignore' and 'replace'.

    strip_accents : {'ascii', 'unicode', None}
        Remove accents and perform other character normalization
        during the preprocessing step.
        'ascii' is a fast method that only works on characters that have
        a direct ASCII mapping.
        'unicode' is a slightly slower method that works on any characters.
        None (default) does nothing.

        Both 'ascii' and 'unicode' use NFKD normalization from
        :func:`unicodedata.normalize`.

    lowercase : boolean, default=True
        Convert all characters to lowercase before tokenizing.

    preprocessor : callable or None (default)
        Override the preprocessing (string transformation) stage while
        preserving the tokenizing and n-grams generation steps.
        Only applies if ``analyzer is not callable``.

    tokenizer : callable or None (default)
        Override the string tokenization step while preserving the
        preprocessing and n-grams generation steps.
        Only applies if ``analyzer == 'word'``.

    stop_words : string {'english'}, list, or None (default)
        If 'english', a built-in stop word list for English is used.
        There are several known issues with 'english' and you should
        consider an alternative (see :ref:`stop_words`).

        If a list, that list is assumed to contain stop words, all of which
        will be removed from the resulting tokens.
        Only applies if ``analyzer == 'word'``.

    token_pattern : string
        Regular expression denoting what constitutes a "token", only used
        if ``analyzer == 'word'``. The default regexp selects tokens of 2
        or more alphanumeric characters (punctuation is completely ignored
        and always treated as a token separator).

    ngram_range : tuple (min_n, max_n), default=(1, 1)
        The lower and upper boundary of the range of n-values for different
        n-grams to be extracted. All values of n such that min_n <= n <= max_n
        will be used. For example an ``ngram_range`` of ``(1, 1)`` means only
        unigrams, ``(1, 2)`` means unigrams and bigrams, and ``(2, 2)`` means
        only bigrams.
        Only applies if ``analyzer is not callable``.

    analyzer : string, {'word', 'char', 'char_wb'} or callable
        Whether the feature should be made of word or character n-grams.
        Option 'char_wb' creates character n-grams only from text inside
        word boundaries; n-grams at the edges of words are padded with space.

        If a callable is passed it is used to extract the sequence of features
        out of the raw, unprocessed input.

        .. versionchanged:: 0.21

        Since v0.21, if ``input`` is ``filename`` or ``file``, the data is
        first read from the file and then passed to the given callable
        analyzer.

    n_features : integer, default=(2 ** 20)
        The number of features (columns) in the output matrices. Small numbers
        of features are likely to cause hash collisions, but large numbers
        will cause larger coefficient dimensions in linear learners.

    binary : boolean, default=False
        If True, all non zero counts are set to 1. This is useful for discrete
        probabilistic models that model binary events rather than integer
        counts.

    norm : 'l1', 'l2' or None, optional
        Norm used to normalize term vectors. None for no normalization.

    alternate_sign : boolean, optional, default True
        When True, an alternating sign is added to the features as to
        approximately conserve the inner product in the hashed space even for
        small n_features. This approach is similar to sparse random projection.

        .. versionadded:: 0.19

    dtype : type, optional
        Type of the matrix returned by fit_transform() or transform().

    Examples
    --------
    >>> from sklearn.feature_extraction.text import HashingVectorizer
    >>> corpus = [
    ...     'This is the first document.',
    ...     'This document is the second document.',
    ...     'And this is the third one.',
    ...     'Is this the first document?',
    ... ]
    >>> vectorizer = HashingVectorizer(n_features=2**4)
    >>> X = vectorizer.fit_transform(corpus)
    >>> print(X.shape)
    (4, 16)

    See Also
    --------
    CountVectorizer, TfidfVectorizer
    """

    def __init__(self, input='content', encoding='utf-8',
                 decode_error='strict', strip_accents=None,
                 lowercase=True, preprocessor=None, tokenizer=None,
                 stop_words=None, token_pattern=r"(?u)\b\w\w+\b",
                 ngram_range=(1, 1), analyzer='word', n_features=(2 ** 20),
                 binary=False, norm='l2', alternate_sign=True,
                 dtype=np.float64):
        self.input = input
        self.encoding = encoding
        self.decode_error = decode_error
        self.strip_accents = strip_accents
        self.preprocessor = preprocessor
        self.tokenizer = tokenizer
        self.analyzer = analyzer
        self.lowercase = lowercase
        self.token_pattern = token_pattern
        self.stop_words = stop_words
        self.n_features = n_features
        self.ngram_range = ngram_range
        self.binary = binary
        self.norm = norm
        self.alternate_sign = alternate_sign
        self.dtype = dtype

    def partial_fit(self, X, y=None):
        """Does nothing: this transformer is stateless.

        This method is just there to mark the fact that this transformer
        can work in a streaming setup.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            Training data.
        """
        return self

    def fit(self, X, y=None):
        """Does nothing: this transformer is stateless.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            Training data.
        """
        # triggers a parameter validation
        if isinstance(X, str):
            raise ValueError(
                "Iterable over raw text documents expected, "
                "string object received.")

        self._warn_for_unused_params()
        self._validate_params()

        self._get_hasher().fit(X, y=y)
        return self

    def transform(self, X):
        """Transform a sequence of documents to a document-term matrix.

        Parameters
        ----------
        X : iterable over raw text documents, length = n_samples
            Samples. Each sample must be a text document (either bytes or
            unicode strings, file name or file object depending on the
            constructor argument) which will be tokenized and hashed.

        Returns
        -------
        X : sparse matrix of shape (n_samples, n_features)
            Document-term matrix.
        """
        if isinstance(X, str):
            raise ValueError(
                "Iterable over raw text documents expected, "
                "string object received.")

        self._validate_params()

        analyzer = self.build_analyzer()
        X = self._get_hasher().transform(analyzer(doc) for doc in X)

        if self.binary:
            X.data.fill(1)

        if self.norm is not None:
            X = normalize(X, norm=self.norm, copy=False)

        return X

    def fit_transform(self, X, y=None):
        """Transform a sequence of documents to a document-term matrix.

        Parameters
        ----------
        X : iterable over raw text documents, length = n_samples
            Samples. Each sample must be a text document (either bytes or
            unicode strings, file name or file object depending on the
            constructor argument) which will be tokenized and hashed.

        y : any
            Ignored. This parameter exists only for compatibility with
            sklearn.pipeline.Pipeline.

        Returns
        -------
        X : sparse matrix of shape (n_samples, n_features)
            Document-term matrix.
        """
        return self.fit(X, y).transform(X)

    def _get_hasher(self):
        return FeatureHasher(n_features=self.n_features,
                             input_type='string', dtype=self.dtype,
                             alternate_sign=self.alternate_sign)

    def _more_tags(self):
        return {'X_types': ['string']}


def _document_frequency(X):
    """Count the number of non-zero values for each feature in sparse X."""
    if sp.isspmatrix_csr(X):
        return np.bincount(X.indices, minlength=X.shape[1])
    else:
        return np.diff(X.indptr)
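
# For instance (illustrative): for the CSR matrix
#
#     [[1, 0, 2],
#      [0, 0, 1]]
#
# _document_frequency returns array([1, 0, 2]): column 0 is non-zero in one
# document, column 1 in none, and column 2 in both.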


class CountVectorizer(_VectorizerMixin, BaseEstimator):
    """Convert a collection of text documents to a matrix of token counts

    This implementation produces a sparse representation of the counts using
    scipy.sparse.csr_matrix.

    If you do not provide an a-priori dictionary and you do not use an analyzer
    that does some kind of feature selection then the number of features will
    be equal to the vocabulary size found by analyzing the data.

    Read more in the :ref:`User Guide <text_feature_extraction>`.

    Parameters
    ----------
    input : string {'filename', 'file', 'content'}
        If 'filename', the sequence passed as an argument to fit is
        expected to be a list of filenames that need reading to fetch
        the raw content to analyze.

        If 'file', the sequence items must have a 'read' method (file-like
        object) that is called to fetch the bytes in memory.

        Otherwise the input is expected to be a sequence of items that
        can be of type string or byte.

    encoding : string, 'utf-8' by default.
        If bytes or files are given to analyze, this encoding is used to
        decode.

    decode_error : {'strict', 'ignore', 'replace'}
        Instruction on what to do if a byte sequence is given to analyze that
        contains characters not of the given `encoding`. By default, it is
        'strict', meaning that a UnicodeDecodeError will be raised. Other
        values are 'ignore' and 'replace'.

    strip_accents : {'ascii', 'unicode', None}
        Remove accents and perform other character normalization
        during the preprocessing step.
        'ascii' is a fast method that only works on characters that have
        a direct ASCII mapping.
        'unicode' is a slightly slower method that works on any characters.
        None (default) does nothing.

        Both 'ascii' and 'unicode' use NFKD normalization from
        :func:`unicodedata.normalize`.

    lowercase : boolean, True by default
        Convert all characters to lowercase before tokenizing.

    preprocessor : callable or None (default)
        Override the preprocessing (string transformation) stage while
        preserving the tokenizing and n-grams generation steps.
        Only applies if ``analyzer is not callable``.

    tokenizer : callable or None (default)
        Override the string tokenization step while preserving the
        preprocessing and n-grams generation steps.
        Only applies if ``analyzer == 'word'``.

    stop_words : string {'english'}, list, or None (default)
        If 'english', a built-in stop word list for English is used.
        There are several known issues with 'english' and you should
        consider an alternative (see :ref:`stop_words`).

        If a list, that list is assumed to contain stop words, all of which
        will be removed from the resulting tokens.
        Only applies if ``analyzer == 'word'``.

        If None, no stop words will be used. max_df can be set to a value
        in the range [0.7, 1.0) to automatically detect and filter stop
        words based on intra corpus document frequency of terms.

    token_pattern : string
        Regular expression denoting what constitutes a "token", only used
        if ``analyzer == 'word'``. The default regexp selects tokens of 2
        or more alphanumeric characters (punctuation is completely ignored
        and always treated as a token separator).

    ngram_range : tuple (min_n, max_n), default=(1, 1)
        The lower and upper boundary of the range of n-values for different
        word n-grams or char n-grams to be extracted. All values of n such
        that min_n <= n <= max_n will be used. For example an
        ``ngram_range`` of ``(1, 1)`` means only unigrams, ``(1, 2)`` means
        unigrams and bigrams, and ``(2, 2)`` means only bigrams.
        Only applies if ``analyzer is not callable``.

    analyzer : string, {'word', 'char', 'char_wb'} or callable
        Whether the feature should be made of word n-gram or character
        n-grams.
        Option 'char_wb' creates character n-grams only from text inside
        word boundaries; n-grams at the edges of words are padded with space.

        If a callable is passed it is used to extract the sequence of features
        out of the raw, unprocessed input.

        .. versionchanged:: 0.21

        Since v0.21, if ``input`` is ``filename`` or ``file``, the data is
        first read from the file and then passed to the given callable
        analyzer.

    max_df : float in range [0.0, 1.0] or int, default=1.0
        When building the vocabulary ignore terms that have a document
        frequency strictly higher than the given threshold (corpus-specific
        stop words).
        If float, the parameter represents a proportion of documents, integer
        absolute counts.
        This parameter is ignored if vocabulary is not None.

    min_df : float in range [0.0, 1.0] or int, default=1
        When building the vocabulary ignore terms that have a document
        frequency strictly lower than the given threshold. This value is also
        called cut-off in the literature.
        If float, the parameter represents a proportion of documents, integer
        absolute counts.
        This parameter is ignored if vocabulary is not None.

    max_features : int or None, default=None
        If not None, build a vocabulary that only consider the top
        max_features ordered by term frequency across the corpus.

        This parameter is ignored if vocabulary is not None.

    vocabulary : Mapping or iterable, optional
        Either a Mapping (e.g., a dict) where keys are terms and values are
        indices in the feature matrix, or an iterable over terms. If not
        given, a vocabulary is determined from the input documents. Indices
        in the mapping should not be repeated and should not have any gap
        between 0 and the largest index.

    binary : boolean, default=False
        If True, all non zero counts are set to 1. This is useful for discrete
        probabilistic models that model binary events rather than integer
        counts.

    dtype : type, optional
        Type of the matrix returned by fit_transform() or transform().

    Attributes
    ----------
    vocabulary_ : dict
        A mapping of terms to feature indices.

    fixed_vocabulary_: boolean
        True if a fixed vocabulary of term to indices mapping
        is provided by the user

    stop_words_ : set
        Terms that were ignored because they either:

          - occurred in too many documents (`max_df`)
          - occurred in too few documents (`min_df`)
          - were cut off by feature selection (`max_features`).

        This is only available if no vocabulary was given.

    Examples
    --------
    >>> from sklearn.feature_extraction.text import CountVectorizer
    >>> corpus = [
    ...     'This is the first document.',
    ...     'This document is the second document.',
    ...     'And this is the third one.',
    ...     'Is this the first document?',
    ... ]
    >>> vectorizer = CountVectorizer()
    >>> X = vectorizer.fit_transform(corpus)
    >>> print(vectorizer.get_feature_names())
    ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
    >>> print(X.toarray())
    [[0 1 1 1 0 0 1 0 1]
     [0 2 0 1 0 1 1 0 1]
     [1 0 0 1 1 0 1 1 1]
     [0 1 1 1 0 0 1 0 1]]
    >>> vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(2, 2))
    >>> X2 = vectorizer2.fit_transform(corpus)
    >>> print(vectorizer2.get_feature_names())
    ['and this', 'document is', 'first document', 'is the', 'is this',
    'second document', 'the first', 'the second', 'the third', 'third one',
    'this document', 'this is', 'this the']
    >>> print(X2.toarray())
    [[0 0 1 1 0 0 1 0 0 0 0 1 0]
     [0 1 0 1 0 1 0 1 0 0 1 0 0]
     [1 0 0 1 0 0 0 0 1 1 0 1 0]
     [0 0 1 0 1 0 1 0 0 0 0 0 1]]

    See Also
    --------
    HashingVectorizer, TfidfVectorizer

    Notes
    -----
    The ``stop_words_`` attribute can get large and increase the model size
    when pickling. This attribute is provided only for introspection and can
    be safely removed using delattr or set to None before pickling.
    """

    def __init__(self, input='content', encoding='utf-8',
                 decode_error='strict', strip_accents=None,
                 lowercase=True, preprocessor=None, tokenizer=None,
                 stop_words=None, token_pattern=r"(?u)\b\w\w+\b",
                 ngram_range=(1, 1), analyzer='word',
                 max_df=1.0, min_df=1, max_features=None,
                 vocabulary=None, binary=False, dtype=np.int64):
        self.input = input
        self.encoding = encoding
        self.decode_error = decode_error
        self.strip_accents = strip_accents
        self.preprocessor = preprocessor
        self.tokenizer = tokenizer
        self.analyzer = analyzer
        self.lowercase = lowercase
        self.token_pattern = token_pattern
        self.stop_words = stop_words
        self.max_df = max_df
        self.min_df = min_df
        if max_df < 0 or min_df < 0:
            raise ValueError("negative value for max_df or min_df")
        self.max_features = max_features
        if max_features is not None:
            if (not isinstance(max_features, numbers.Integral) or
                    max_features <= 0):
                raise ValueError(
                    "max_features=%r, neither a positive integer nor None"
                    % max_features)
        self.ngram_range = ngram_range
        self.vocabulary = vocabulary
        self.binary = binary
        self.dtype = dtype

    def _sort_features(self, X, vocabulary):
        """Sort features by name

        Returns a reordered matrix and modifies the vocabulary in place
        """
        sorted_features = sorted(vocabulary.items())
        map_index = np.empty(len(sorted_features), dtype=X.indices.dtype)
        for new_val, (term, old_val) in enumerate(sorted_features):
            vocabulary[term] = new_val
            map_index[old_val] = new_val

        X.indices = map_index.take(X.indices, mode='clip')
        return X

    def _limit_features(self, X, vocabulary, high=None, low=None,
                        limit=None):
        """Remove too rare or too common features.

        Prune features that are non-zero in more documents than high or in
        fewer documents than low, modifying the vocabulary, and restricting
        it to at most the limit most frequent.

        This does not prune samples with zero features.
        """
        if high is None and low is None and limit is None:
            return X, set()

        # Calculate a mask based on document frequencies
        dfs = _document_frequency(X)
        mask = np.ones(len(dfs), dtype=bool)
        if high is not None:
            mask &= dfs <= high
        if low is not None:
            mask &= dfs >= low
        if limit is not None and mask.sum() > limit:
            tfs = np.asarray(X.sum(axis=0)).ravel()
            mask_inds = (-tfs[mask]).argsort()[:limit]
            new_mask = np.zeros(len(dfs), dtype=bool)
            new_mask[np.where(mask)[0][mask_inds]] = True
            mask = new_mask

        new_indices = np.cumsum(mask) - 1  # maps old indices to new
        removed_terms = set()
        for term, old_index in list(vocabulary.items()):
            if mask[old_index]:
                vocabulary[term] = new_indices[old_index]
            else:
                del vocabulary[term]
                removed_terms.add(term)
        kept_indices = np.where(mask)[0]
        if len(kept_indices) == 0:
            raise ValueError("After pruning, no terms remain. Try a lower"
                             " min_df or a higher max_df.")
        return X[:, kept_indices], removed_terms
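
    # Illustrative end-to-end effect (reasoned from the code, not a captured
    # session): with CountVectorizer(min_df=2) on ['aa bb', 'bb cc'], only
    # 'bb' reaches document frequency 2, so 'aa' and 'cc' end up in
    # stop_words_ and the resulting vocabulary_ is {'bb': 0}.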

    def _count_vocab(self, raw_documents, fixed_vocab):
        """Create sparse feature matrix, and vocabulary where fixed_vocab=False
        """
        if fixed_vocab:
            vocabulary = self.vocabulary_
        else:
            # Add a new value when a new vocabulary item is seen
            vocabulary = defaultdict()
            vocabulary.default_factory = vocabulary.__len__

        analyze = self.build_analyzer()
        j_indices = []
        indptr = []

        values = _make_int_array()
        indptr.append(0)
        for doc in raw_documents:
            feature_counter = {}
            for feature in analyze(doc):
                try:
                    feature_idx = vocabulary[feature]
                    if feature_idx not in feature_counter:
                        feature_counter[feature_idx] = 1
                    else:
                        feature_counter[feature_idx] += 1
                except KeyError:
                    # Ignore out-of-vocabulary items for fixed_vocab=True
                    continue

            j_indices.extend(feature_counter.keys())
            values.extend(feature_counter.values())
            indptr.append(len(j_indices))

        if not fixed_vocab:
            # disable defaultdict behaviour
            vocabulary = dict(vocabulary)
            if not vocabulary:
                raise ValueError("empty vocabulary; perhaps the documents only"
                                 " contain stop words")

        if indptr[-1] > 2147483647:  # = 2**31 - 1, the maximum int32 value
            if _IS_32BIT:
                raise ValueError(('sparse CSR array has {} non-zero '
                                  'elements and requires 64 bit indexing, '
                                  'which is unsupported with 32 bit Python.')
                                 .format(indptr[-1]))
            indices_dtype = np.int64
        else:
            indices_dtype = np.int32

        j_indices = np.asarray(j_indices, dtype=indices_dtype)
        indptr = np.asarray(indptr, dtype=indices_dtype)
        values = np.frombuffer(values, dtype=np.intc)

        X = sp.csr_matrix((values, j_indices, indptr),
                          shape=(len(indptr) - 1, len(vocabulary)),
                          dtype=self.dtype)
        X.sort_indices()
        return vocabulary, X

    def fit(self, raw_documents, y=None):
        """Learn a vocabulary dictionary of all tokens in the raw documents.

        Parameters
        ----------
        raw_documents : iterable
            An iterable which yields either str, unicode or file objects.

        Returns
        -------
        self
        """
        self._warn_for_unused_params()
        self.fit_transform(raw_documents)
        return self

    def fit_transform(self, raw_documents, y=None):
        """Learn the vocabulary dictionary and return term-document matrix.

        This is equivalent to fit followed by transform, but more efficiently
        implemented.

        Parameters
        ----------
        raw_documents : iterable
            An iterable which yields either str, unicode or file objects.

        Returns
        -------
        X : array, [n_samples, n_features]
            Document-term matrix.
        """
        # We intentionally don't call the transform method to make
        # fit_transform overridable without unwanted side effects in
        # TfidfVectorizer.
        if isinstance(raw_documents, str):
            raise ValueError(
                "Iterable over raw text documents expected, "
                "string object received.")

        self._validate_params()
        self._validate_vocabulary()
        max_df = self.max_df
        min_df = self.min_df
        max_features = self.max_features

        vocabulary, X = self._count_vocab(raw_documents,
                                          self.fixed_vocabulary_)

        if self.binary:
            X.data.fill(1)

        if not self.fixed_vocabulary_:
            n_doc = X.shape[0]
            max_doc_count = (max_df
                             if isinstance(max_df, numbers.Integral)
                             else max_df * n_doc)
            min_doc_count = (min_df
                             if isinstance(min_df, numbers.Integral)
                             else min_df * n_doc)
            if max_doc_count < min_doc_count:
                raise ValueError(
"max_df corresponds to < documents than min_df") | |
            X, self.stop_words_ = self._limit_features(X, vocabulary,
                                                       max_doc_count,
                                                       min_doc_count,
                                                       max_features)

            X = self._sort_features(X, vocabulary)

            self.vocabulary_ = vocabulary

        return X

    def transform(self, raw_documents):
        """Transform documents to document-term matrix.

        Extract token counts out of raw text documents using the vocabulary
        fitted with fit or the one provided to the constructor.

        Parameters
        ----------
        raw_documents : iterable
            An iterable which yields either str, unicode or file objects.

        Returns
        -------
        X : sparse matrix, [n_samples, n_features]
            Document-term matrix.
        """
        if isinstance(raw_documents, str):
            raise ValueError(
                "Iterable over raw text documents expected, "
                "string object received.")
        self._check_vocabulary()

        # use the same matrix-building strategy as fit_transform
        _, X = self._count_vocab(raw_documents, fixed_vocab=True)
        if self.binary:
            X.data.fill(1)
        return X

    def inverse_transform(self, X):
        """Return terms per document with nonzero entries in X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Document-term matrix.

        Returns
        -------
        X_inv : list of arrays, len = n_samples
            List of arrays of terms.
        """
        self._check_vocabulary()

        if sp.issparse(X):
            # We need CSR format for fast row manipulations.
            X = X.tocsr()
        else:
            # We need to convert X to a matrix, so that the indexing
            # returns 2D objects
            X = np.asmatrix(X)
        n_samples = X.shape[0]

        terms = np.array(list(self.vocabulary_.keys()))
        indices = np.array(list(self.vocabulary_.values()))
        inverse_vocabulary = terms[np.argsort(indices)]

        return [inverse_vocabulary[X[i, :].nonzero()[1]].ravel()
                for i in range(n_samples)]
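
    # Illustrative round trip (reasoned from the code): after
    # X = fit_transform(['apple banana', 'banana cherry']),
    # inverse_transform(X) returns one array of terms per row, e.g.
    # [array(['apple', 'banana'], ...), array(['banana', 'cherry'], ...)].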

    def get_feature_names(self):
        """Array mapping from feature integer indices to feature name.

        Returns
        -------
        feature_names : list
            A list of feature names.
        """
        self._check_vocabulary()

        return [t for t, i in sorted(self.vocabulary_.items(),
                                     key=itemgetter(1))]

    def _more_tags(self):
        return {'X_types': ['string']}


def _make_int_array():
    """Construct an array.array of a type suitable for scipy.sparse indices."""
    return array.array("i")


class TfidfTransformer(TransformerMixin, BaseEstimator):
    """Transform a count matrix to a normalized tf or tf-idf representation

    Tf means term-frequency while tf-idf means term-frequency times inverse
    document-frequency. This is a common term weighting scheme in information
    retrieval, that has also found good use in document classification.

    The goal of using tf-idf instead of the raw frequencies of occurrence of a
    token in a given document is to scale down the impact of tokens that occur
    very frequently in a given corpus and that are hence empirically less
    informative than features that occur in a small fraction of the training
    corpus.

    The formula that is used to compute the tf-idf for a term t of a document d
    in a document set is tf-idf(t, d) = tf(t, d) * idf(t), and the idf is
    computed as idf(t) = log [ n / df(t) ] + 1 (if ``smooth_idf=False``), where
    n is the total number of documents in the document set and df(t) is the
    document frequency of t; the document frequency is the number of documents
    in the document set that contain the term t. The effect of adding "1" to
    the idf in the equation above is that terms with zero idf, i.e., terms
    that occur in all documents in a training set, will not be entirely
    ignored.
    (Note that the idf formula above differs from the standard textbook
    notation that defines the idf as
    idf(t) = log [ n / (df(t) + 1) ]).

    If ``smooth_idf=True`` (the default), the constant "1" is added to the
    numerator and denominator of the idf as if an extra document was seen
    containing every term in the collection exactly once, which prevents
    zero divisions: idf(t) = log [ (1 + n) / (1 + df(t)) ] + 1.

    Furthermore, the formulas used to compute tf and idf depend
    on parameter settings that correspond to the SMART notation used in IR
    as follows:

    Tf is "n" (natural) by default, "l" (logarithmic) when
    ``sublinear_tf=True``.
    Idf is "t" when use_idf is given, "n" (none) otherwise.
    Normalization is "c" (cosine) when ``norm='l2'``, "n" (none)
    when ``norm=None``.

    Read more in the :ref:`User Guide <text_feature_extraction>`.

    Parameters
    ----------
    norm : 'l1', 'l2' or None, optional (default='l2')
        Each output row will have unit norm, either:
        * 'l2': Sum of squares of vector elements is 1. The cosine
        similarity between two vectors is their dot product when l2 norm has
        been applied.
        * 'l1': Sum of absolute values of vector elements is 1.
        See :func:`preprocessing.normalize`

    use_idf : boolean (default=True)
        Enable inverse-document-frequency reweighting.

    smooth_idf : boolean (default=True)
        Smooth idf weights by adding one to document frequencies, as if an
        extra document was seen containing every term in the collection
        exactly once. Prevents zero divisions.

    sublinear_tf : boolean (default=False)
        Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).

    Attributes
    ----------
    idf_ : array, shape (n_features)
        The inverse document frequency (IDF) vector; only defined
        if ``use_idf`` is True.

    Examples
    --------
    >>> from sklearn.feature_extraction.text import TfidfTransformer
    >>> from sklearn.feature_extraction.text import CountVectorizer
    >>> from sklearn.pipeline import Pipeline
    >>> import numpy as np
    >>> corpus = ['this is the first document',
    ...           'this document is the second document',
    ...           'and this is the third one',
    ...           'is this the first document']
    >>> vocabulary = ['this', 'document', 'first', 'is', 'second', 'the',
    ...               'and', 'one']
    >>> pipe = Pipeline([('count', CountVectorizer(vocabulary=vocabulary)),
    ...                  ('tfid', TfidfTransformer())]).fit(corpus)
    >>> pipe['count'].transform(corpus).toarray()
    array([[1, 1, 1, 1, 0, 1, 0, 0],
           [1, 2, 0, 1, 1, 1, 0, 0],
           [1, 0, 0, 1, 0, 1, 1, 1],
           [1, 1, 1, 1, 0, 1, 0, 0]])
    >>> pipe['tfid'].idf_
    array([1.        , 1.22314355, 1.51082562, 1.        , 1.91629073,
           1.        , 1.91629073, 1.91629073])
    >>> pipe.transform(corpus).shape
    (4, 8)

    References
    ----------
    .. [Yates2011] R. Baeza-Yates and B. Ribeiro-Neto (2011). Modern
                   Information Retrieval. Addison Wesley, pp. 68-74.

    .. [MRS2008] C.D. Manning, P. Raghavan and H. Schütze (2008).
                 Introduction to Information Retrieval. Cambridge University
                 Press, pp. 118-120.
    """

    def __init__(self, norm='l2', use_idf=True, smooth_idf=True,
                 sublinear_tf=False):
        self.norm = norm
        self.use_idf = use_idf
        self.smooth_idf = smooth_idf
        self.sublinear_tf = sublinear_tf

    def fit(self, X, y=None):
        """Learn the idf vector (global term weights)

        Parameters
        ----------
        X : sparse matrix, [n_samples, n_features]
            a matrix of term/token counts
        """
        X = check_array(X, accept_sparse=('csr', 'csc'))
        if not sp.issparse(X):
            X = sp.csr_matrix(X)
        dtype = X.dtype if X.dtype in FLOAT_DTYPES else np.float64

        if self.use_idf:
            n_samples, n_features = X.shape
            df = _document_frequency(X)
            df = df.astype(dtype, **_astype_copy_false(df))

            # perform idf smoothing if required
            df += int(self.smooth_idf)
            n_samples += int(self.smooth_idf)

            # log+1 instead of log makes sure terms with zero idf don't get
            # suppressed entirely.
            idf = np.log(n_samples / df) + 1
            self._idf_diag = sp.diags(idf, offsets=0,
                                      shape=(n_features, n_features),
                                      format='csr',
                                      dtype=dtype)

        return self
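
    # Worked check of the smoothed idf formula above (illustrative): in the
    # class docstring example there are n = 4 documents and 'first' occurs in
    # df = 2 of them, so idf = log((1 + 4) / (1 + 2)) + 1 = log(5/3) + 1
    # ~= 1.51082562, matching the third entry of `pipe['tfid'].idf_`.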

    def transform(self, X, copy=True):
        """Transform a count matrix to a tf or tf-idf representation

        Parameters
        ----------
        X : sparse matrix, [n_samples, n_features]
            a matrix of term/token counts

        copy : boolean, default True
            Whether to copy X and operate on the copy or perform in-place
            operations.

        Returns
        -------
        vectors : sparse matrix, [n_samples, n_features]
        """
        X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES, copy=copy)
        if not sp.issparse(X):
            X = sp.csr_matrix(X, dtype=np.float64)

        n_samples, n_features = X.shape

        if self.sublinear_tf:
            np.log(X.data, X.data)
            X.data += 1

        if self.use_idf:
            check_is_fitted(self, msg='idf vector is not fitted')

            expected_n_features = self._idf_diag.shape[0]
            if n_features != expected_n_features:
                raise ValueError("Input has n_features=%d while the model"
                                 " has been trained with n_features=%d" % (
                                     n_features, expected_n_features))
            # *= doesn't work
            X = X * self._idf_diag

        if self.norm:
            X = normalize(X, norm=self.norm, copy=False)

        return X

    @property
    def idf_(self):
        # if _idf_diag is not set, this will raise an attribute error,
        # which means hasattr(self, "idf_") is False
        return np.ravel(self._idf_diag.sum(axis=0))

    @idf_.setter
    def idf_(self, value):
        value = np.asarray(value, dtype=np.float64)
        n_features = value.shape[0]
        self._idf_diag = sp.spdiags(value, diags=0, m=n_features,
                                    n=n_features, format='csr')

    def _more_tags(self):
        return {'X_types': 'sparse'}


class TfidfVectorizer(CountVectorizer):
    """Convert a collection of raw documents to a matrix of TF-IDF features.

    Equivalent to :class:`CountVectorizer` followed by
    :class:`TfidfTransformer`.

    Read more in the :ref:`User Guide <text_feature_extraction>`.

    Parameters
    ----------
    input : str {'filename', 'file', 'content'}
        If 'filename', the sequence passed as an argument to fit is
        expected to be a list of filenames that need reading to fetch
        the raw content to analyze.

        If 'file', the sequence items must have a 'read' method (file-like
        object) that is called to fetch the bytes in memory.

        Otherwise the input is expected to be a sequence of items that
        can be of type string or byte.

    encoding : str, default='utf-8'
        If bytes or files are given to analyze, this encoding is used to
        decode.

    decode_error : {'strict', 'ignore', 'replace'} (default='strict')
        Instruction on what to do if a byte sequence is given to analyze that
        contains characters not of the given `encoding`. By default, it is
        'strict', meaning that a UnicodeDecodeError will be raised. Other
        values are 'ignore' and 'replace'.

    strip_accents : {'ascii', 'unicode', None} (default=None)
        Remove accents and perform other character normalization
        during the preprocessing step.
        'ascii' is a fast method that only works on characters that have
        a direct ASCII mapping.
        'unicode' is a slightly slower method that works on any characters.
        None (default) does nothing.

        Both 'ascii' and 'unicode' use NFKD normalization from
        :func:`unicodedata.normalize`.

    lowercase : bool (default=True)
        Convert all characters to lowercase before tokenizing.

    preprocessor : callable or None (default=None)
        Override the preprocessing (string transformation) stage while
        preserving the tokenizing and n-grams generation steps.
        Only applies if ``analyzer is not callable``.

    tokenizer : callable or None (default=None)
        Override the string tokenization step while preserving the
        preprocessing and n-grams generation steps.
        Only applies if ``analyzer == 'word'``.

    analyzer : str, {'word', 'char', 'char_wb'} or callable
        Whether the feature should be made of word or character n-grams.
        Option 'char_wb' creates character n-grams only from text inside
        word boundaries; n-grams at the edges of words are padded with space.

        If a callable is passed it is used to extract the sequence of features
        out of the raw, unprocessed input.

        .. versionchanged:: 0.21

        Since v0.21, if ``input`` is ``filename`` or ``file``, the data is
        first read from the file and then passed to the given callable
        analyzer.

    stop_words : str {'english'}, list, or None (default=None)
        If a string, it is passed to _check_stop_list and the appropriate stop
        list is returned. 'english' is currently the only supported string
        value.
        There are several known issues with 'english' and you should
        consider an alternative (see :ref:`stop_words`).

        If a list, that list is assumed to contain stop words, all of which
        will be removed from the resulting tokens.
        Only applies if ``analyzer == 'word'``.

        If None, no stop words will be used. max_df can be set to a value
        in the range [0.7, 1.0) to automatically detect and filter stop
        words based on intra corpus document frequency of terms.

    token_pattern : str
        Regular expression denoting what constitutes a "token", only used
        if ``analyzer == 'word'``. The default regexp selects tokens of 2
        or more alphanumeric characters (punctuation is completely ignored
        and always treated as a token separator).

    ngram_range : tuple (min_n, max_n), default=(1, 1)
        The lower and upper boundary of the range of n-values for different
        n-grams to be extracted. All values of n such that min_n <= n <= max_n
        will be used. For example an ``ngram_range`` of ``(1, 1)`` means only
        unigrams, ``(1, 2)`` means unigrams and bigrams, and ``(2, 2)`` means
        only bigrams.
        Only applies if ``analyzer is not callable``.

    max_df : float in range [0.0, 1.0] or int (default=1.0)
        When building the vocabulary ignore terms that have a document
        frequency strictly higher than the given threshold (corpus-specific
        stop words).
        If float, the parameter represents a proportion of documents, integer
        absolute counts.
        This parameter is ignored if vocabulary is not None.

    min_df : float in range [0.0, 1.0] or int (default=1)
        When building the vocabulary ignore terms that have a document
        frequency strictly lower than the given threshold. This value is also
        called cut-off in the literature.
        If float, the parameter represents a proportion of documents, integer
        absolute counts.
        This parameter is ignored if vocabulary is not None.

    max_features : int or None (default=None)
        If not None, build a vocabulary that only consider the top
        max_features ordered by term frequency across the corpus.

        This parameter is ignored if vocabulary is not None.

    vocabulary : Mapping or iterable, optional (default=None)
        Either a Mapping (e.g., a dict) where keys are terms and values are
        indices in the feature matrix, or an iterable over terms. If not
        given, a vocabulary is determined from the input documents.

    binary : bool (default=False)
        If True, all non-zero term counts are set to 1. This does not mean
        outputs will have only 0/1 values, only that the tf term in tf-idf
        is binary. (Set idf and normalization to False to get 0/1 outputs).

    dtype : type, optional (default=float64)
        Type of the matrix returned by fit_transform() or transform().

    norm : 'l1', 'l2' or None, optional (default='l2')
        Each output row will have unit norm, either:
        * 'l2': Sum of squares of vector elements is 1. The cosine
        similarity between two vectors is their dot product when l2 norm has
        been applied.
        * 'l1': Sum of absolute values of vector elements is 1.
        See :func:`preprocessing.normalize`.

    use_idf : bool (default=True)
        Enable inverse-document-frequency reweighting.

    smooth_idf : bool (default=True)
        Smooth idf weights by adding one to document frequencies, as if an
        extra document was seen containing every term in the collection
        exactly once. Prevents zero divisions.

    sublinear_tf : bool (default=False)
        Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).

    Attributes
    ----------
    vocabulary_ : dict
        A mapping of terms to feature indices.

    fixed_vocabulary_: bool
        True if a fixed vocabulary of term to indices mapping
        is provided by the user

    idf_ : array, shape (n_features)
        The inverse document frequency (IDF) vector; only defined
        if ``use_idf`` is True.

    stop_words_ : set
        Terms that were ignored because they either:

          - occurred in too many documents (`max_df`)
          - occurred in too few documents (`min_df`)
          - were cut off by feature selection (`max_features`).

        This is only available if no vocabulary was given.

    See Also
    --------
    CountVectorizer : Transforms text into a sparse matrix of n-gram counts.

    TfidfTransformer : Performs the TF-IDF transformation from a provided
        matrix of counts.

    Notes
    -----
    The ``stop_words_`` attribute can get large and increase the model size
    when pickling. This attribute is provided only for introspection and can
    be safely removed using delattr or set to None before pickling.

    Examples
    --------
    >>> from sklearn.feature_extraction.text import TfidfVectorizer
    >>> corpus = [
    ...     'This is the first document.',
    ...     'This document is the second document.',
    ...     'And this is the third one.',
    ...     'Is this the first document?',
    ... ]
    >>> vectorizer = TfidfVectorizer()
    >>> X = vectorizer.fit_transform(corpus)
    >>> print(vectorizer.get_feature_names())
    ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
    >>> print(X.shape)
    (4, 9)
    """

    def __init__(self, input='content', encoding='utf-8',
                 decode_error='strict', strip_accents=None, lowercase=True,
                 preprocessor=None, tokenizer=None, analyzer='word',
                 stop_words=None, token_pattern=r"(?u)\b\w\w+\b",
                 ngram_range=(1, 1), max_df=1.0, min_df=1,
                 max_features=None, vocabulary=None, binary=False,
                 dtype=np.float64, norm='l2', use_idf=True, smooth_idf=True,
                 sublinear_tf=False):

        super().__init__(
            input=input, encoding=encoding, decode_error=decode_error,
            strip_accents=strip_accents, lowercase=lowercase,
            preprocessor=preprocessor, tokenizer=tokenizer, analyzer=analyzer,
            stop_words=stop_words, token_pattern=token_pattern,
            ngram_range=ngram_range, max_df=max_df, min_df=min_df,
            max_features=max_features, vocabulary=vocabulary, binary=binary,
            dtype=dtype)

        self._tfidf = TfidfTransformer(norm=norm, use_idf=use_idf,
                                       smooth_idf=smooth_idf,
                                       sublinear_tf=sublinear_tf)

    # Broadcast the TF-IDF parameters to the underlying transformer instance
    # for easy grid search and repr

    @property
    def norm(self):
        return self._tfidf.norm

    @norm.setter
    def norm(self, value):
        self._tfidf.norm = value

    @property
    def use_idf(self):
        return self._tfidf.use_idf

    @use_idf.setter
    def use_idf(self, value):
        self._tfidf.use_idf = value

    @property
    def smooth_idf(self):
        return self._tfidf.smooth_idf

    @smooth_idf.setter
    def smooth_idf(self, value):
        self._tfidf.smooth_idf = value

    @property
    def sublinear_tf(self):
        return self._tfidf.sublinear_tf

    @sublinear_tf.setter
    def sublinear_tf(self, value):
        self._tfidf.sublinear_tf = value

    @property
    def idf_(self):
        return self._tfidf.idf_

    @idf_.setter
    def idf_(self, value):
        self._validate_vocabulary()
        if hasattr(self, 'vocabulary_'):
            if len(self.vocabulary_) != len(value):
                raise ValueError("idf length = %d must be equal "
                                 "to vocabulary size = %d" %
                                 (len(value), len(self.vocabulary_)))
        self._tfidf.idf_ = value

    def _check_params(self):
        if self.dtype not in FLOAT_DTYPES:
            warnings.warn("Only {} 'dtype' should be used. {} 'dtype' will "
                          "be converted to np.float64."
                          .format(FLOAT_DTYPES, self.dtype),
                          UserWarning)

    def fit(self, raw_documents, y=None):
        """Learn vocabulary and idf from training set.

        Parameters
        ----------
        raw_documents : iterable
            An iterable which yields either str, unicode or file objects.

        y : None
            This parameter is not needed to compute tfidf.

        Returns
        -------
        self : object
            Fitted vectorizer.
        """
        self._check_params()
        self._warn_for_unused_params()
        X = super().fit_transform(raw_documents)
        self._tfidf.fit(X)
        return self

    def fit_transform(self, raw_documents, y=None):
        """Learn vocabulary and idf, return term-document matrix.

        This is equivalent to fit followed by transform, but more efficiently
        implemented.

        Parameters
        ----------
        raw_documents : iterable
            An iterable which yields either str, unicode or file objects.

        y : None
            This parameter is ignored.

        Returns
        -------
        X : sparse matrix, [n_samples, n_features]
            Tf-idf-weighted document-term matrix.
        """
        self._check_params()
        X = super().fit_transform(raw_documents)
        self._tfidf.fit(X)
        # X is already a transformed view of raw_documents so
        # we set copy to False
        return self._tfidf.transform(X, copy=False)

    def transform(self, raw_documents, copy="deprecated"):
        """Transform documents to document-term matrix.

        Uses the vocabulary and document frequencies (df) learned by fit (or
        fit_transform).

        Parameters
        ----------
        raw_documents : iterable
            An iterable which yields either str, unicode or file objects.

        copy : bool, default True
            Whether to copy X and operate on the copy or perform in-place
            operations.

            .. deprecated:: 0.22
               The `copy` parameter is unused and was deprecated in version
               0.22 and will be removed in 0.24. This parameter will be
               ignored.

        Returns
        -------
        X : sparse matrix, [n_samples, n_features]
            Tf-idf-weighted document-term matrix.
        """
        check_is_fitted(self, msg='The tfidf vector is not fitted')

        # FIXME Remove copy parameter support in 0.24
        if copy != "deprecated":
            msg = ("'copy' param is unused and has been deprecated since "
                   "version 0.22. Backward compatibility for 'copy' will "
                   "be removed in 0.24.")
            warnings.warn(msg, FutureWarning)
        X = super().transform(raw_documents)
        return self._tfidf.transform(X, copy=False)

    def _more_tags(self):
        return {'X_types': ['string'], '_skip_test': True}