Skip to content

Commit

Permalink
FIX TfidfVectorizer to no longer ignore binary param
Browse files Browse the repository at this point in the history
Also changed the docs to clarify that binary=True means
binary tf, not binary output.
  • Loading branch information
larsmans committed Mar 24, 2014
1 parent faba875 commit 39b859b
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 5 deletions.
11 changes: 11 additions & 0 deletions sklearn/feature_extraction/tests/test_text.py
Expand Up @@ -866,3 +866,14 @@ def test_pickling_transformer():
def test_non_unique_vocab():
    # A user-supplied vocabulary that repeats a term ('a' appears three
    # times) must make the CountVectorizer constructor raise ValueError.
    duplicated_terms = ['a', 'b', 'c', 'a', 'a']
    assert_raises(ValueError, CountVectorizer, vocabulary=duplicated_terms)


def test_tfidfvectorizer_binary():
    # Non-regression test: TfidfVectorizer used to ignore its "binary" param.
    docs = ['hello world', 'hello hello']
    # Row-major flattening of the expected 2x2 matrix: the first document
    # contains both terms, the second contains only 'hello' -- and its
    # repeated occurrence must be counted once because binary=True.
    expected_flat = [1, 1, 1, 0]

    vectorizer = TfidfVectorizer(binary=True, use_idf=False, norm=None)
    assert_true(vectorizer.binary)

    # Check both code paths: fitting + transforming in one step, and
    # transforming with the already-fitted vectorizer.
    fitted = vectorizer.fit_transform(docs).toarray()
    assert_array_equal(fitted.ravel(), expected_flat)
    transformed = vectorizer.transform(docs).toarray()
    assert_array_equal(transformed.ravel(), expected_flat)
10 changes: 5 additions & 5 deletions sklearn/feature_extraction/text.py
Expand Up @@ -1120,9 +1120,9 @@ class TfidfVectorizer(CountVectorizer):
given, a vocabulary is determined from the input documents.
binary : boolean, False by default.
If True, all non zero counts are set to 1. This is useful for discrete
probabilistic models that model binary events rather than integer
counts.
If True, all non-zero term counts are set to 1. This does not mean
outputs will have only 0/1 values, only that the tf term in tf-idf
is binary. (Set use_idf=False and norm=None to get 0/1 outputs.)
dtype : type, optional
Type of the matrix returned by fit_transform() or transform().
Expand Down Expand Up @@ -1170,7 +1170,7 @@ def __init__(self, input='content', encoding='utf-8', charset=None,
preprocessor=preprocessor, tokenizer=tokenizer, analyzer=analyzer,
stop_words=stop_words, token_pattern=token_pattern,
ngram_range=ngram_range, max_df=max_df, min_df=min_df,
max_features=max_features, vocabulary=vocabulary, binary=False,
max_features=max_features, vocabulary=vocabulary, binary=binary,
dtype=dtype)

self._tfidf = TfidfTransformer(norm=norm, use_idf=use_idf,
Expand Down Expand Up @@ -1249,4 +1249,4 @@ def transform(self, raw_documents, copy=True):
vectors : sparse matrix, [n_samples, n_features]
"""
X = super(TfidfVectorizer, self).transform(raw_documents)
return self._tfidf.transform(X, copy)
return self._tfidf.transform(X, copy=False)

0 comments on commit 39b859b

Please sign in to comment.