FIX TfidfVectorizer to no longer ignore binary param

Also changed the docs to clarify that binary=True means binary tf, not binary output.
scikit-learn · Mar 24, 2014 · 39b859b · 39b859b
1 parent faba875
commit 39b859b
Show file tree

Hide file tree

Showing 2 changed files with 16 additions and 5 deletions.
diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py
@@ -866,3 +866,14 @@ def test_pickling_transformer():
 def test_non_unique_vocab():
     vocab = ['a', 'b', 'c', 'a', 'a']
     assert_raises(ValueError, CountVectorizer, vocabulary=vocab)
+
+
+def test_tfidfvectorizer_binary():
+    # Non-regression test: TfidfVectorizer used to ignore its "binary" param.
+    v = TfidfVectorizer(binary=True, use_idf=False, norm=None)
+    assert_true(v.binary)
+
+    X = v.fit_transform(['hello world', 'hello hello']).toarray()
+    assert_array_equal(X.ravel(), [1, 1, 1, 0])
+    X2 = v.transform(['hello world', 'hello hello']).toarray()
+    assert_array_equal(X2.ravel(), [1, 1, 1, 0])
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
@@ -1120,9 +1120,9 @@ class TfidfVectorizer(CountVectorizer):
         given, a vocabulary is determined from the input documents.
 
     binary : boolean, False by default.
-        If True, all non zero counts are set to 1. This is useful for discrete
-        probabilistic models that model binary events rather than integer
-        counts.
+        If True, all non-zero term counts are set to 1. This does not mean
+        outputs will have only 0/1 values, only that the tf term in tf-idf
+        is binary. (Set use_idf=False and norm=None to get 0/1 outputs.)
 
     dtype : type, optional
         Type of the matrix returned by fit_transform() or transform().
@@ -1170,7 +1170,7 @@ def __init__(self, input='content', encoding='utf-8', charset=None,
             preprocessor=preprocessor, tokenizer=tokenizer, analyzer=analyzer,
             stop_words=stop_words, token_pattern=token_pattern,
             ngram_range=ngram_range, max_df=max_df, min_df=min_df,
-            max_features=max_features, vocabulary=vocabulary, binary=False,
+            max_features=max_features, vocabulary=vocabulary, binary=binary,
             dtype=dtype)
 
         self._tfidf = TfidfTransformer(norm=norm, use_idf=use_idf,
@@ -1249,4 +1249,4 @@ def transform(self, raw_documents, copy=True):
         vectors : sparse matrix, [n_samples, n_features]
         """
         X = super(TfidfVectorizer, self).transform(raw_documents)
-        return self._tfidf.transform(X, copy)
+        return self._tfidf.transform(X, copy=False)