Skip to content

Commit

Permalink
Revert "BUG fixed and cosmetics in CountVectorizer"
Browse files Browse the repository at this point in the history
The reverted change introduced a major performance regression that rendered
the text feature extraction unusably slow. I am reverting it pending a better fix.

This reverts commit a95af4d.
  • Loading branch information
ogrisel committed Oct 2, 2011
1 parent 569e446 commit de6e930
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 21 deletions.
18 changes: 10 additions & 8 deletions sklearn/feature_extraction/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,27 +318,29 @@ def fit_transform(self, raw_documents, y=None):
# TODO: parallelize the following loop with joblib?
# (see XXX up ahead)
for doc in raw_documents:
term_count_current = Counter(self.analyzer.analyze(doc))
term_counts += term_count_current
term_count_current = Counter()

if max_df < 1.0:
document_counts.update(term_count_current)
for term in self.analyzer.analyze(doc):
term_count_current[term] += 1
term_counts[term] += 1

if max_df is not None:
for term in term_count_current:
document_counts[term] += 1

term_counts_per_doc.append(term_count_current)

n_doc = len(term_counts_per_doc)

# filter out stop words: terms that occur in almost all documents
if max_df < 1.0:
if max_df is not None:
max_document_count = max_df * n_doc
stop_words = set(t for t, dc in document_counts.iteritems()
if dc > max_document_count)
else:
stop_words = set()

# list the terms that should be part of the vocabulary
if max_features is None:
terms = set(term_counts) - stop_words
terms = [t for t in term_counts if t not in stop_words]
else:
# extract the most frequent terms for the vocabulary
terms = set()
Expand Down
15 changes: 2 additions & 13 deletions sklearn/utils/fixes.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,25 +24,14 @@ def product(*args, **kwds):
try:
Counter = collections.Counter
except AttributeError:
# Partial replacement for Python 2.7 collections.Counter
# Partial replacement for Python 2.7 Counter
class Counter(collections.defaultdict):
def __init__(self, iterable=(), **kwargs):
def __init__(self, **kwargs):
super(Counter, self).__init__(int, **kwargs)
self.update(iterable)

def __iadd__(self, other):
"""self += other; adds counts for elements in other"""
for x, n in other.iteritems():
self[x] += n
return self

def most_common(self):
return sorted(self.iteritems(), key=itemgetter(1), reverse=True)

def update(self, iterable):
for x in iterable:
self[x] += 1


def _unique(ar, return_index=False, return_inverse=False):
"""A replacement for the np.unique that appeared in numpy 1.4.
Expand Down

0 comments on commit de6e930

Please sign in to comment.