diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 4e9e14e28a1dc..1f0d8fd3c0e24 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -318,27 +318,32 @@ def fit_transform(self, raw_documents, y=None):
         # TODO: parallelize the following loop with joblib?
         # (see XXX up ahead)
         for doc in raw_documents:
-            term_count_current = Counter(self.analyzer.analyze(doc))
-            term_counts += term_count_current
+            term_count_current = Counter()
 
-            if max_df < 1.0:
-                document_counts.update(term_count_current)
+            for term in self.analyzer.analyze(doc):
+                term_count_current[term] += 1
+                term_counts[term] += 1
+
+            if max_df is not None:
+                for term in term_count_current:
+                    document_counts[term] += 1
 
             term_counts_per_doc.append(term_count_current)
 
         n_doc = len(term_counts_per_doc)
 
         # filter out stop words: terms that occur in almost all documents
-        if max_df < 1.0:
+        if max_df is not None:
             max_document_count = max_df * n_doc
             stop_words = set(t for t, dc in document_counts.iteritems()
                                if dc > max_document_count)
         else:
+            # max_df is None: no document-frequency filter is applied
             stop_words = set()
 
         # list the terms that should be part of the vocabulary
         if max_features is None:
-            terms = set(term_counts) - stop_words
+            terms = [t for t in term_counts if t not in stop_words]
         else:
             # extract the most frequent terms for the vocabulary
             terms = set()
diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py
index 752fb59b704ff..75f8b865f6c9d 100644
--- a/sklearn/utils/fixes.py
+++ b/sklearn/utils/fixes.py
@@ -24,25 +24,14 @@ def product(*args, **kwds):
 try:
     Counter = collections.Counter
 except AttributeError:
-    # Partial replacement for Python 2.7 collections.Counter
+    # Partial replacement for Python 2.7 Counter
     class Counter(collections.defaultdict):
-        def __init__(self, iterable=(), **kwargs):
+        def __init__(self, **kwargs):
             super(Counter, self).__init__(int, **kwargs)
-            self.update(iterable)
-
-        def __iadd__(self, other):
-            """self += other; adds counts for elements in other"""
-            for x, n in other.iteritems():
-                self[x] += n
-            return self
 
         def most_common(self):
             return sorted(self.iteritems(), key=itemgetter(1), reverse=True)
 
-        def update(self, iterable):
-            for x in iterable:
-                self[x] += 1
-
 
 def _unique(ar, return_index=False, return_inverse=False):
     """A replacement for the np.unique that appeared in numpy 1.4.