Skip to content

Commit

Permalink
Revert "BUG fixed and cosmetics in CountVectorizer"
Browse files Browse the repository at this point in the history
The reverted change introduced a major performance regression that rendered
the text feature extraction unusably slow. I am reverting it pending a better fix.

This reverts commit a95af4d.
  • Loading branch information
ogrisel committed Oct 2, 2011
1 parent 569e446 commit de6e930
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 21 deletions.
18 changes: 10 additions & 8 deletions sklearn/feature_extraction/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,27 +318,29 @@ def fit_transform(self, raw_documents, y=None):
# TODO: parallelize the following loop with joblib?
# (see XXX up ahead)
for doc in raw_documents:
term_count_current = Counter(self.analyzer.analyze(doc))
term_counts += term_count_current
term_count_current = Counter()

if max_df < 1.0:
document_counts.update(term_count_current)
for term in self.analyzer.analyze(doc):
term_count_current[term] += 1
term_counts[term] += 1

if max_df is not None:
for term in term_count_current:
document_counts[term] += 1

term_counts_per_doc.append(term_count_current)

n_doc = len(term_counts_per_doc)

# filter out stop words: terms that occur in almost all documents
if max_df < 1.0:
if max_df is not None:
max_document_count = max_df * n_doc
stop_words = set(t for t, dc in document_counts.iteritems()
if dc > max_document_count)
else:
stop_words = set()

# list the terms that should be part of the vocabulary
if max_features is None:
terms = set(term_counts) - stop_words
terms = [t for t in term_counts if t not in stop_words]
else:
# extract the most frequent terms for the vocabulary
terms = set()
Expand Down
15 changes: 2 additions & 13 deletions sklearn/utils/fixes.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,25 +24,14 @@ def product(*args, **kwds):
try:
Counter = collections.Counter
except AttributeError:
# Partial replacement for Python 2.7 collections.Counter
# Partial replacement for Python 2.7 Counter
class Counter(collections.defaultdict):
def __init__(self, iterable=(), **kwargs):
def __init__(self, **kwargs):
super(Counter, self).__init__(int, **kwargs)
self.update(iterable)

def __iadd__(self, other):
"""self += other; adds counts for elements in other"""
for x, n in other.iteritems():
self[x] += n
return self

def most_common(self):
return sorted(self.iteritems(), key=itemgetter(1), reverse=True)

def update(self, iterable):
for x in iterable:
self[x] += 1


def _unique(ar, return_index=False, return_inverse=False):
"""A replacement for the np.unique that appeared in numpy 1.4.
Expand Down

0 comments on commit de6e930

Please sign in to comment.