# Learning Scikit-learn: Machine Learning in Python

## Notebook for Chapter 2: Supervised Learning - Text Classification with Naïve Bayes

In [None]:
%pylab inline

Import the newsgroup Dataset, explore its structure and data

In [None]:
from sklearn.datasets import fetch_20newsgroups


In [None]:
news = fetch_20newsgroups(subset='all')

In [None]:
news.keys()

In [None]:
print type(news.data), type(news.target), type(news.target_names)
print news.target_names
print len(news.data)
print len(news.target)

In [None]:
print news.data[0]
print news.target[0], news.target_names[news.target[0]]

Build training and testing datasets:

In [7]:
SPLIT_PERC = 0.75
split_size = int(len(news.data)*SPLIT_PERC)
X_train = news.data[:split_size]
X_test = news.data[split_size:]
y_train = news.target[:split_size]
y_test = news.target[split_size:]



This function will serve to perform and evaluate a cross validation:

In [8]:
from sklearn.cross_validation import cross_val_score, KFold
from scipy.stats import sem

def evaluate_cross_validation(clf, X, y, K):
    # create a k-fold croos validation iterator of k=5 folds
    cv = KFold(len(y), K, shuffle=True, random_state=0)
    # by default the score used is the one returned by score method of the estimator (accuracy)
    scores = cross_val_score(clf, X, y, cv=cv)
    print scores
    print ("Mean score: {0:.3f} (+/-{1:.3f})").format(
        np.mean(scores), sem(scores))

Evaluate three models with the same Naive Bayes classifier, but with different vectorizers:

In [9]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer

clf_1 = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
])
clf_2 = Pipeline([
    ('vect', HashingVectorizer(non_negative=True)),
    ('clf', MultinomialNB()),
])
clf_3 = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

In [10]:
clfs = [clf_1, clf_2, clf_3]
for clf in clfs:
    evaluate_cross_validation(clf, news.data, news.target, 5)


[ 0.85782493  0.85725657  0.84664367  0.85911382  0.8458477 ]
Mean score: 0.853 (+/-0.003)
[ 0.75543767  0.77659857  0.77049615  0.78508888  0.76200584]
Mean score: 0.770 (+/-0.005)
[ 0.84482759  0.85990979  0.84558238  0.85990979  0.84213319]
Mean score: 0.850 (+/-0.004)


We will keep the TF-IDF vectorizer but use a different regular expression to pefrom tokenization:

In [11]:
clf_4 = Pipeline([
    ('vect', TfidfVectorizer(
                token_pattern=ur"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b",
    )),
    ('clf', MultinomialNB()),
])

In [12]:
evaluate_cross_validation(clf_4, news.data, news.target, 5)

[ 0.86100796  0.8718493   0.86203237  0.87291059  0.8588485 ]
Mean score: 0.865 (+/-0.003)


Try to improve performance filtering the stop words:

In [13]:
def get_stop_words():
    result = set()
    for line in open('stopwords_en.txt', 'r').readlines():
        result.add(line.strip())
    return result

In [14]:
stop_words = get_stop_words()


In [15]:
clf_5 = Pipeline([
    ('vect', TfidfVectorizer(
                stop_words=stop_words,
                token_pattern=ur"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b",    
    )),
    ('clf', MultinomialNB()),
])

In [16]:
evaluate_cross_validation(clf_5, news.data, news.target, 5)

[ 0.88116711  0.89519767  0.88325816  0.89227912  0.88113558]
Mean score: 0.887 (+/-0.003)


Try to improve by adjusting the alpha parameter on the MultinomialNB classifier:

In [26]:
clf_7 = Pipeline([
    ('vect', TfidfVectorizer(
                stop_words=stop_words,
                token_pattern=ur"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b",         
    )),
    ('clf', MultinomialNB(alpha=0.01)),
])

In [27]:
evaluate_cross_validation(clf_7, news.data, news.target, 5)

[ 0.9204244   0.91960732  0.91828071  0.92677103  0.91854603]
Mean score: 0.921 (+/-0.002)


In [29]:
from sklearn import metrics

def train_and_evaluate(clf, X_train, X_test, y_train, y_test):
    
    clf.fit(X_train, y_train)
    
    print "Accuracy on training set:"
    print clf.score(X_train, y_train)
    print "Accuracy on testing set:"
    print clf.score(X_test, y_test)
    
    y_pred = clf.predict(X_test)
    
    print "Classification Report:"
    print metrics.classification_report(y_test, y_pred)
    print "Confusion Matrix:"
    print metrics.confusion_matrix(y_test, y_pred)

In [30]:
train_and_evaluate(clf_7, X_train, X_test, y_train, y_test)

Accuracy on training set:
0.996957690675
Accuracy on testing set:
0.917869269949
Classification Report:
             precision    recall  f1-score   support

          0       0.95      0.88      0.91       216
          1       0.85      0.85      0.85       246
          2       0.91      0.84      0.87       274
          3       0.81      0.86      0.83       235
          4       0.88      0.90      0.89       231
          5       0.89      0.91      0.90       225
          6       0.88      0.80      0.84       248
          7       0.92      0.93      0.93       275
          8       0.96      0.98      0.97       226
          9       0.97      0.94      0.96       250
         10       0.97      1.00      0.98       257
         11       0.97      0.97      0.97       261
         12       0.90      0.91      0.91       216
         13       0.94      0.95      0.95       257
         14       0.94      0.97      0.95       246
         15       0.90      0.96      0.93     

In [31]:
clf_7.named_steps['vect'].get_feature_names()

[u'0-.66d8wt',
 u'0-04g55',
 u'0-100mph',
 u'0-13-117441-x--or',
 u'0-3mb',
 u'0-40mb',
 u'0-40volts',
 u'0-5mb',
 u'0-60mph',
 u'0-8.3mb',
 u'0-a00138',
 u'0-byte',
 u'0-defects',
 u'0-e8',
 u'0-for-4',
 u'0-hc',
 u'0-ii',
 u'0-uw',
 u'0-uw0',
 u'0-uw2',
 u'0-uwa',
 u'0-uwt',
 u'0-uwt7',
 u'0-uww',
 u'0-uww7',
 u'0.-w0',
 u'0..x-1',
 u'0.00...nice',
 u'0.02cents',
 u'0.0cb',
 u'0.1-ports',
 u'0.15mb',
 u'0.2d-_',
 u'0.5db',
 u'0.6-micron',
 u'0.65mb',
 u'0.97pl4',
 u'0.b34s_',
 u'0.c0rgo5kj7pp0',
 u'0.c4',
 u'0.jy',
 u'0.s_',
 u'0.tprv6ekj7r',
 u'0.tt',
 u'0.txa_',
 u'0.txc',
 u'0.vpp',
 u'0.vpsll2',
 u'00-index.txt',
 u'000-foot',
 u'000-kg',
 u'000-man',
 u'000-maxwell',
 u'000-strong',
 u'000000.active.spx',
 u'000062david42',
 u'000100255pixel',
 u'0005111312na1em',
 u'0005111312na3em',
 u'000hz',
 u'000iu',
 u'000mg',
 u'000mi',
 u'000miles',
 u'000puq9',
 u'000rpm',
 u'000th',
 u'000ug',
 u'000usd',
 u'0010580b.0b6r49',
 u'0010580b.vma7o9',
 u'0010580b.vmcbrt',
 u'001200201pixel

In [32]:
print len(clf_7.named_steps['vect'].get_feature_names())

145771
