In [23]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
# Restrict the corpus to four of the twenty newsgroups.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Fetch the shuffled training split for the selected groups.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
)

# Label names; the integer values in `twenty_train.target` index into this list.
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [24]:
# Print the first three lines of the first training document.
# print() is called as a function with a single argument so the cell runs
# unchanged on both Python 2 and Python 3.
print("\n".join(twenty_train.data[0].split("\n")[:3]))

# And print the category this document belongs to: the integer target of the
# first document is used as an index into the list of label names.
print('\nCategory: {}'.format(twenty_train.target_names[twenty_train.target[0]]))

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton

Category: comp.graphics


In [28]:
# Extract bag-of-words features: learn the vocabulary from the training
# documents and count how often each token occurs in each document.
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(
    twenty_train.data,
)

# Shape of the count matrix: (number of documents, vocabulary size).
X_train_counts.shape

(2257, 35788)

In [41]:
# The raw token counts live in a sparse matrix, but longer documents get larger
# counts for the same words. Convert the counts to term frequencies so document
# length matters less. TfidfTransformer also (by default, use_idf=True) scales
# each term by its inverse document frequency, which reduces the weight of
# words that appear in many documents, such as "the" or "a".
from sklearn.feature_extraction.text import TfidfTransformer

tf_idf_transformer = TfidfTransformer()
# Learn the IDF weights from the training counts, then apply them.
tf_idf_transformer.fit(X_train_counts)
X_train_tfidf = tf_idf_transformer.transform(X_train_counts)


In [42]:
# Train a good (simple) baseline classifier for text: multinomial naive Bayes
# fitted on the TF-IDF features and the integer newsgroup labels.
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB().fit(
    X_train_tfidf,
    twenty_train.target,
)

In [46]:
# Classify two unseen snippets. The already-fitted vectorizer and TF-IDF
# transformer are only *applied* here (transform, not fit_transform), so the
# new documents are encoded with the vocabulary and weights learned from the
# training data that the classifier was fitted on.
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tf_idf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

# Map each predicted integer label back to its newsgroup name.
# print() is called as a function with a single argument so the cell runs
# unchanged on both Python 2 and Python 3.
for doc, category in zip(docs_new, predicted):
    print('{} => {}'.format(doc, twenty_train.target_names[category]))

God is love => soc.religion.christian
OpenGL on the GPU is fast => comp.graphics


In [47]:
# Chain the three steps used above (count vectorizer -> TF-IDF -> naive Bayes)
# into a single estimator so raw text can be passed straight to fit/predict.
from sklearn.pipeline import Pipeline

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [50]:
# Fit the whole naive Bayes pipeline on the raw training documents and labels.
text_clf_instance = text_clf.fit(
    twenty_train.data,
    twenty_train.target,
)

In [63]:
# Evaluate the naive Bayes pipeline on the test split.
import numpy as np

twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
)
docs_test = twenty_test.data

# NOTE: this rebinds `predicted`, which earlier held the labels for `docs_new`.
predicted = text_clf_instance.predict(docs_test)

# Accuracy: fraction of test documents whose predicted label equals the true one.
is_correct = predicted == twenty_test.target
np.mean(is_correct)

0.83488681757656458

In [64]:
# Now try a linear support vector machine (hinge loss, trained with stochastic
# gradient descent). SVMs are highly regarded as text classifiers but take
# more computing time.
from sklearn.linear_model import SGDClassifier

# NOTE(review): `n_iter` is deprecated from scikit-learn 0.19 and removed in
# 0.21; on newer releases the equivalent is `max_iter=5, tol=None`. Confirm the
# installed version before changing it.
svm_step = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    n_iter=5,
    random_state=42,
)
text_clf2 = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', svm_step),
])

In [65]:
# Fit the SGD (linear SVM) pipeline on the raw training documents and labels.
fitted_model = text_clf2.fit(
    twenty_train.data,
    twenty_train.target,
)

In [67]:
# Accuracy of the SGD pipeline on the same test documents.
# These predictions are stored in `prediction` (singular); `predicted` still
# refers to the naive Bayes predictions.
prediction = fitted_model.predict(docs_test)
np.mean(
    prediction == twenty_test.target
)

0.9127829560585885

In [69]:
from sklearn import metrics

# Per-class precision / recall / F1 on the test split.
# NOTE: `predicted` holds the naive Bayes pipeline's test predictions (from
# `text_clf_instance`), not the SGD pipeline's `prediction`, so this report
# describes the naive Bayes model.
report = metrics.classification_report(
    twenty_test.target,
    predicted,
    target_names=twenty_test.target_names,
)
print(report)

                        precision    recall  f1-score   support

           alt.atheism       0.97      0.60      0.74       319
         comp.graphics       0.96      0.89      0.92       389
               sci.med       0.97      0.81      0.88       396
soc.religion.christian       0.65      0.99      0.78       398

           avg / total       0.88      0.83      0.84      1502



In [70]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
# NOTE: `predicted` is the naive Bayes pipeline's output, not the SGD
# pipeline's `prediction`.
metrics.confusion_matrix(
    y_true=twenty_test.target,
    y_pred=predicted,
)


array([[192,   2,   6, 119],
       [  2, 347,   4,  36],
       [  2,  11, 322,  61],
       [  2,   2,   1, 393]])

In [72]:
from sklearn.model_selection import GridSearchCV
# Grid of candidate settings, keyed as "<pipeline step>__<parameter name>".
parameters = {
    # Unigrams only, or unigrams plus bigrams.
    'vect__ngram_range': [(1, 1), (1, 2)],
    # With or without inverse-document-frequency weighting.
    'tfidf__use_idf': (True, False),
    # Regularization strength of the classifier step.
    'clf__alpha': (1e-2, 1e-3),
}

In [73]:
# Exhaustive search over `parameters` for the SGD pipeline.
# n_jobs=-1 asks scikit-learn to use all available processors.
gs_clf = GridSearchCV(
    estimator=text_clf2,
    param_grid=parameters,
    n_jobs=-1,
)

In [74]:
# Fit the grid search on a subset of the data: the first 400 training
# documents and their labels.
n_subset = 400
gs_clf = gs_clf.fit(
    twenty_train.data[:n_subset],
    twenty_train.target[:n_subset],
)

In [76]:
# Classify a single phrase with the fitted grid search and map the predicted
# integer label back to its newsgroup name.
predicted_label = gs_clf.predict(['God is love'])[0]
twenty_train.target_names[predicted_label]

'soc.religion.christian'