In [1]:
from sklearn import datasets

In [2]:
# Load the training split, one sub-folder per category.
# Documents are decoded to text at load time so later cells can use str operations
# (e.g. .split("\n")) on Python 3, where load_files otherwise returns raw bytes.
# Undecodable bytes are dropped, matching the vectorizers' decode_error='ignore'.
twenty_train = datasets.load_files(
    "20news-bydate-train", encoding="utf-8", decode_error="ignore"
)

In [4]:
# Load the held-out test split, one sub-folder per category.
# Documents are decoded to text at load time (utf-8, undecodable bytes dropped),
# which is the same decoding the vectorizers apply with decode_error='ignore'.
twenty_test = datasets.load_files(
    "20news-bydate-test", encoding="utf-8", decode_error="ignore"
)

In [5]:
# Dataset sizes: category count, training documents, training file names, test documents.
(
    len(twenty_train.target_names),
    len(twenty_train.data),
    len(twenty_train.filenames),
    len(twenty_test.data),
)

(20, 11314, 11314, 7532)

In [6]:
# Show the first three lines of the first training document.
# Splitting at most three times is enough to isolate those lines.
print("\n".join(twenty_train.data[0].split("\n", 3)[:3]))

From: cubbie@garnet.berkeley.edu (                               )
Subject: Re: Cubs behind Marlins? How?
Article-I.D.: agate.1pt592$f9a


In [7]:
# Category name of the first training document: target holds integer indices
# into target_names.
print(twenty_train.target_names[twenty_train.target[0]])

rec.sport.baseball


In [8]:
# First ten entries of the training target array.
twenty_train.target[:10]

array([ 9,  4, 11,  4,  0,  4,  5,  5, 13, 12])

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
# Token-count vectorizer configured with the English stop-word list;
# decoding errors in the input are ignored rather than raised.
count_vect = CountVectorizer(
    decode_error='ignore',
    stop_words='english',
)

In [11]:
# Fit the vectorizer on the training documents and turn them into a count matrix.
X_train_counts=count_vect.fit_transform(twenty_train.data)

In [12]:
# Dimensions of the training count matrix.
X_train_counts.shape

(11314, 129783)

In [13]:
from sklearn.feature_extraction.text import TfidfTransformer

In [25]:
# Fit a TF-IDF transformer on the training counts, then re-weight those same counts.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
X_train_tfidf.shape

(11314, 129783)

In [17]:
from sklearn.naive_bayes import MultinomialNB

In [19]:
# Train a multinomial naive Bayes classifier on the TF-IDF features.
# X_train_tfidf is the matrix produced by the TF-IDF cell above; the name used
# previously (X_train_tf) is not defined anywhere in the visible notebook, so the
# cell only worked through stale kernel state.
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

In [20]:
# Two short, unseen documents used to sanity-check the classifier.
doc_news = [
    'God is love',
    'OpenGL on the GPU is fast',
]

In [21]:
# Count tokens in the new documents with the already-fitted vectorizer
# (transform only, no re-fitting).
X_new_counts=count_vect.transform(doc_news)

In [23]:
# Reuse the transformer that was fitted on the training counts instead of creating a
# new one: a fresh TfidfTransformer() has no learned IDF weights, so transforming the
# new documents with it fails when the notebook is run top to bottom.
assert getattr(tfidf_transformer, "idf_", None) is not None, (
    "tfidf_transformer must be fitted on the training counts before it is reused"
)

In [26]:
# Apply the TF-IDF weighting to the new documents' counts (transform only).
X_new_tfidf=tfidf_transformer.transform(X_new_counts)

In [28]:
# Predict a category index for each new document.
predicted=clf.predict(X_new_tfidf)

In [32]:
# Show each new document next to the name of its predicted category.
# The % formatting is applied to the string inside print(...); the earlier form
# applied % to the result of print(...), which raises a TypeError on Python 3.
for doc, category in zip(doc_news, predicted):
    print("%r => %s" % (doc, twenty_train.target_names[category]))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics


In [33]:
from sklearn.pipeline import Pipeline

In [34]:
# Chain the three stages into one estimator:
# token counts -> TF-IDF weighting -> multinomial naive Bayes.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer(decode_error='ignore', stop_words="english")),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [35]:
# Fit every pipeline stage on the raw training documents and their labels.
text_clf=text_clf.fit(twenty_train.data,twenty_train.target)

In [36]:
import numpy as np

In [37]:
# Raw test documents, used as prediction input by the cells below.
docs_test=twenty_test.data

In [38]:
# Predict a category index for every test document with the naive Bayes pipeline.
predicted=text_clf.predict(docs_test)

In [39]:
# Accuracy: the fraction of test documents whose predicted label equals the true one.
(predicted == twenty_test.target).mean()

0.81691449814126393

In [40]:
from sklearn.linear_model import SGDClassifier

In [41]:
# Same feature stages as the naive Bayes pipeline, but with an SGD-trained linear
# classifier (hinge loss, L2 penalty) as the final step. random_state is fixed so
# repeated runs give the same model.
# NOTE(review): newer scikit-learn releases appear to have replaced `n_iter` with
# `max_iter` -- confirm against the installed version before upgrading.
text_clf_2 = Pipeline(steps=[
    ('vect', CountVectorizer(decode_error='ignore', stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(
        alpha=1e-3,
        loss='hinge',
        n_iter=5,
        penalty='l2',
        random_state=42,
    )),
])

In [42]:
# Fit the SGD pipeline on the training split, predict the test documents and
# display the resulting accuracy. The fit result is bound to `_` so the estimator
# repr is not echoed.
_ = text_clf_2.fit(twenty_train.data, twenty_train.target)
predicted = text_clf_2.predict(docs_test)

(predicted == twenty_test.target).mean()

0.82355284121083383

In [43]:
from sklearn import metrics

In [44]:
# Per-category precision, recall, F1 and support for the latest predictions.
print(
    metrics.classification_report(
        twenty_test.target,
        predicted,
        target_names=twenty_test.target_names,
    )
)

                          precision    recall  f1-score   support

             alt.atheism       0.71      0.71      0.71       319
           comp.graphics       0.81      0.69      0.74       389
 comp.os.ms-windows.misc       0.72      0.79      0.75       394
comp.sys.ibm.pc.hardware       0.73      0.66      0.69       392
   comp.sys.mac.hardware       0.82      0.83      0.82       385
          comp.windows.x       0.86      0.77      0.81       395
            misc.forsale       0.80      0.87      0.84       390
               rec.autos       0.91      0.90      0.90       396
         rec.motorcycles       0.93      0.97      0.95       398
      rec.sport.baseball       0.88      0.91      0.90       397
        rec.sport.hockey       0.87      0.98      0.92       399
               sci.crypt       0.85      0.96      0.90       396
         sci.electronics       0.80      0.62      0.70       393
                 sci.med       0.90      0.87      0.88       396
         

In [45]:
# Confusion matrix of true test labels (first argument) against predicted labels.
metrics.confusion_matrix(twenty_test.target,predicted)

array([[227,   1,   0,   1,   0,   0,   2,   0,   2,   3,   0,   2,   1,
          9,   6,  46,   2,   5,   1,  11],
       [  2, 269,  22,   8,   9,  26,   4,   1,   4,   9,   3,   8,   5,
          1,   9,   2,   2,   3,   0,   2],
       [  0,   8, 311,  22,  11,  10,   2,   1,   1,   5,   3,   7,   2,
          1,   6,   1,   0,   1,   0,   2],
       [  3,   8,  32, 260,  19,   3,  19,   3,   3,   3,   2,   3,  22,
          1,   5,   0,   2,   2,   1,   1],
       [  1,   4,   8,  21, 319,   1,  10,   0,   1,   4,   1,   1,   6,
          1,   1,   0,   2,   1,   3,   0],
       [  1,  26,  44,   0,   3, 304,   3,   0,   0,   1,   1,   2,   1,
          1,   6,   1,   1,   0,   0,   0],
       [  0,   2,   0,  17,   6,   0, 340,   9,   1,   2,   3,   1,   3,
          2,   3,   0,   1,   0,   0,   0],
       [  1,   1,   1,   2,   1,   0,  10, 356,   6,   1,   0,   0,   8,
          1,   3,   0,   4,   0,   1,   0],
       [  0,   0,   0,   1,   0,   0,   4,   5, 385,   1,   0,  

In [46]:
# GridSearchCV is provided by sklearn.model_selection in current scikit-learn;
# fall back to the legacy sklearn.grid_search module on old installations that
# do not have model_selection yet.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

In [47]:
# Hyper-parameter grid for the search; keys follow the <pipeline step>__<parameter>
# naming so each value list is routed to the matching pipeline stage.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}

In [48]:
# Grid search over the SGD pipeline using the parameter grid above.
# n_jobs = -1 asks for the candidates to be evaluated in parallel on all available cores.
gs_clf = GridSearchCV(text_clf_2,parameters,n_jobs = -1)

In [49]:
# Run the search on the full training split (fits the pipeline once per
# parameter combination and cross-validation fold, so this cell is slow).
gs_clf = gs_clf.fit(twenty_train.data,twenty_train.target)

In [50]:
# Report the winning hyper-parameter combination and keep its mean
# cross-validated score in `score` for the next cell.
# best_params_ / best_score_ are the search object's own record of the best
# candidate; they replace the manual max() over the legacy grid_scores_ attribute.
best_parameters = gs_clf.best_params_
score = gs_clf.best_score_
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, best_parameters[param_name]))

clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 2)


In [51]:
# Score of the best parameter combination found by the grid search.
score

0.89879794944316771