In [1]:
from sklearn.datasets import fetch_20newsgroups
# The four newsgroups this notebook restricts itself to.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]
# Load the training split for the selected categories only.
# shuffle=True with a fixed random_state keeps the document order
# repeatable from run to run.
twenty_train = fetch_20newsgroups(
    subset='train', categories=categories, shuffle=True, 
    random_state=0)

In [2]:
# Category names as stored on the loaded dataset. The output lists them
# alphabetically, not in the order they were given in `categories`.
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [3]:
# Number of training documents loaded for the selected categories.
len(twenty_train.data)

2257

In [4]:
# Echo the entire loaded object.
# NOTE(review): this writes every raw post into the cell output, which
# bloats the notebook; a summary such as the object's keys or a single
# document would be easier to read.
twenty_train

{'data': ['From: reedr@cgsvax.claremont.edu\nSubject: Re: DID HE REALLY RISE???\nOrganization: The Claremont Graduate School\nLines: 29\n\nIn article <Apr.9.01.11.16.1993.16937@athos.rutgers.edu>, emery@tc.fluke.COM (John Emery) writes:\n> The one single historic event that has had the biggest impact on the\n> world over the centuries is the resurrection of Jesus.  At the same\n> time, it is one of the most hotly contested topics....\n> \n> Did Jesus Christ really rise from the dead?  Since the eyewitnesses\n> are no longer living, we have only their written accounts. ...\n> ...  Because of the magnitude of significance\n> involved here, either the resurrection is the greatest event in the\n> history of man or the greatest deception played on man.\n> [massive amounts of data deleted]\n\nJohn, \n\nWhile I will not take the time to rebut you point by point, I will suggest\nthree current works which I think will be helpful in your quest to answer\nthis question.  John Dominic Crossan (Pro

In [5]:
# Show only the first three lines (the header fields) of the first post.
print(*twenty_train.data[0].split("\n")[:3], sep="\n")

From: reedr@cgsvax.claremont.edu
Subject: Re: DID HE REALLY RISE???
Organization: The Claremont Graduate School


In [6]:
# `target` holds integer indices into `target_names`; resolve the label
# of the first document to its readable category name.
twenty_train.target_names[twenty_train.target[0]]

'soc.religion.christian'

In [7]:
# Integer labels of the first ten training documents.
twenty_train.target[:10]

array([3, 3, 2, 3, 0, 1, 0, 1, 2, 3], dtype=int64)

In [8]:
# Print the readable category name for each of the first ten documents.
for label_idx in twenty_train.target[:10]:
    print(twenty_train.target_names[label_idx])

soc.religion.christian
soc.religion.christian
sci.med
soc.religion.christian
alt.atheism
comp.graphics
alt.atheism
comp.graphics
sci.med
soc.religion.christian


In [9]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from the training texts and build the
# document-term count matrix in a single call.
count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(twenty_train.data)
# Displayed as (number of documents, number of vocabulary terms).
x_train_counts.shape

(2257, 35788)

In [10]:
# Column index assigned to the token 'morning' in the fitted vocabulary
# (None would be shown if the token had not been seen).
count_vect.vocabulary_.get('morning')

22230

In [11]:
from sklearn.feature_extraction.text import TfidfTransformer
# Re-weight the raw term counts as tf-idf values, fitting and
# transforming in one call.
tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)
# Same shape as the count matrix: only the cell values change.
x_train_tfidf.shape

(2257, 35788)

In [12]:
# Full integer label vector for the training documents.
twenty_train.target

array([3, 3, 2, ..., 1, 3, 3], dtype=int64)

In [13]:
from sklearn.naive_bayes import MultinomialNB
# Train a multinomial naive Bayes classifier on the tf-idf features.
clf = MultinomialNB().fit(x_train_tfidf, twenty_train.target)

In [14]:
# Two unseen documents to classify.
docs_new = ['God is love', 'OpenGL on the GPU is fast']
# Use transform (not fit_transform) on the already-fitted vectorizer and
# tf-idf transformer so the new documents are encoded with the training
# vocabulary and weights.
x_new_counts = count_vect.transform(docs_new)
x_new_tfidf = tfidf_transformer.transform(x_new_counts)

In [15]:
# Predicted category indices for the new documents.
predicted = clf.predict(x_new_tfidf)

In [16]:
# Pair each new document with the readable name of its predicted category.
for doc, category in zip(docs_new, predicted):
    category_name = twenty_train.target_names[category]
    print('%r => %s' % (doc, category_name))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics


In [17]:
from sklearn.pipeline import Pipeline
# Chain vectorizer -> tf-idf -> classifier so that raw text can be fed
# straight into fit/predict.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [18]:
# Fit every pipeline step on the raw training texts and their labels.
text_clf.fit(twenty_train.data, twenty_train.target)

In [19]:
import numpy as np
# Load the held-out test split for the same categories.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=0,
)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)
# Accuracy in percent: the share of test documents labelled correctly.
(predicted == twenty_test.target).mean() * 100

83.48868175765645

In [97]:
from sklearn.linear_model import SGDClassifier
# Rebind `text_clf` to a pipeline with the same two feature steps but an
# SGD-trained classifier (hinge loss, L2 penalty) as the final step.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        random_state=0,
        max_iter=5,
        tol=None,
    )),
])


In [98]:
# Fit the pipeline currently bound to `text_clf` on the raw training
# texts and their labels.
text_clf.fit(twenty_train.data, twenty_train.target)

In [99]:
# Re-score the test documents with the refitted pipeline.
predicted = text_clf.predict(docs_test)
# Accuracy in percent.
(predicted == twenty_test.target).mean() * 100

91.27829560585884

In [100]:
# Category names on the test split; the output matches the training
# split's names, in the same order.
twenty_test.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [101]:
from sklearn import metrics
# Per-category precision, recall, F1 and support on the test split.
print(metrics.classification_report(
    y_true=twenty_test.target,
    y_pred=predicted,
    target_names=twenty_test.target_names,
))

                        precision    recall  f1-score   support

           alt.atheism       0.95      0.82      0.88       319
         comp.graphics       0.87      0.98      0.92       389
               sci.med       0.96      0.88      0.92       396
soc.religion.christian       0.90      0.95      0.93       398

              accuracy                           0.91      1502
             macro avg       0.92      0.91      0.91      1502
          weighted avg       0.92      0.91      0.91      1502



In [102]:
# Confusion matrix for the test split. In the output each row sums to
# that class's support (e.g. 261 + 10 + 12 + 36 = 319 for the first
# class), so rows are true classes and columns are predicted classes.
metrics.confusion_matrix(twenty_test.target, predicted)

array([[261,  10,  12,  36],
       [  4, 380,   2,   3],
       [  6,  36, 350,   4],
       [  5,  11,   2, 380]], dtype=int64)

In [136]:
from sklearn.model_selection import GridSearchCV
# Search grid, keyed as `<pipeline step>__<parameter>`:
# unigrams vs. unigrams + bigrams, idf weighting on/off, and two
# regularisation strengths for the classifier.
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2)],
    tfidf__use_idf=(True, False),
    clf__alpha=(1e-2, 1e-3),
)

In [137]:
# Exhaustive search over the `parameters` grid for the current
# `text_clf` pipeline, using 5-fold cross-validation; n_jobs=-1 asks
# for all available CPU cores.
gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)

In [138]:
# Run the search on the first 400 training documents only
# (presumably to keep the search quick; the full training set is larger).
gs_clf = gs_clf.fit(twenty_train.data[:400], 
                    twenty_train.target[:400])

In [139]:
# The fitted search object can predict directly; map the predicted
# index for a single sentence back to its category name.
twenty_train.target_names[gs_clf.predict(['God is love'])[0]]

'soc.religion.christian'

In [140]:
# Best score found by the search (per scikit-learn, the mean
# cross-validated score of the best parameter combination).
gs_clf.best_score_

0.9199999999999999

In [141]:
# Names of the parameters that were searched over.
parameters.keys()

dict_keys(['vect__ngram_range', 'tfidf__use_idf', 'clf__alpha'])

In [143]:
# Report the winning value for each searched parameter, in name order.
best_params = gs_clf.best_params_
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, best_params[param_name]))

clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 2)
