Blog site: https://medium.com/@siyao_sui/nlp-with-the-20-newsgroups-dataset-ab35cd0ea902

In [27]:
import nltk
import numpy as np
from sklearn import metrics
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import get_scorer
from sklearn.metrics import log_loss
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

try:
    # Private module: deprecated in scikit-learn 0.22 and removed in 0.24.
    from sklearn.metrics.scorer import accuracy_scorer
except ImportError:
    # Keep the legacy name defined on newer releases via the public registry.
    accuracy_scorer = get_scorer('accuracy')

In [2]:
# Load the predefined train and test splits of the 20 Newsgroups corpus
# (train first, then test). The archive is downloaded on first use.
train, test = (
    fetch_20newsgroups(subset=split, shuffle=True)
    for split in ('train', 'test')
)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [3]:
# Display the newsgroup category names; these are the class labels to predict.
train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [4]:
# Show (at most) the first 100 lines of the first training post, headers included.
print(*train.data[0].split("\n")[:100], sep="\n")

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----







In [5]:
# Learn the vocabulary from the training posts and build the sparse
# document-term count matrix in one pass.
count_vec = CountVectorizer()
X_train_bow = count_vec.fit_transform(train.data)
# Last expression is displayed: (number of documents, vocabulary size).
X_train_bow.shape

(11314, 130107)

In [6]:
# Re-weight the raw term counts with TF-IDF. Only the values change, so the
# displayed shape matches the bag-of-words matrix.
tfidf = TfidfTransformer()
X_train_tfidf = tfidf.fit_transform(X_train_bow)
X_train_tfidf.shape

(11314, 130107)

## Naive Bayes

In [7]:
# Fit a standalone multinomial Naive Bayes model on the TF-IDF features.
# NOTE(review): `clf` is not referenced again in the visible cells; the
# pipeline defined next retrains from raw text. Confirm whether this cell is
# still needed.
clf = MultinomialNB().fit(X_train_tfidf, train.target)

In [8]:
# End-to-end Naive Bayes text classifier: raw text -> token counts (English
# stop words dropped) -> TF-IDF weights -> multinomial Naive Bayes.
NB = Pipeline(steps=[
    ('vec', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [9]:
# Fit the vectorizer, the TF-IDF weights and the classifier in one call on the
# raw training text.
NB = NB.fit(train.data, train.target)

In [10]:
# Predict a class index for every held-out test post.
pred = NB.predict(test.data)

In [11]:
# Accuracy = fraction of test posts whose predicted label equals the true one.
is_correct = pred == test.target
accuracy = is_correct.mean()
print('Test accuracy:', accuracy)

Test accuracy: 0.8169144981412639


In [15]:
# Per-class precision / recall / F1 for the Naive Bayes predictions.
nb_report = metrics.classification_report(
    test.target,
    pred,
    target_names=test.target_names,
)
print(nb_report)

                          precision    recall  f1-score   support

             alt.atheism       0.80      0.69      0.74       319
           comp.graphics       0.78      0.72      0.75       389
 comp.os.ms-windows.misc       0.79      0.72      0.75       394
comp.sys.ibm.pc.hardware       0.68      0.81      0.74       392
   comp.sys.mac.hardware       0.86      0.81      0.84       385
          comp.windows.x       0.87      0.78      0.82       395
            misc.forsale       0.87      0.80      0.83       390
               rec.autos       0.88      0.91      0.90       396
         rec.motorcycles       0.93      0.96      0.95       398
      rec.sport.baseball       0.91      0.92      0.92       397
        rec.sport.hockey       0.88      0.98      0.93       399
               sci.crypt       0.75      0.96      0.84       396
         sci.electronics       0.84      0.65      0.74       393
                 sci.med       0.92      0.79      0.85       396
         

In [16]:
# Class-membership probabilities for the test posts, scored with log loss
# (unlike accuracy, this penalises confident wrong predictions).
prob = NB.predict_proba(test.data)
# print() inserts its own separator after the label's trailing space, hence
# the double space in the displayed line.
print('Log loss: ', log_loss(test.target, prob))

Log loss:  1.2144601393513912


### Grid Search 

In [34]:
# Hyper-parameter grid for the Naive Bayes pipeline. Keys follow the
# `<step name>__<parameter>` convention used by Pipeline.
parameters = {
    # TF-IDF weighting choices
    'tfidf__use_idf': (True, False),
    'tfidf__sublinear_tf': (True, False),
    # Count tokens, or only record their presence
    'vec__binary': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    # Additive smoothing strength of the classifier
    'clf__alpha': (1, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5),
}

# Fresh, unfitted copy of the Naive Bayes pipeline for the search to tune.
NB_pipe = Pipeline(steps=[
    ('vec', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

def my_accuracy_scorer(*args, **kwargs):
    """Score an estimator with accuracy and echo the value as a progress trace.

    Thin wrapper around scikit-learn's built-in accuracy scorer, meant to be
    passed as ``scoring=`` to ``GridSearchCV``.

    Parameters
    ----------
    *args, **kwargs
        Forwarded unchanged to the accuracy scorer. Scorers are called as
        ``scorer(estimator, X, y)``.

    Returns
    -------
    float
        The accuracy computed by the built-in scorer.
    """
    # Resolve the scorer through the public registry instead of the private
    # `sklearn.metrics.scorer` module, which newer scikit-learn no longer ships.
    score = get_scorer('accuracy')(*args, **kwargs)
    # One line per scoring call; this is the log shown under the fit cells.
    print('score is {}'.format(score))
    return score

# Exhaustive search over `parameters`; n_jobs=-1 requests all available CPU cores.
# NOTE(review): no `cv` is given, so the fold count is whatever this
# scikit-learn version defaults to. Pin it if scores must be comparable
# across environments.
gs_NB = GridSearchCV(NB_pipe, parameters, n_jobs = -1, scoring = my_accuracy_scorer)

In [35]:
# Run the grid search on the training split. my_accuracy_scorer prints one
# "score is ..." line per scoring call, which produces the long log below.
gs_NB = gs_NB.fit(train.data, train.target)

score is 0.6778807947019867
score is 0.689901934799894
score is 0.6842804036112586
score is 0.7030463576158941
score is 0.7585886722376973
score is 0.7663439862087257
score is 0.7623211446740858
score is 0.7770261307865765
score is 0.7113702623906706
score is 0.7110993096123208
score is 0.6180132450331126
score is 0.6315928968990193
score is 0.7861026389073067
score is 0.6934606711765486
score is 0.6983158732263626
score is 0.7800741918388977
score is 0.6237387148167818
score is 0.6550716941051513
score is 0.6562417174662073
score is 0.6434437086092715
score is 0.6926338102808691
score is 0.7168786433492316
score is 0.7186049595544357
score is 0.7142857142857143
score is 0.6778807947019867
score is 0.689901934799894
score is 0.6842804036112586
score is 0.7173509933774834
score is 0.7585886722376973
score is 0.7663439862087257
score is 0.7623211446740858
score is 0.7930760047751692
score is 0.7214418234826399
score is 0.7211895910780669
score is 0.6180132450331126
score is 0.63159289689

score is 0.9977450590263961
score is 0.8963689371852637
score is 0.897238449283059
score is 0.8924503311258278
score is 0.891333156639279
score is 0.9978782654820315
score is 0.9969528351881293
score is 0.9940326216682137
score is 0.992969889905823
score is 0.8921933085501859
score is 0.8948344370860927
score is 0.89345348529022
score is 0.8953797132235793
score is 0.9925808161102279
score is 0.9916434540389972
score is 0.9937674048534677
score is 0.9919183889772125
score is 0.8980132450331125
score is 0.8950437317784257
score is 0.8993627190653213
score is 0.8972185430463576
score is 0.9984082769598089
score is 0.9984086991115236
score is 0.9977477477477478
score is 0.9976124154397135
score is 0.8939835674529553
score is 0.8943175783324482
score is 0.8924503311258278
score is 0.891333156639279
score is 0.9976130486672855
score is 0.9968203497615262
score is 0.992969889905823
score is 0.9940326216682137
score is 0.8921933085501859
score is 0.893774834437086
score is 0.8958388550225285


In [36]:
# Best cross-validated score found by the search, then the parameter
# combination that achieved it.
print(gs_NB.best_score_)
print(gs_NB.best_params_)

0.9065759236344352
{'clf__alpha': 0.01, 'tfidf__norm': 'l2', 'tfidf__sublinear_tf': True, 'tfidf__use_idf': True, 'vec__binary': True}


In [46]:
# Evaluate the tuned Naive Bayes pipeline on the held-out test split.
tuned_nb = gs_NB.best_estimator_
pred = tuned_nb.predict(test.data)
accuracy = (pred == test.target).mean()
print('Test accuracy:', accuracy)

Test accuracy: 0.832979288369623


## Support Vector Machine

In [19]:
# Linear SVM pipeline: hinge loss with an L2 penalty, optimised by SGD for a
# fixed 5 passes over the data (tol=None disables early stopping).
svm_classifier = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=42,
    max_iter=5,
    tol=None,
)
SVM = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', svm_classifier),
])

# Train on the raw training posts.
svm = SVM.fit(train.data, train.target)

# Score on the held-out test split; `pred` is reused by the report cell below.
pred = svm.predict(test.data)
accuracy = (pred == test.target).mean()
print('Test accuracy:', accuracy)

Test accuracy: 0.8224907063197026


In [23]:
# Per-class precision / recall / F1. `pred` now holds the SVM pipeline's test
# predictions from the previous cell, not the Naive Bayes ones.
print(metrics.classification_report(test.target, pred,target_names=test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.72      0.71      0.71       319
           comp.graphics       0.79      0.70      0.74       389
 comp.os.ms-windows.misc       0.73      0.77      0.75       394
comp.sys.ibm.pc.hardware       0.71      0.68      0.69       392
   comp.sys.mac.hardware       0.82      0.82      0.82       385
          comp.windows.x       0.84      0.77      0.80       395
            misc.forsale       0.82      0.87      0.85       390
               rec.autos       0.91      0.89      0.90       396
         rec.motorcycles       0.92      0.97      0.94       398
      rec.sport.baseball       0.90      0.91      0.90       397
        rec.sport.hockey       0.86      0.98      0.92       399
               sci.crypt       0.85      0.96      0.90       396
         sci.electronics       0.81      0.62      0.70       393
                 sci.med       0.90      0.87      0.88       396
         

### Grid Search

In [43]:
# Hyper-parameter grid for the SVM pipeline (rebinds `parameters` from the
# Naive Bayes section).
parameters = {
    # Vectorizer: document-frequency cut-off; unigrams vs. unigrams + bigrams
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__ngram_range': ((1, 1), (1, 2)),
    # Classifier: regularisation strength and penalty type
    'clf__alpha': (1e-5, 1e-6),
    'clf__penalty': ('l2', 'elasticnet'),
}

# Build the search and fit it in one expression; the scorer prints one
# "score is ..." line per scoring call.
gs_svm = GridSearchCV(
    SVM,
    parameters,
    n_jobs=-1,
    scoring=my_accuracy_scorer,
).fit(train.data, train.target)


score is 0.9051152928703949
score is 0.9011920529801325
score is 0.9022835900159321
score is 0.9994695663705079
score is 0.9996020692399522
score is 0.9994700582935877
score is 0.9003973509933775
score is 0.9136423841059602
score is 0.9994694256532697
score is 0.9996020692399522
score is 0.9056453750331301
score is 0.9175722236946727
score is 0.9142326075411578
score is 0.9994695663705079
score is 0.9054699946893255
score is 0.9993369579631348
score is 0.9992050874403816
score is 0.9994700582935877
score is 0.9001324503311259
score is 0.9994694256532697
score is 0.9125827814569536
score is 0.9075006626027035
score is 0.9996020692399522
score is 0.9146567717996289
score is 0.9184811471056824
score is 0.9993369579631348
score is 0.9065321295804567
score is 0.9993369579631348
score is 0.9994700582935877
score is 0.9993375728669847
score is 0.898543046357616
score is 0.9133774834437086
score is 0.9996020692399522
score is 0.9114762788232176
score is 0.9123738714816781
score is 0.9996020692

In [44]:
# Best cross-validated score found by the SVM search, then the parameter
# combination that achieved it.
print(gs_svm.best_score_)
print(gs_svm.best_params_)

0.915591302810677
{'clf__alpha': 1e-05, 'clf__penalty': 'elasticnet', 'vect__max_df': 0.5, 'vect__ngram_range': (1, 2)}


In [47]:
# Evaluate the tuned SVM pipeline on the held-out test split.
tuned_svm = gs_svm.best_estimator_
pred = tuned_svm.predict(test.data)
accuracy = (pred == test.target).mean()
print('Test accuracy:', accuracy)

Test accuracy: 0.8534253850238981
