In [2]:
# Newsgroup categories this notebook restricts the dataset to.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
    'rec.motorcycles',
]

In [3]:
from sklearn.datasets import fetch_20newsgroups

# Load the training split, limited to `categories`. Shuffling is seeded so the
# document order is the same on every run.
twenty_train = fetch_20newsgroups(
    data_home='~/AnacondaProjects/ScikitTest/Data/Train',
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [21]:
# Number of training labels (one per training document).
len(twenty_train.target)

2855

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
# Fit the vectorizer on the training documents and transform them in one step.
# `count_vect` is kept because later cells reuse it to transform new text.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
# Cell result: shape of the resulting document-by-feature matrix.
X_train_counts.shape

(2855, 40829)

In [7]:
from sklearn.feature_extraction.text import TfidfTransformer
# Fit the tf-idf transformer on the count matrix and re-weight it in one step.
# `tfidf_transformer` is kept because later cells reuse it on new text.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# Cell result: the shape of the re-weighted matrix.
X_train_tfidf.shape

(2855, 40829)

In [9]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial naive Bayes classifier on the tf-idf features.
clf = MultinomialNB()
clf = clf.fit(X_train_tfidf, twenty_train.target)

In [11]:
docs_new = [
    'God is love',
    'OpenGL on the GPU is fast',
    'Wear helmet for safety',
]

# New text goes through the already-fitted vectorizer and tf-idf transformer
# (transform only, no re-fitting) before it reaches the classifier.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

# Map each predicted class index back to its newsgroup name.
for text, label_idx in zip(docs_new, predicted):
    label_name = twenty_train.target_names[label_idx]
    print('%r => %s' % (text, label_name))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics
'Wear helmet for safety' => rec.motorcycles


In [12]:
from sklearn.pipeline import Pipeline

# Chain vectorizing, tf-idf weighting and the classifier into a single
# estimator so that raw text can be passed to fit/predict directly.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [1]:
# Requires `text_clf` from the pipeline-definition cell; running this cell on
# a fresh kernel before that one raises NameError.
text_clf.fit(twenty_train.data, twenty_train.target)

NameError: name 'text_clf' is not defined

In [14]:
docs_new = ['God is love', 'OpenGL on the GPU is fast']

# The pipeline vectorizes and tf-idf-weights raw text itself, so the documents
# are passed straight to predict(); no manual transform step is needed.
predicted = text_clf.predict(docs_new)

# Map each predicted class index back to its newsgroup name.
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, twenty_train.target_names[category]))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics


In [33]:
import numpy as np

# NOTE(review): this cell uses a different data_home than the training cell
# (.../Data/Test vs .../Data/Train) — confirm the separate directory is intended.
twenty_test = fetch_20newsgroups(
    data_home='~/AnacondaProjects/ScikitTest/Data/Test',
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)

# Accuracy: fraction of test documents whose predicted label equals the true one.
np.mean(predicted == twenty_test.target)

0.9221052631578948

In [45]:
from sklearn.linear_model import SGDClassifier

# Same preprocessing steps as before, but the final step is a linear
# classifier trained with SGD (hinge loss, L2 penalty). Rebinding `text_clf`
# replaces the previously defined pipeline.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        random_state=42,
        max_iter=100,
        tol=None,
    )),
])
text_clf.fit(twenty_train.data, twenty_train.target)

# Compare accuracy on the data the model was fitted on with accuracy on the
# held-out test split.
predictTrain = text_clf.predict(twenty_train.data)
predictTest = text_clf.predict(twenty_test.data)
print("train accuracy:", np.mean(predictTrain == twenty_train.target))
print("test accuracy:", np.mean(predictTest == twenty_test.target))

train accuracy: 0.9957968476357268
test accuracy: 0.9221052631578948


In [21]:
docs_new = [
    'God is love',
    'OpenGL on the GPU is fast',
    "Drugs are bad",
    "I love Honda!",
    "bikes are fun",
]
predicted_test_sample = text_clf.predict(docs_new)

# predict_proba was tried here and left disabled:
#pred_proba = text_clf.predict_proba(docs_new)
#print(pred_proba)

# Decision scores: one row per document, one column per class.
dec_func = text_clf.decision_function(docs_new)
print(dec_func)

# Map each predicted class index back to its newsgroup name.
for text, label_idx in zip(docs_new, predicted_test_sample):
    label_name = twenty_train.target_names[label_idx]
    print('%r => %s' % (text, label_name))

[[-0.88140486 -0.89177504 -1.51809332 -1.47135727  0.82146239]
 [-1.01132578 -0.20454709 -0.59472066 -0.81459217 -0.90935544]
 [-0.94759758 -0.54785717 -0.85579357 -0.34504279 -0.90303644]
 [-1.26072959 -0.83301715  0.01954482 -1.07058062 -0.41411371]
 [-1.01750239 -0.89477522  0.5491458  -1.06451576 -1.01217158]]
'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics
'Drugs are bad' => sci.med
'I love Honda!' => rec.motorcycles
'bikes are fun' => rec.motorcycles


In [20]:
from sklearn import metrics

# Recompute the test predictions from the current `text_clf` instead of
# reading the global `predicted`: that name is rebound by several earlier
# cells, so it may hold predictions from a different model or for a different
# set of documents than `twenty_test`.
test_predictions = text_clf.predict(twenty_test.data)

# Per-class precision/recall/F1, followed by the confusion matrix
# (first argument is the true labels, second the predicted labels).
print(metrics.classification_report(twenty_test.target, test_predictions,
    target_names=twenty_test.target_names))
print(metrics.confusion_matrix(twenty_test.target, test_predictions))

                        precision    recall  f1-score   support

           alt.atheism       0.94      0.81      0.87       319
         comp.graphics       0.89      0.96      0.93       389
       rec.motorcycles       0.96      0.99      0.98       398
               sci.med       0.95      0.87      0.91       396
soc.religion.christian       0.88      0.96      0.92       398

           avg / total       0.92      0.92      0.92      1900

[[257   4   3  13  42]
 [  4 375   3   2   5]
 [  0   2 396   0   0]
 [  7  29  11 343   6]
 [  5  10   0   2 381]]


In [27]:
# Number of training documents.
len(twenty_train.data)

2855

In [28]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid. Keys follow the "<pipeline step>__<parameter>" naming,
# matching the step names 'vect', 'tfidf' and 'clf' of `text_clf`.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}

# `iid` is intentionally not passed: the argument was deprecated and later
# removed from GridSearchCV, so supplying it raises TypeError on current
# scikit-learn releases.
# n_jobs=-1 runs the 5-fold search on all available cores.
gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)
gs_clf = gs_clf.fit(twenty_train.data, twenty_train.target)

In [29]:
# Predict with the fitted grid search and translate the class index of the
# single input document into its newsgroup name.
twenty_train.target_names[gs_clf.predict(['God is love'])[0]]

'soc.religion.christian'

In [30]:
# A bare expression is only displayed when it is the last statement of a
# cell, so the best cross-validation score is printed explicitly.
print("best score: %.4f" % gs_clf.best_score_)

# Winning value for every parameter in the grid, in alphabetical order.
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 1)


In [31]:
# Full cross-validation results: a dict of arrays with fit/score timings and
# the parameter values tried, one entry per parameter combination.
gs_clf.cv_results_



{'mean_fit_time': array([ 2.75335283, 12.39395332,  3.30509176, 10.81562386,  3.02090631,
        11.73273711,  3.17826447,  8.5187408 ]),
 'std_fit_time': array([0.19255828, 0.6277583 , 0.23088554, 0.83365411, 0.41500434,
        0.26920995, 0.20359365, 0.40350859]),
 'mean_score_time': array([0.37584505, 0.67454209, 0.31729231, 0.68538327, 0.39108438,
        0.6793076 , 0.41956859, 0.4650413 ]),
 'std_score_time': array([0.07172044, 0.09962154, 0.04996068, 0.10151297, 0.06860908,
        0.10457956, 0.09644032, 0.07042995]),
 'param_clf__alpha': masked_array(data=[0.01, 0.01, 0.01, 0.01, 0.001, 0.001, 0.001, 0.001],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_tfidf__use_idf': masked_array(data=[True, True, False, False, True, True, False, False],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_vect__ngra