In [1]:
from __future__ import print_function

import numpy as np

from sklearn import __version__ as sklearn_version
print('Sklearn version:', sklearn_version)

Sklearn version: 0.19.0


# The data

The 20 newsgroups dataset comprises around 18000 newsgroups posts on 20 topics split in two subsets: one for training (or development) and the other one for testing (or for performance evaluation). The split between the train and test set is based upon a messages posted before and after a specific date.


In [2]:
from sklearn.datasets import fetch_20newsgroups

categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']

twenty_train = fetch_20newsgroups(subset='train',
                 remove=('headers', 'footers', 'quotes'),
                 categories=categories, shuffle=True, random_state=42)



## Build a pipeline

In [3]:
#Define the pipeline

from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB


text_clf = Pipeline([('vect', CountVectorizer(max_df=0.95, min_df=2, max_features=5000, stop_words='english')),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
                    ])

# Fit all the pipeline
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.95, max_features=5000, min_df=2,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
       ...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [4]:
#Evaluate test data
twenty_test = fetch_20newsgroups(subset='test',
                    remove=('headers', 'footers', 'quotes'),
                    categories=categories, 
                    shuffle=True, random_state=42)

predicted = text_clf.predict(twenty_test.data)

print('Test accuracy:', np.mean(predicted == twenty_test.target))

Test accuracy: 0.798934753662


## Change classifier in the pipeline
    - LinearSVC
    - k-NN
    - Random forest

In [5]:
from sklearn.svm import LinearSVC
text_clf_svm = Pipeline([('vect', CountVectorizer(max_df=0.95, min_df=2, max_features=5000, stop_words='english')),
                     ('tfidf', TfidfTransformer()),
                     ('clf', LinearSVC()),
                    ])

#Fit
_ = text_clf_svm.fit(twenty_train.data, twenty_train.target)

# Predict
predicted = text_clf_svm.predict(twenty_test.data)

# Evaluate accuracy
print('Test accuracy:', np.mean(predicted == twenty_test.target))        

Test accuracy: 0.808921438083


In [6]:
from sklearn.neighbors import KNeighborsClassifier

text_clf = Pipeline([('vect', CountVectorizer(max_df=0.95, min_df=2, max_features=5000, stop_words='english')),
                     ('tfidf', TfidfTransformer()),
                     ('clf', KNeighborsClassifier(n_neighbors=10)),
                    ])

_ = text_clf.fit(twenty_train.data, twenty_train.target)
predicted = text_clf.predict(twenty_test.data)
print('Test accuracy:', np.mean(predicted == twenty_test.target))

Test accuracy: 0.254327563249


In [7]:
from sklearn.ensemble import RandomForestClassifier

text_clf = Pipeline([('vect', CountVectorizer(max_df=0.95, min_df=2, max_features=5000, stop_words='english')),
                     ('tfidf', TfidfTransformer()),
                     ('clf', RandomForestClassifier(n_estimators=100)),
                    ])

_ = text_clf.fit(twenty_train.data, twenty_train.target)
predicted = text_clf.predict(twenty_test.data)
print('Test accuracy:', np.mean(predicted == twenty_test.target))

Test accuracy: 0.747003994674


## Use features from a factorization instead the provided by the tf-idf

In [8]:
# Text preprocessing, tokenizing and filtering of stopwords
from sklearn.feature_extraction.text import CountVectorizer

tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=5000,
                                stop_words='english')
X_train_counts = tf_vectorizer.fit_transform(twenty_train.data)
X_train_counts.shape

(2257, 5000)

In [9]:
from sklearn.decomposition import  LatentDirichletAllocation

n_components = 6
n_top_words = 20

lda = LatentDirichletAllocation(n_components=n_components,
                                max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
lda.fit(X_train_counts)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_components=6, n_jobs=1, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [10]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Topic #0:
church pope catholic marriage authority married orthodox canon schism mass liturgy bishop ceremony st churches catholics does priest jurisdiction coptic
Topic #1:
image file jpeg use program files images gif color know format does thanks graphics software using version bit available like
Topic #2:
edu com graphics mail send pub keyboard ftp data computer information cs systems software ca faq available gov contact pc
Topic #3:
god people think don jesus just does believe know say like time bible way things good true life christian question
Topic #4:
health use medical years people disease food msg new patients like don doctor research time 1993 10 day know just
Topic #5:
banks gordon skepticism edu soon pitt geb intellect chastity n3jxp dsl shameful cadre surrender father spirit son holy int col



## Pipeline with factorization

In [11]:
from sklearn.neighbors import KNeighborsClassifier

text_lda_knn = Pipeline([('vect', CountVectorizer(max_df=0.95, min_df=2, max_features=10000, stop_words='english')),
                         ('lda', LatentDirichletAllocation(n_components=150, max_iter=15,
                                 learning_method='online',
                                 learning_offset=200.,
                                 random_state=0)),
                         ('clf', KNeighborsClassifier(n_neighbors=10))
                        ])

                         
_ = text_lda_knn.fit(twenty_train.data, twenty_train.target)
predicted = text_lda_knn.predict(twenty_test.data)
print('Test accuracy:', np.mean(predicted == twenty_test.target))

Test accuracy: 0.700399467377


In [12]:
from sklearn.ensemble import RandomForestClassifier

text_lda_rf = Pipeline([('vect', CountVectorizer(max_df=0.95, min_df=2, max_features=10000, stop_words='english')),
                         ('lda', LatentDirichletAllocation(n_components=150, max_iter=15,
                                 learning_method='online',
                                 learning_offset=200.,
                                 random_state=0)),
                         ('clf', RandomForestClassifier(n_estimators=100)),
                        ])

                         
_ = text_lda_rf.fit(twenty_train.data, twenty_train.target)

predicted = text_lda_rf.predict(twenty_test.data)
print('Test accuracy:', np.mean(predicted == twenty_test.target))

Test accuracy: 0.679094540613


## Optimize a pipeline

In [14]:
from sklearn.model_selection import RandomizedSearchCV

# Define estimator. No parameters of the search
clf = Pipeline([('vect', CountVectorizer(max_df=0.95, min_df=2)),
                ('tfidf', TfidfTransformer()),
                ('clf', LinearSVC()),
                ])

# Specify parameters and distributions to sample from
# Parameters of pipelines can be set using ‘__’ separated parameter names:
param_dist = {"vect__max_features": [1000, 2500, 5000, 7500, 10000, None], 
              "vect__stop_words": ['english', None], 
              "clf__C": [.1, .5, 1., 1.5, 2.]}

# Define randomized search
n_iter_search = 10
random_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=n_iter_search)

# Run the randomized search
random_search.fit(twenty_train.data, twenty_train.target)

print("Done!")

Done!


In [15]:
# Load dictionary of search results to a Pandas dataframe
import pandas as pd

df_cv_results = pd.DataFrame.from_dict(random_search.cv_results_)
df_cv_results

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_clf__C,param_vect__max_features,param_vect__stop_words,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,0.377081,0.139033,0.841382,0.982499,2.0,5000.0,,"{'clf__C': 2.0, 'vect__stop_words': None, 'vec...",5,0.847278,0.982048,0.833997,0.984043,0.842876,0.981408,0.001247,0.007337,0.005526,0.001122
1,0.326363,0.122656,0.866637,0.982943,1.0,7500.0,english,"{'clf__C': 1.0, 'vect__stop_words': 'english',...",1,0.87915,0.983378,0.855246,0.984043,0.865513,0.981408,0.002253,0.010846,0.009795,0.001119
2,0.349105,0.137128,0.860877,0.98117,0.5,10000.0,,"{'clf__C': 0.5, 'vect__stop_words': None, 'vec...",4,0.868526,0.980053,0.856574,0.983378,0.857523,0.98008,0.003066,0.005367,0.005426,0.001561
3,0.340423,0.125945,0.865751,0.983164,1.5,7500.0,english,"{'clf__C': 1.5, 'vect__stop_words': 'english',...",2,0.880478,0.983378,0.85259,0.984043,0.864181,0.982072,0.004421,0.007527,0.011444,0.000819
4,0.307831,0.119197,0.836952,0.950599,0.1,2500.0,english,"{'clf__C': 0.1, 'vect__stop_words': 'english',...",6,0.843293,0.950133,0.837981,0.952128,0.829561,0.949535,0.005001,0.007567,0.005652,0.001108
5,0.319679,0.119417,0.77891,0.976739,2.0,1000.0,english,"{'clf__C': 2.0, 'vect__stop_words': 'english',...",10,0.792829,0.974734,0.780876,0.978059,0.762983,0.977424,0.004224,0.006087,0.012261,0.001441
6,0.311431,0.122113,0.834293,0.978734,0.5,2500.0,english,"{'clf__C': 0.5, 'vect__stop_words': 'english',...",7,0.841965,0.978723,0.828685,0.980718,0.832224,0.97676,0.015231,0.003345,0.005617,0.001616
7,0.33862,0.136487,0.816128,0.932875,0.1,2500.0,,"{'clf__C': 0.1, 'vect__stop_words': None, 'vec...",9,0.833997,0.932846,0.800797,0.931516,0.813582,0.934263,0.004526,0.006795,0.013679,0.001122
8,0.338779,0.140351,0.828977,0.951043,0.1,,,"{'clf__C': 0.1, 'vect__stop_words': None, 'vec...",8,0.843293,0.954122,0.816733,0.951463,0.826897,0.947543,0.008832,0.010082,0.010947,0.002702
9,0.365968,0.139331,0.864422,0.982499,1.0,,,"{'clf__C': 1.0, 'vect__stop_words': None, 'vec...",3,0.875166,0.982048,0.853918,0.984043,0.864181,0.981408,0.00846,0.009392,0.00868,0.001122


In [17]:
print('Best params:', random_search.best_params_)

Best params: {'clf__C': 1.0, 'vect__stop_words': 'english', 'vect__max_features': 7500}


In [18]:
# Score & evaluate test data using the best estimator

predicted = random_search.best_estimator_.predict(twenty_test.data)

print('Test accuracy:', np.mean(predicted == twenty_test.target))        

Test accuracy: 0.820239680426


## Aditional metrics for multiclass classification

In [19]:
from sklearn import metrics

print(metrics.classification_report(twenty_test.target, 
                                    predicted,
                                    target_names=twenty_test.target_names))

                        precision    recall  f1-score   support

           alt.atheism       0.77      0.63      0.69       319
         comp.graphics       0.82      0.92      0.87       389
               sci.med       0.88      0.85      0.87       396
soc.religion.christian       0.80      0.84      0.82       398

           avg / total       0.82      0.82      0.82      1502



In [20]:
metrics.confusion_matrix(twenty_test.target, predicted)

array([[200,  22,  24,  73],
       [ 15, 359,  15,   0],
       [ 14,  32, 337,  13],
       [ 32,  23,   7, 336]])