In [1]:
from __future__ import print_function

import numpy as np

# Print the installed scikit-learn version.
from sklearn import __version__ as sklearn_version

print('Sklearn version: {}'.format(sklearn_version))

Sklearn version: 0.19.1


# The data

The 20 newsgroups dataset comprises around 18000 newsgroups posts on 20 topics, split into two subsets: one for training (or development) and the other for testing (or performance evaluation). The split between the train and test sets is based on whether a message was posted before or after a specific date.


In [2]:
from sklearn.datasets import fetch_20newsgroups

# Restrict the corpus to four of the twenty newsgroups.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Training split, requested with headers, footers and quotes removed;
# shuffling is seeded via random_state.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=42,
)



## Build a pipeline

In [3]:
# Baseline classifier: token counts -> tf-idf weighting -> multinomial naive Bayes.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

text_clf = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english', max_features=5000, min_df=2, max_df=0.95)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Train every stage in a single call. Kept as the last expression of the cell,
# so the fitted pipeline's repr is shown as the cell output.
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.95, max_features=5000, min_df=2,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
       ...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [4]:
# Test split, loaded with the same categories and removal options as the training split.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=42,
)

# Label the test documents with the fitted pipeline.
predicted = text_clf.predict(twenty_test.data)

# Accuracy: fraction of predictions that match the true labels.
print('Test accuracy:', (predicted == twenty_test.target).mean())

Test accuracy: 0.7989347536617842


## Change classifier in the pipeline
    - LinearSVC
    - k-NN
    - Random forest

In [5]:
from sklearn.svm import LinearSVC

# Same vectorizer and tf-idf stages as the naive Bayes pipeline; only the final
# classifier changes. random_state seeds LinearSVC's internal random number
# generator so that repeated runs report the same accuracy.
text_clf_svm = Pipeline([('vect', CountVectorizer(max_df=0.95, min_df=2, max_features=5000, stop_words='english')),
                         ('tfidf', TfidfTransformer()),
                         ('clf', LinearSVC(random_state=0)),
                        ])

# Fit
_ = text_clf_svm.fit(twenty_train.data, twenty_train.target)

# Predict
predicted = text_clf_svm.predict(twenty_test.data)

# Evaluate accuracy: fraction of test documents whose predicted label is correct
print('Test accuracy:', np.mean(predicted == twenty_test.target))

Test accuracy: 0.8089214380825566


In [6]:
from sklearn.neighbors import KNeighborsClassifier

# k-NN variant of the tf-idf pipeline (20 neighbours). It is bound to its own
# name so the fitted naive Bayes pipeline held in `text_clf` is not overwritten;
# re-running an earlier evaluation cell then still evaluates the model it names.
text_clf_knn = Pipeline([('vect', CountVectorizer(max_df=0.95, min_df=2, max_features=5000, stop_words='english')),
                         ('tfidf', TfidfTransformer()),
                         ('clf', KNeighborsClassifier(n_neighbors=20)),
                        ])

# Fit, predict on the test split, and report accuracy
_ = text_clf_knn.fit(twenty_train.data, twenty_train.target)
predicted = text_clf_knn.predict(twenty_test.data)
print('Test accuracy:', np.mean(predicted == twenty_test.target))

Test accuracy: 0.2529960053262317


In [7]:
from sklearn.ensemble import RandomForestClassifier

# Random-forest variant of the tf-idf pipeline (100 trees). It is bound to its
# own name so previously fitted pipelines are not overwritten. random_state seeds
# the forest's randomness so the reported accuracy is reproducible on re-run.
text_clf_rf = Pipeline([('vect', CountVectorizer(max_df=0.95, min_df=2, max_features=5000, stop_words='english')),
                        ('tfidf', TfidfTransformer()),
                        ('clf', RandomForestClassifier(n_estimators=100, random_state=0)),
                       ])

# Fit, predict on the test split, and report accuracy
_ = text_clf_rf.fit(twenty_train.data, twenty_train.target)
predicted = text_clf_rf.predict(twenty_test.data)
print('Test accuracy:', np.mean(predicted == twenty_test.target))

Test accuracy: 0.7396804260985352


## Use features from a factorization instead of those provided by tf-idf

In [9]:
# Tokenize the texts into word counts, filtering English stop words and
# capping the vocabulary at 5000 features.
from sklearn.feature_extraction.text import CountVectorizer

tf_vectorizer = CountVectorizer(
    stop_words='english',
    max_features=5000,
    min_df=2,
    max_df=0.95,
)

# Learn the vocabulary on the training texts and build their count matrix.
X_train_counts = tf_vectorizer.fit_transform(twenty_train.data)

# Displayed as the cell output.
X_train_counts.shape

(2257, 5000)

In [10]:
%%time

from sklearn.decomposition import  LatentDirichletAllocation

n_components = 6

lda = LatentDirichletAllocation(n_components=n_components,
                                max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
lda.fit(X_train_counts)


CPU times: user 4.13 s, sys: 15.5 ms, total: 4.14 s
Wall time: 4.15 s


In [11]:
# Project the training counts through the fitted LDA model and show the result's shape.
X_train_topics = lda.transform(X_train_counts)
X_train_topics.shape

(2257, 6)

In [12]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    Parameters
    ----------
    model : object
        Anything exposing ``components_``; each row is read as one topic's
        per-feature weights.
    feature_names : sequence of str
        Word for each feature index.
    n_top_words : int
        How many words to list per topic.

    For each topic a ``Topic #<idx>:`` line is printed, followed by a line with
    its top words in decreasing weight order. One blank line is printed after
    the last topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending: reverse it and keep the leading n_top_words indices.
        ranked = weights.argsort()[::-1][:n_top_words]
        top_words = [feature_names[idx] for idx in ranked]
        print("Topic #%d:" % topic_idx)
        print(" ".join(top_words))
    print()

# Number of words to list per topic
n_top_words = 20

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of `get_feature_names_out`. Prefer the new accessor when the installed
# release has it and fall back to the old one otherwise, so the cell runs on both.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    tf_feature_names = tf_vectorizer.get_feature_names_out()
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Topic #0:
church pope catholic marriage authority married orthodox canon schism mass liturgy bishop ceremony st churches catholics does priest jurisdiction coptic
Topic #1:
image file jpeg use program files images gif color know format does thanks graphics software using version bit available like
Topic #2:
edu com graphics mail send pub keyboard ftp data computer information cs systems software ca faq available gov contact pc
Topic #3:
god people think don jesus just does believe know say like time bible way things good true life christian question
Topic #4:
health use medical years people disease food msg new patients like don doctor research time 1993 10 day know just
Topic #5:
banks gordon skepticism edu soon pitt geb intellect chastity n3jxp dsl shameful cadre surrender father spirit son holy int col



## Pipeline with factorization

In [13]:
%%time

from sklearn.neighbors import KNeighborsClassifier

text_lda_knn = Pipeline([('vect', CountVectorizer(max_df=0.95, min_df=2, max_features=10000, stop_words='english')),
                         ('lda', LatentDirichletAllocation(n_components=150, max_iter=15,
                                 learning_method='online',
                                 learning_offset=200.,
                                 random_state=0)),
                         ('clf', KNeighborsClassifier(n_neighbors=10))
                        ])

                         
_ = text_lda_knn.fit(twenty_train.data, twenty_train.target)
predicted = text_lda_knn.predict(twenty_test.data)
print('Test accuracy:', np.mean(predicted == twenty_test.target))

Test accuracy: 0.7003994673768309
CPU times: user 46.7 s, sys: 3.39 s, total: 50.1 s
Wall time: 50 s


In [14]:
%%time

from sklearn.ensemble import RandomForestClassifier

text_lda_rf = Pipeline([('vect', CountVectorizer(max_df=0.95, min_df=2, max_features=10000, stop_words='english')),
                         ('lda', LatentDirichletAllocation(n_components=150, max_iter=15,
                                 learning_method='online',
                                 learning_offset=200.,
                                 random_state=0)),
                         ('clf', RandomForestClassifier(n_estimators=100)),
                        ])

                         
_ = text_lda_rf.fit(twenty_train.data, twenty_train.target)

predicted = text_lda_rf.predict(twenty_test.data)
print('Test accuracy:', np.mean(predicted == twenty_test.target))

Test accuracy: 0.6857523302263648
CPU times: user 47.1 s, sys: 3.27 s, total: 50.4 s
Wall time: 50.2 s


## Optimize a pipeline

In [15]:
%%time

from sklearn.model_selection import RandomizedSearchCV

# Define estimator. No parameters of the search
clf = Pipeline([('vect', CountVectorizer(max_df=0.95, min_df=2)),
                ('tfidf', TfidfTransformer()),
                ('clf', LinearSVC()),
                ])

# Specify parameters and distributions to sample from
# Parameters of pipelines can be set using ‘__’ separated parameter names:
param_dist = {"vect__max_features": [1000, 2500, 5000, 7500, 10000, None], 
              "vect__stop_words": ['english', None], 
              "clf__C": [.1, .5, 1., 1.5, 2.]}

# Define randomized search
n_iter_search = 10
random_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=n_iter_search, return_train_score=True)

# Run the randomized search
random_search.fit(twenty_train.data, twenty_train.target)

print("Done!")

Done!
CPU times: user 19.9 s, sys: 210 ms, total: 20.1 s
Wall time: 20.2 s


In [16]:
# Turn the search's cv_results_ dictionary into a DataFrame
# (one column per dictionary key) and display it.
import pandas as pd

df_cv_results = pd.DataFrame(random_search.cv_results_)
df_cv_results

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_clf__C,param_vect__max_features,param_vect__stop_words,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,0.324663,0.123406,0.787328,0.977182,1.5,1000.0,,"{'vect__stop_words': None, 'vect__max_features...",9,0.802125,0.975399,0.791501,0.978723,0.768309,0.977424,0.002285,0.004355,0.014114,0.001368
1,0.30446,0.125341,0.781568,0.899203,0.1,1000.0,,"{'vect__stop_words': None, 'vect__max_features...",10,0.804781,0.895612,0.772908,0.902926,0.766977,0.89907,0.018027,0.001396,0.016602,0.002987
2,0.344086,0.134984,0.86132,0.982499,1.5,10000.0,,"{'vect__stop_words': None, 'vect__max_features...",3,0.869854,0.982048,0.849934,0.984043,0.864181,0.981408,0.014959,0.013755,0.008383,0.001122
3,0.318959,0.120355,0.864865,0.983607,2.0,10000.0,english,"{'vect__stop_words': 'english', 'vect__max_fea...",2,0.873838,0.984707,0.85259,0.984043,0.868176,0.982072,0.009562,0.006733,0.008988,0.001119
4,0.29086,0.110125,0.797519,0.961011,0.5,1000.0,english,"{'vect__stop_words': 'english', 'vect__max_fea...",7,0.803453,0.957447,0.798141,0.966755,0.790945,0.958831,0.002304,0.011141,0.005124,0.004101
5,0.285974,0.108181,0.821887,0.982278,1.5,2500.0,english,"{'vect__stop_words': 'english', 'vect__max_fea...",5,0.827357,0.982713,0.815405,0.982713,0.822903,0.981408,0.003816,0.008713,0.004934,0.000615
6,0.305842,0.112626,0.841382,0.983164,2.0,5000.0,english,"{'vect__stop_words': 'english', 'vect__max_fea...",4,0.849934,0.983378,0.832669,0.984043,0.841545,0.982072,0.01731,0.004515,0.007052,0.000819
7,0.291877,0.110438,0.789101,0.970093,1.0,1000.0,english,"{'vect__stop_words': 'english', 'vect__max_fea...",8,0.800797,0.96875,0.791501,0.972074,0.774967,0.969456,0.00724,0.006559,0.010679,0.00143
8,0.307934,0.113261,0.866194,0.983386,1.5,,english,"{'vect__stop_words': 'english', 'vect__max_fea...",1,0.876494,0.984043,0.853918,0.984043,0.868176,0.982072,0.005067,0.00961,0.009326,0.000929
9,0.289196,0.109234,0.818343,0.982499,2.0,2500.0,english,"{'vect__stop_words': 'english', 'vect__max_fea...",6,0.819389,0.983378,0.816733,0.982713,0.818908,0.981408,0.011861,0.006067,0.001156,0.000818


In [17]:
# Parameter combination selected by the randomized search.
print('Best params: {}'.format(random_search.best_params_))

Best params: {'vect__stop_words': 'english', 'vect__max_features': None, 'clf__C': 1.5}


In [18]:
# Evaluate the search's best estimator on the held-out test split.
best_pipeline = random_search.best_estimator_
predicted = best_pipeline.predict(twenty_test.data)

# Accuracy: fraction of predictions that match the true labels.
print('Test accuracy:', (predicted == twenty_test.target).mean())

Test accuracy: 0.8209054593874834


## Additional metrics for multiclass classification

In [19]:
from sklearn import metrics

# Text report comparing the true test labels with the latest predictions;
# rows are labelled with the newsgroup names.
report = metrics.classification_report(
    twenty_test.target,
    predicted,
    target_names=twenty_test.target_names,
)
print(report)

                        precision    recall  f1-score   support

           alt.atheism       0.78      0.62      0.69       319
         comp.graphics       0.83      0.92      0.87       389
               sci.med       0.87      0.86      0.87       396
soc.religion.christian       0.79      0.85      0.82       398

           avg / total       0.82      0.82      0.82      1502



In [20]:
# Confusion matrix of the true test labels against the latest predictions.
metrics.confusion_matrix(y_true=twenty_test.target, y_pred=predicted)

array([[198,  21,  24,  76],
       [ 14, 357,  16,   2],
       [ 13,  29, 340,  14],
       [ 29,  22,   9, 338]])