In [1]:
from __future__ import print_function

import numpy as np

# Print the installed scikit-learn version so the outputs recorded below can be tied to a release
from sklearn import __version__ as sklearn_version
print('Sklearn version:', sklearn_version)

Sklearn version: 0.20.3


# The data

The 20 newsgroups dataset comprises around 18000 newsgroup posts on 20 topics, split into two subsets: one for training (or development) and the other for testing (or performance evaluation). The split between the train and test sets is based on whether a message was posted before or after a specific date.


In [0]:
from sklearn.datasets import fetch_20newsgroups

# Only these four of the twenty newsgroups are used throughout the notebook
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Training subset, with headers, footers and quoted text removed from every post.
# A fixed random_state keeps the shuffled order the same on every run.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=42,
)



## Build a pipeline

In [3]:
# Baseline pipeline: word counts -> tf-idf weighting -> multinomial Naive Bayes

from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB


text_clf = Pipeline(steps=[
    # Drop English stop words, terms in more than 95% of the posts and terms seen in
    # fewer than 2 posts; keep at most 5000 terms.
    ('vect', CountVectorizer(stop_words='english', max_features=5000, min_df=2, max_df=0.95)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Fit every step of the pipeline on the raw training posts.
# Left as a bare expression so the fitted pipeline is displayed as the cell output.
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.95, max_features=5000, min_df=2,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
       ...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [4]:
# Load the test subset with the same categories and clean-up as the training subset
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=42,
)

# The pipeline takes raw text: vectorization and tf-idf weighting happen inside predict()
predicted = text_clf.predict(twenty_test.data)

# Accuracy = fraction of test posts whose predicted label equals the true label
print('Test accuracy:', (predicted == twenty_test.target).mean())

Test accuracy: 0.7989347536617842


## Change classifier in the pipeline
    - LinearSVC
    - k-NN
    - Random forest

In [5]:
from sklearn.svm import LinearSVC

# Same vectorizer and tf-idf front end as the Naive Bayes pipeline; only the final
# estimator is swapped for a linear support vector classifier.
text_clf_svm = Pipeline([('vect', CountVectorizer(max_df=0.95, min_df=2, max_features=5000, stop_words='english')),
                     ('tfidf', TfidfTransformer()),
                     # LinearSVC's solver uses a random number generator; seed it so the
                     # reported accuracy is repeatable from run to run.
                     ('clf', LinearSVC(random_state=42)),
                    ])

# Fit (assigned to `_` so the pipeline repr is not echoed)
_ = text_clf_svm.fit(twenty_train.data, twenty_train.target)

# Predict
predicted = text_clf_svm.predict(twenty_test.data)

# Evaluate accuracy: fraction of test posts labelled correctly
print('Test accuracy:', np.mean(predicted == twenty_test.target))

Test accuracy: 0.8089214380825566


In [6]:
from sklearn.neighbors import KNeighborsClassifier

# Same text front end as before, now followed by a 20-nearest-neighbours classifier.
# NOTE(review): the accuracy recorded for this cell (~0.25) is chance level for four
# classes. Possibly posts left empty by the header/footer/quote stripping become
# all-zero vectors that end up among every query's nearest neighbours. TODO confirm,
# and consider metric='cosine'.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english', max_features=5000, min_df=2, max_df=0.95)),
    ('tfidf', TfidfTransformer()),
    ('clf', KNeighborsClassifier(n_neighbors=20)),
])

# Fit on the training posts, then score the held-out test posts
_ = text_clf.fit(twenty_train.data, twenty_train.target)
predicted = text_clf.predict(twenty_test.data)
print('Test accuracy:', (predicted == twenty_test.target).mean())

Test accuracy: 0.25233022636484687


In [7]:
from sklearn.ensemble import RandomForestClassifier

# Same text front end as before, now followed by a 100-tree random forest
text_clf = Pipeline([('vect', CountVectorizer(max_df=0.95, min_df=2, max_features=5000, stop_words='english')),
                     ('tfidf', TfidfTransformer()),
                     # Random forests are stochastic; seed the forest so the accuracy
                     # is the same on every Restart & Run All.
                     ('clf', RandomForestClassifier(n_estimators=100, random_state=42)),
                    ])

# Fit on the training posts, then score the held-out test posts
_ = text_clf.fit(twenty_train.data, twenty_train.target)
predicted = text_clf.predict(twenty_test.data)
print('Test accuracy:', np.mean(predicted == twenty_test.target))

Test accuracy: 0.7476697736351531


## Use features from a factorization instead of those provided by tf-idf

In [8]:
# Text preprocessing, tokenizing and filtering of stopwords
from sklearn.feature_extraction.text import CountVectorizer

tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=5000,
                                stop_words='english')
X_train_counts = tf_vectorizer.fit_transform(twenty_train.data)
X_train_counts.shape

(2257, 5000)

In [9]:
%%time

from sklearn.decomposition import LatentDirichletAllocation

# Number of topics to extract
n_components = 6

# LDA fitted on the raw term counts with the online learning method; random_state is
# fixed so the topics come out the same on every run.
lda = LatentDirichletAllocation(
    n_components=n_components,
    learning_method='online',
    learning_offset=50.0,
    max_iter=5,
    random_state=0,
)
lda.fit(X_train_counts)


CPU times: user 3.37 s, sys: 794 µs, total: 3.37 s
Wall time: 3.39 s


In [10]:
# Topic representation of the training counts; the displayed shape is (posts, topics)
lda.transform(X_train_counts).shape

(2257, 6)

In [11]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        Iterating over ``components_`` must yield one weight vector per topic,
        each supporting ``argsort()``.
    feature_names : sequence of str
        Word for each column index of the weight vectors.
    n_top_words : int
        How many words to print per topic, in order of decreasing weight.

    For each topic a ``Topic #<index>:`` header line is printed, followed by one line
    with the selected words separated by spaces. A single blank line is printed after
    the last topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading n_top_words indices
        ranked = weights.argsort()[::-1][:n_top_words]
        top_words = (feature_names[idx] for idx in ranked)
        print("Topic #%d:" % topic_idx)
        print(" ".join(top_words))
    print()

# Number of words to list per topic
n_top_words = 20

# Vocabulary terms in column order. Newer scikit-learn releases replaced
# get_feature_names() with get_feature_names_out(), so use the new accessor when the
# installed version provides it and fall back to the old one otherwise.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    tf_feature_names = tf_vectorizer.get_feature_names_out()
else:
    tf_feature_names = tf_vectorizer.get_feature_names()

print_top_words(lda, tf_feature_names, n_top_words)

Topic #0:
church pope catholic marriage authority married orthodox canon schism mass liturgy bishop ceremony st churches catholics does priest jurisdiction coptic
Topic #1:
image file jpeg use program files images gif color know format does thanks graphics software using version bit available like
Topic #2:
edu com graphics mail send pub keyboard ftp data computer information cs systems software ca faq available gov contact pc
Topic #3:
god people think don jesus just does believe know say like time bible way things good true life christian question
Topic #4:
health use medical years people disease food msg new patients like don doctor research time 1993 10 day know just
Topic #5:
banks gordon skepticism edu soon pitt geb intellect chastity n3jxp dsl shameful cadre surrender father spirit son holy int col



## Pipeline with factorization

In [12]:
%%time

from sklearn.neighbors import KNeighborsClassifier

# Word counts -> 150 LDA topic features -> 10-nearest-neighbours classifier.
# The classifier sees the LDA output instead of tf-idf weights.
text_lda_knn = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english', max_features=10000, min_df=2, max_df=0.95)),
    ('lda', LatentDirichletAllocation(
        n_components=150,
        learning_method='online',
        learning_offset=200.0,
        max_iter=15,
        random_state=0,
    )),
    ('clf', KNeighborsClassifier(n_neighbors=10)),
])

# Fit on the training posts, then score the held-out test posts
_ = text_lda_knn.fit(twenty_train.data, twenty_train.target)
predicted = text_lda_knn.predict(twenty_test.data)
print('Test accuracy:', (predicted == twenty_test.target).mean())

Test accuracy: 0.7003994673768309
CPU times: user 1min 30s, sys: 48.4 s, total: 2min 18s
Wall time: 1min 13s


In [13]:
%%time

from sklearn.ensemble import RandomForestClassifier

# Word counts -> 150 LDA topic features -> 100-tree random forest
text_lda_rf = Pipeline([('vect', CountVectorizer(max_df=0.95, min_df=2, max_features=10000, stop_words='english')),
                         ('lda', LatentDirichletAllocation(n_components=150, max_iter=15,
                                 learning_method='online',
                                 learning_offset=200.,
                                 random_state=0)),
                         # The LDA step is already seeded; seed the forest as well so the
                         # whole pipeline gives the same accuracy on every run.
                         ('clf', RandomForestClassifier(n_estimators=100, random_state=42)),
                        ])

# Fit on the training posts
_ = text_lda_rf.fit(twenty_train.data, twenty_train.target)

# Score the held-out test posts
predicted = text_lda_rf.predict(twenty_test.data)
print('Test accuracy:', np.mean(predicted == twenty_test.target))

Test accuracy: 0.6864181091877497
CPU times: user 1min 32s, sys: 48.5 s, total: 2min 20s
Wall time: 1min 15s


## Optimize a pipeline

In [14]:
%%time

from sklearn.model_selection import RandomizedSearchCV

# Base estimator. The hyper-parameters being searched are left out here and are set
# by the search itself. LinearSVC is seeded so repeated fits give the same model.
clf = Pipeline([('vect', CountVectorizer(max_df=0.95, min_df=2)),
                ('tfidf', TfidfTransformer()),
                ('clf', LinearSVC(random_state=42)),
                ])

# Candidate values to sample from.
# Parameters of a pipeline step are addressed as '<step name>__<parameter name>'.
param_dist = {"vect__max_features": [1000, 2500, 5000, 7500, 10000, None],
              "vect__stop_words": ['english', None],
              "clf__C": [.1, .5, 1., 1.5, 2.]}

# Define randomized search: 10 sampled combinations out of the 60 possible ones
n_iter_search = 10
random_search = RandomizedSearchCV(
    clf,
    param_distributions=param_dist,
    n_iter=n_iter_search,
    # The recorded results use three folds (split0..split2). State that explicitly
    # instead of relying on the library default, which differs between releases.
    cv=3,
    # Seed the sampling so the same candidates are drawn on every run
    random_state=42,
    return_train_score=True,
)

# Run the randomized search
random_search.fit(twenty_train.data, twenty_train.target)

print("Done!")



Done!
CPU times: user 19.1 s, sys: 53.1 ms, total: 19.1 s
Wall time: 19.1 s


In [15]:
# Load dictionary of search results to a Pandas dataframe
import pandas as pd

df_cv_results = pd.DataFrame.from_dict(random_search.cv_results_)
df_cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_vect__stop_words,param_vect__max_features,param_clf__C,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.337459,0.003923,0.110284,0.006579,,2500.0,1.0,"{'vect__stop_words': None, 'vect__max_features...",0.828685,0.823373,0.824234,0.825432,0.002329,7,0.981383,0.983378,0.980744,0.981835,0.001122
1,0.273749,0.005288,0.096364,0.006197,english,2500.0,0.5,"{'vect__stop_words': 'english', 'vect__max_fea...",0.841965,0.828685,0.832224,0.834293,0.005617,5,0.978723,0.980718,0.97676,0.978734,0.001616
2,0.319346,0.006841,0.103301,0.005527,english,,1.5,"{'vect__stop_words': 'english', 'vect__max_fea...",0.876494,0.853918,0.868176,0.866194,0.009326,2,0.984043,0.984043,0.982072,0.983386,0.000929
3,0.305115,0.005771,0.10971,0.006599,,2500.0,0.1,"{'vect__stop_words': None, 'vect__max_features...",0.833997,0.800797,0.813582,0.816128,0.013679,9,0.932846,0.931516,0.934263,0.932875,0.001122
4,0.333399,0.004365,0.102019,0.005683,english,7500.0,2.0,"{'vect__stop_words': 'english', 'vect__max_fea...",0.871182,0.85259,0.858855,0.860877,0.007727,4,0.984043,0.984043,0.982072,0.983386,0.000929
5,0.309382,0.007715,0.105013,0.008684,english,7500.0,1.0,"{'vect__stop_words': 'english', 'vect__max_fea...",0.87915,0.855246,0.865513,0.866637,0.009795,1,0.983378,0.984043,0.981408,0.982943,0.001119
6,0.339095,0.003503,0.105586,0.003889,english,10000.0,2.0,"{'vect__stop_words': 'english', 'vect__max_fea...",0.873838,0.85259,0.868176,0.864865,0.008988,3,0.984707,0.984043,0.982072,0.983607,0.001119
7,0.267068,0.002429,0.094573,0.006572,english,1000.0,0.1,"{'vect__stop_words': 'english', 'vect__max_fea...",0.818061,0.806109,0.802929,0.809039,0.006515,10,0.916888,0.919548,0.916999,0.917812,0.001229
8,0.315408,0.005662,0.115294,0.007816,,10000.0,0.1,"{'vect__stop_words': None, 'vect__max_features...",0.843293,0.818061,0.825566,0.828977,0.010583,6,0.954122,0.950798,0.947543,0.950821,0.002686
9,0.295355,0.00553,0.096506,0.004703,english,2500.0,1.5,"{'vect__stop_words': 'english', 'vect__max_fea...",0.827357,0.815405,0.822903,0.821887,0.004934,8,0.982713,0.982713,0.981408,0.982278,0.000615


In [16]:
# Hyper-parameter combination selected by the randomized search
print('Best params:', random_search.best_params_)

Best params: {'vect__stop_words': 'english', 'vect__max_features': 7500, 'clf__C': 1.0}


In [17]:
# Predict the held-out test posts with the best pipeline found by the search
predicted = random_search.best_estimator_.predict(twenty_test.data)

# Accuracy = fraction of test posts whose predicted label equals the true label
print('Test accuracy:', (predicted == twenty_test.target).mean())

Test accuracy: 0.8202396804260985


## Additional metrics for multiclass classification

In [18]:
from sklearn import metrics

# Per-class precision, recall and F1 for the most recent predictions, with the
# newsgroup names as row labels.
print(
    metrics.classification_report(
        twenty_test.target,
        predicted,
        target_names=twenty_test.target_names,
    )
)

                        precision    recall  f1-score   support

           alt.atheism       0.77      0.63      0.69       319
         comp.graphics       0.82      0.92      0.87       389
               sci.med       0.88      0.85      0.87       396
soc.religion.christian       0.80      0.84      0.82       398

             micro avg       0.82      0.82      0.82      1502
             macro avg       0.82      0.81      0.81      1502
          weighted avg       0.82      0.82      0.82      1502



In [19]:
# Confusion matrix of the same predictions: one row per true class, one column per predicted class
metrics.confusion_matrix(twenty_test.target, predicted)

array([[200,  22,  24,  73],
       [ 15, 359,  15,   0],
       [ 14,  32, 337,  13],
       [ 32,  23,   7, 336]])