## Feature extraction 1. Text Examples

In [15]:
# Author: Guillaume Lussier <lussier.guillaume@gmail.com>
# base of work http://scikit-learn.org/stable/modules/feature_extraction.html
# Date: Jan2017
# ipython file, kernel 2.7, required modules: sklearn, numpy, pprint, time, logging 

### Section1 :  scikitlearn Extraction from text

In [16]:
# example 1 comes from scikit-learn documentation
# http://scikit-learn.org/stable/modules/feature_extraction.html

# CountVectorizer implements both tokenization and occurrence counting in a single class
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(min_df=1)
vectorizer

#CountVectorizer(analyzer=...'word', binary=False, decode_error=...'strict',
#        dtype=<... 'numpy.int64'>, encoding=...'utf-8', input=...'content',
#        lowercase=True, max_df=1.0, max_features=None, min_df=1,
#        ngram_range=(1, 1), preprocessor=None, stop_words=None,
#        strip_accents=None, token_pattern=...'(?u)\\b\\w\\w+\\b',
#        tokenizer=None, vocabulary=None)

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [17]:
# the data to analyze and the features extracted
corpus = ['This is the first document.', 'This is the second second document.', 'And the third one.', 'Is this the first document?']
X = vectorizer.fit_transform(corpus)
X
#<4x9 sparse matrix of type '<... 'numpy.int64'>'
#    with 19 stored elements in Compressed Sparse ... format>

<4x9 sparse matrix of type '<type 'numpy.int64'>'
	with 19 stored elements in Compressed Sparse Row format>

In [18]:
# an example of analysis
analyze = vectorizer.build_analyzer()
analyze("This is a text document to analyze.") == (['this', 'is', 'text', 'document', 'to', 'analyze'])
#True

True

In [19]:
# the list of features
vectorizer.get_feature_names() == (['and', 'document', 'first', 'is', 'one','second', 'the', 'third', 'this'])
#True

True

In [20]:
# data matrix
X.toarray()           
#array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
#       [0, 1, 0, 1, 0, 2, 1, 0, 1],
#       [1, 0, 0, 0, 1, 0, 1, 1, 0],
#       [0, 1, 1, 1, 0, 0, 1, 0, 1]]...)

array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 1, 0, 1, 0, 2, 1, 0, 1],
       [1, 0, 0, 0, 1, 0, 1, 1, 0],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]], dtype=int64)

In [21]:
vectorizer.vocabulary_.get('document')
#1 'document' is part of the vocabulary learned

1

In [22]:
# counter-example, text unknown to our analyzer
vectorizer.transform(['Something completely new.']).toarray()
#array([[0, 0, 0, 0, 0, 0, 0, 0, 0]]...) these three words are not

array([[0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [23]:
######################################################################################
# BI-GRAMS
# analyzing single words losses the ordering, so working on pairs can help
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1)
analyze = bigram_vectorizer.build_analyzer()
analyze('Bi-grams are cool!') == (['bi', 'grams', 'are', 'cool', 'bi grams', 'grams are', 'are cool'])
#True

True

In [24]:
X_2 = bigram_vectorizer.fit_transform(corpus).toarray()
X_2
#array([[0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0],
#       [0, 0, 1, 0, 0, 1, 1, 0, 0, 2, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0],
#       [1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0],
#       [0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1]]...)

array([[0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0],
       [0, 0, 1, 0, 0, 1, 1, 0, 0, 2, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0],
       [1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0],
       [0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1]], dtype=int64)

In [25]:
feature_index = bigram_vectorizer.vocabulary_.get('is this')
X_2[:, feature_index]
#array([0, 0, 0, 1]...)

array([0, 0, 0, 1], dtype=int64)

In [26]:
######################################################################################
# TERM WEIGHTING
# tf / term frequency
# idf / inverse documentfrequency
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer(smooth_idf=False)
transformer   
#TfidfTransformer(norm=...'l2', smooth_idf=False, sublinear_tf=False,
#                 use_idf=True)

TfidfTransformer(norm=u'l2', smooth_idf=False, sublinear_tf=False,
         use_idf=True)

In [27]:
counts = [[3, 0, 1], [2, 0, 0], [3, 0, 0], [4, 0, 0], [3, 2, 0], [3, 0, 2]]
tfidf = transformer.fit_transform(counts)
tfidf                         
#<6x3 sparse matrix of type '<... 'numpy.float64'>'
#    with 9 stored elements in Compressed Sparse ... format>

<6x3 sparse matrix of type '<type 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [28]:
tfidf.toarray()                        
#array([[ 0.81940995,  0.        ,  0.57320793],
#       [ 1.        ,  0.        ,  0.        ],
#       [ 1.        ,  0.        ,  0.        ],
#       [ 1.        ,  0.        ,  0.        ],
#       [ 0.47330339,  0.88089948,  0.        ],
#       [ 0.58149261,  0.        ,  0.81355169]])

array([[ 0.81940995,  0.        ,  0.57320793],
       [ 1.        ,  0.        ,  0.        ],
       [ 1.        ,  0.        ,  0.        ],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.47330339,  0.88089948,  0.        ],
       [ 0.58149261,  0.        ,  0.81355169]])

In [29]:
transformer = TfidfTransformer()
transformer.fit_transform(counts).toarray()
#array([[ 0.85151335,  0.        ,  0.52433293],
#       [ 1.        ,  0.        ,  0.        ],
#       [ 1.        ,  0.        ,  0.        ],
#       [ 1.        ,  0.        ,  0.        ],
#       [ 0.55422893,  0.83236428,  0.        ],
#       [ 0.63035731,  0.        ,  0.77630514]])

array([[ 0.85151335,  0.        ,  0.52433293],
       [ 1.        ,  0.        ,  0.        ],
       [ 1.        ,  0.        ,  0.        ],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.55422893,  0.83236428,  0.        ],
       [ 0.63035731,  0.        ,  0.77630514]])

In [30]:
transformer.idf_                       
#array([ 1. ...,  2.25...,  1.84...])

array([ 1.        ,  2.25276297,  1.84729786])

In [31]:
# combining CountVectorizer and TfidfTransformer in a single model
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=1)
vectorizer.fit_transform(corpus)
#<4x9 sparse matrix of type '<... 'numpy.float64'>'
#    with 19 stored elements in Compressed Sparse ... format>

<4x9 sparse matrix of type '<type 'numpy.float64'>'
	with 19 stored elements in Compressed Sparse Row format>

In [32]:
######################################################################################
# Performance Improvement - HASHING
# Vectorization of large text corpus
from sklearn.feature_extraction.text import HashingVectorizer
hv = HashingVectorizer(n_features=10)
hv.transform(corpus)
#<4x10 sparse matrix of type '<... 'numpy.float64'>'
#    with 16 stored elements in Compressed Sparse ... format>

<4x10 sparse matrix of type '<type 'numpy.float64'>'
	with 16 stored elements in Compressed Sparse Row format>

### Section2 : some few additional basic examples

In [33]:
# simple CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

vectorizer_a = CountVectorizer(min_df=1)
corpus_ex = ['this is a text o one','this is an text two','this is text three']
X_a = vectorizer_a.fit_transform(corpus_ex)
vectorizer_a.get_feature_names()

[u'an', u'is', u'one', u'text', u'this', u'three', u'two']

In [34]:
X_a.toarray()

array([[0, 1, 1, 1, 1, 0, 0],
       [1, 1, 0, 1, 1, 0, 1],
       [0, 1, 0, 1, 1, 1, 0]], dtype=int64)

In [35]:
analyze_a = vectorizer_a.build_analyzer()
analyze_a("This is a text document to analyze.")

[u'this', u'is', u'text', u'document', u'to', u'analyze']

In [36]:
analyze_a("This text contains one, two, three.")

[u'this', u'text', u'contains', u'one', u'two', u'three']

In [37]:
# TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer_b = TfidfVectorizer(min_df=1)
X_b = vectorizer_b.fit_transform(corpus_ex)
vectorizer_b.get_feature_names()

[u'an', u'is', u'one', u'text', u'this', u'three', u'two']

### Section 3 : Application sklearn.20newsgroups dataset

We will use the scikitlearn capabilities above on a real dataset preloaded in scikitlearn: the 20newsgroups dataset  
"The 20 newsgroups dataset comprises around 18000 newsgroups posts on 20 topics split in two subsets: one for training (or development) and the other one for testing (or for performance evaluation)."

20newgroups info can be found on http://scikit-learn.org/stable/datasets/ section 5.8
The goal is to study the impact of the data filtering by comparing results on:
1. full text set of 20newsgroup
2. removing headers, footers and quotes
3. limiting the data set to smaller sizes (limited work in this file, see fextraction2 for more)

In [38]:
# Vectorization from sklearn.datasets.20newsgroups
# 20newgroups info can be found on http://scikit-learn.org/stable/datasets/ section 5.8
# goal is to study the impact of the data filtering by comparing results on 
# 1. full text set of 20newsgroup
# 2. removing headers, footers and quotes
# 3. limiting the data set to smaller sizes

from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import numpy as np
from pprint import pprint
from time import time

# this is to configure python logging to handle warning messages 
import logging
logging.basicConfig()

# configuration parameters, used to reduce the size of the corpus for training
fetch20_corpus_size = 100

In [39]:
# Categories and Corpus - full training set
print("Loading sklearn.20newsgroup dataset")
t0 = time()
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_train_categories = list(newsgroups_train.target_names)
pprint(newsgroups_train_categories)
print("done in %0.3fs." % (time() - t0))
#done in 0.5..s.

Loading sklearn.20newsgroup dataset
['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']
done in 0.652s.


In [40]:
# corresponding test set
newsgroups_test = fetch_20newsgroups(subset='test', categories=newsgroups_train_categories)

In [41]:
# Categories and Corpus - filtered corpus
print("Loading and filering sklearn.20newsgroup dataset, remove headers, footers, quotes")
t0 = time()
corpus_fetch20 = fetch_20newsgroups(subset='train', shuffle=True, random_state=1, 
                                    categories= newsgroups_train_categories,
                                    remove=('headers', 'footers', 'quotes'))
# for a smaller corpus use the line below
#corpus_samples = corpus_fetch20.data[:fetch20_corpus_size]
# full filtered corpus
corpus_samples = corpus_fetch20.data
print("done in %0.3fs." % (time() - t0))
#done in 2.3..s.

Loading and filering sklearn.20newsgroup dataset, remove headers, footers, quotes
done in 2.335s.


In [42]:
# Vectorizer, full corpus
print("Vectorizer on full data *train* group")
t0 = time()
vectorizer_fetch20_full = TfidfVectorizer()
vectors_full = vectorizer_fetch20_full.fit_transform(newsgroups_train.data)
pprint(vectors_full.shape)
print("done in %0.3fs." % (time() - t0))
#(11314, 130107)

Vectorizer on full data *train* group
(11314, 130107)
done in 4.921s.


In [43]:
# Vectorizer, filtered corpus
print("Vectorizer on sampled data *train* group")
t0 = time()
vectorizer_fetch20_samples = TfidfVectorizer()
vectors_samples = vectorizer_fetch20_samples.fit_transform(corpus_samples)
pprint(vectors_samples.shape)
print("done in %0.3fs." % (time() - t0))
#(100, 5647)

Vectorizer on sampled data *train* group
(11314, 101631)
done in 3.188s.


In [44]:
# Classifier 'multinomial Naive Bayes' on full 20newsgroups set
print("Creating Naive Bayes classifier on full data group")
t0 = time()
classifier_fetch20_full = MultinomialNB(alpha=.01)
classifier_fetch20_full.fit(vectors_full, newsgroups_train.target)
print("done in %0.3fs." % (time() - t0))

# test vector for classifier evaluation
print("Vectorizer on full data -test- group")
t0 = time()
vectors_test_full = vectorizer_fetch20_full.transform(newsgroups_test.data)
pprint(vectors_test_full.shape)
print("done in %0.3fs." % (time() - t0))
#(7532, 130107)

Creating Naive Bayes classifier on full data group
done in 0.210s.
Vectorizer on full data -test- group
(7532, 130107)
done in 2.745s.


In [45]:
# F1 score for full set 
print("F1 Score on full set")
pred_full = classifier_fetch20_full.predict(vectors_test_full)
metrics.f1_score(newsgroups_test.target, pred_full, average='macro')
#0.82906596444740432

F1 Score on full set


0.82906596444740432

In [46]:
# Classifier 'multinomial Naive Bayes' on trimed filtered corpus 20newsgroups set
print("Creating Naive Bayes classifier on sampled data group")
t0 = time()
classifier_fetch20_samples = MultinomialNB(alpha=.01)
# for a smaller corpus use the line below
#classifier_fetch20_samples.fit(vectors_samples, corpus_fetch20.target[:fetch20_corpus_size])
classifier_fetch20_samples.fit(vectors_samples, corpus_fetch20.target)
print("done in %0.3fs." % (time() - t0))

# test vector for classifier evaluation
print("Vectorizer on sampled data -test- group")
t0 = time()
vectors_test_samples = vectorizer_fetch20_samples.transform(newsgroups_test.data)
pprint(vectors_test_samples.shape)
print("done in %0.3fs." % (time() - t0))
#(7532, 5647)

Creating Naive Bayes classifier on sampled data group
done in 0.165s.
Vectorizer on sampled data -test- group
(7532, 101631)
done in 2.592s.


In [47]:
# F1 score for filtered set 
print("F1 Score on sampled/filtered set")
pred_samples = classifier_fetch20_samples.predict(vectors_test_samples)
metrics.f1_score(newsgroups_test.target, pred_samples, average='macro')
#0.2410889603950605 for 100 samples
#0.77414478112872853 for full filtered

F1 Score on sampled/filtered set


0.77414478112872853

### Section 3 Result Comparison and Looking at Top10s

Data comparison on the 20newsgroups results.

In [48]:
# some example for table display, requires tabletext module
#import tabletext
#table_example = [[1,2,30], [4,23125,6], [7,8,999]]
#print tabletext.to_text(data)

In [49]:
#Loading Time, Vectorizer Train, Classifier NB, Vectorizer Test
results_times_fullset=[0.732, 5.145, 0.217, 2.667]
results_times_filteredset=[2.452, 3.568, 0.165, 2.683]
results_times_filteredset100=[2.296, 0.070, 0.006, 2.751]

#Vectorizer Train, Vectorizer Test
results_sizes_fullset=[[11314, 130107], [7532, 130107]]
results_sizes_filteredset=[[11314, 101631], [7532, 101631]]
results_sizes_filteredset100=[[100, 5647], [7532, 5647]]

#F1 scores
results_f1_fullset=[0.829]
results_f1_filteredset=[0.774]
results_f1_filteredset100=[0.241]
# filtering doesn't lose to much F1 Score, but reducing the data set to 100 elements brings a strong degadation

If we compare the F1 scores of our 3 classifiers tested on the test set, the complete set has a 0.83 score, the filtered complete set has a 0.77 score, the filtered strongly reduced set only a 0.24 score. It should be mentioned the reduced set is less than 1% of the size of the full training set.

The calculation times for the strongly reduced set are also much smaller, about 50 times less than the full filtered set for the classifier and vectorizer on the training sets of both.


##### Note about F1-score (or F-score)
F1 score is a measure of the estimator accuracy, it considers both the precison p and recall r to compute the score. It varies between 0 (worst) and 1 (best).  
- precision, p: number of correct positive results (true positive, TP) divided by the number of all positive results  (true positive + false positive, TP + FP)  
- recall, r: number of correct positive results (true positive, TP) divided by the number of results that should be positive (true positive + false negative, TP + FN)  

Another classic measure is Accuracy:
- accuracy, acc: number of correct positive results (true positive, TP) added to the number of correct negative results (true negative, TN) divided by the total population (all results, TP+FP+TN+FN) 



#### top10 representation, most informative features, also used to evaluate overfit

In [50]:
# function to provide the top 10 features (words) of a category for the provided classifier 
def show_top10(classifier, vectorizer, categories):
    feature_names = np.asarray(vectorizer.get_feature_names())
    for i, category in enumerate(categories):
        top10 = np.argsort(classifier.coef_[i])[-10:]
        print("%s: %s" % (category, " ".join(feature_names[top10])))

In [51]:
show_top10(classifier_fetch20_full, vectorizer_fetch20_full, newsgroups_train.target_names)

alt.atheism: keith it and you in that is to of the
comp.graphics: edu in for it is and graphics of to the
comp.os.ms-windows.misc: file for of and edu is it to the windows
comp.sys.ibm.pc.hardware: card ide is of it drive and scsi to the
comp.sys.mac.hardware: in it is and of edu apple mac to the
comp.windows.x: it mit in motif and is of window to the
misc.forsale: shipping offer of 00 to and edu the for sale
rec.autos: that is you it in of and to car the
rec.motorcycles: dod you it com in of and bike to the
rec.sport.baseball: that is baseball and of in to he edu the
rec.sport.hockey: ca game he team and hockey of in to the
sci.crypt: chip that encryption is and clipper key of to the
sci.electronics: for edu you it in is and of to the
sci.med: edu pitt that it in and is to of the
sci.space: it that is nasa in and to of space the
soc.religion.christian: we it in and is god that to of the
talk.politics.guns: it is you that gun and in of to the
talk.politics.mideast: is you israeli that 

In [52]:
show_top10(classifier_fetch20_samples, vectorizer_fetch20_samples, corpus_fetch20.target_names)

alt.atheism: not in and it you is that of to the
comp.graphics: you in graphics it is for of and to the
comp.os.ms-windows.misc: file of you for and is it to windows the
comp.sys.ibm.pc.hardware: with scsi for of drive is it and to the
comp.sys.mac.hardware: that apple for of mac it and is to the
comp.windows.x: for this it in of is and window to the
misc.forsale: or in shipping offer 00 to and sale the for
rec.autos: is that in it of you and to car the
rec.motorcycles: for that in of you it and bike to the
rec.sport.baseball: year was is that of in and to he the
rec.sport.hockey: hockey team that game of he and in to the
sci.crypt: in be it is that key and of to the
sci.electronics: that for in it you is and of to the
sci.med: this you that in it and is to of the
sci.space: for that it is in and space of to the
soc.religion.christian: you it in god and is that to of the
talk.politics.guns: it gun is you in and that of to the
talk.politics.mideast: it is israel that you in and to of th

In [53]:
# common words in 2 lists, keeping list1 ordering
def cwl1(list1, list2):
    list3 = set(list1)&set(list2) # list3 doesn't need to be a list itself
    list4 = sorted(list3, key = lambda k : list1.index(k))
    return list4

In [54]:
list1= ["one", "two", "three"]
list2= ["two", "one", "four"]
list3=cwl1(list1, list2)
list3

['one', 'two']

In [55]:
# cwl with list comprehension (faster for big lists)
# both implementation keep ordering from list1
def cwl(list1, list2):
    s =set(list2)
    list3 = [x for x in list1 if x in s]
    return(list3)

In [56]:
list1= ["one", "two", "three"]
list2= ["two", "one", "four"]
list3=cwl(list1, list2)
list3

['one', 'two']

In [57]:
# function to extract common features (words) for the same category with two classifiers
# cat1 and cat2 should be the same or this will not work
# cwl is used as feature_names lists do not have an attribute index used in cwl1
def common_top10(class1, vect1, cat1, class2, vect2):
    feature_names_1 = np.asarray(vect1.get_feature_names())
    feature_names_2 = np.asarray(vect2.get_feature_names())
    for i, category in enumerate(cat1):
        top10_1 = np.argsort(class1.coef_[i])[-10:]
        top10_2 = np.argsort(class2.coef_[i])[-10:]
        print("%s: %s: %s" % ("Set1 ", category, " ".join(feature_names_1[top10_1])))
        print("%s: %s: %s" % ("Set2 ", category, " ".join(feature_names_2[top10_2])))
        print("%s: %s: %s" % ("Commons ", category, " ".join(cwl(feature_names_1[top10_1], feature_names_2[top10_2]))))
        print("%s" % (" ")) # just for skipping a line between each category

In [58]:
common_top10(classifier_fetch20_full, vectorizer_fetch20_full, newsgroups_train.target_names, classifier_fetch20_samples, vectorizer_fetch20_samples)

Set1 : alt.atheism: keith it and you in that is to of the
Set2 : alt.atheism: not in and it you is that of to the
Commons : alt.atheism: it and you in that is to of the
 
Set1 : comp.graphics: edu in for it is and graphics of to the
Set2 : comp.graphics: you in graphics it is for of and to the
Commons : comp.graphics: in for it is and graphics of to the
 
Set1 : comp.os.ms-windows.misc: file for of and edu is it to the windows
Set2 : comp.os.ms-windows.misc: file of you for and is it to windows the
Commons : comp.os.ms-windows.misc: file for of and is it to the windows
 
Set1 : comp.sys.ibm.pc.hardware: card ide is of it drive and scsi to the
Set2 : comp.sys.ibm.pc.hardware: with scsi for of drive is it and to the
Commons : comp.sys.ibm.pc.hardware: is of it drive and scsi to the
 
Set1 : comp.sys.mac.hardware: in it is and of edu apple mac to the
Set2 : comp.sys.mac.hardware: that apple for of mac it and is to the
Commons : comp.sys.mac.hardware: it is and of apple mac to the
 
Set1 :

The results show that we cannot get much information from this rough analysis, filtering helps removing some non-meaningful data like email addresses but most of the remaining words are language basics as in the talk.politics.misc commonalities "it is you and in that of to the".
We would need to create larger top sets, compare them and remove the words found in each topic, that would bring us the meaningul words specific to topics and not shared by the langue used.

In [59]:
# get cwl from all categories with top20 (cwl of all cwls)
def global_common_top(n, classifier, vectorizer, categorizer):
    feature_names = np.asarray(vectorizer.get_feature_names())
    common_top = []
    for i, category in enumerate(categorizer):
        top = np.argsort(classifier.coef_[i])[-n:]
        print("%s: %s: %s" % ("Classifier ", category, " ".join(feature_names[top])))
        if (len(common_top) == 0):
            common_top = feature_names[top]
        else:
            common_top = cwl(common_top, feature_names[top])
        print("%s: %s: %s" % ("Commons ", category, " ".join(common_top)))
        print("%s" % (" ")) # just for skipping a line between each category
        
    return common_top

In [60]:
commons_top20 = global_common_top(20, classifier_fetch20_samples, vectorizer_fetch20_samples, corpus_fetch20.target_names)
pprint(commons_top20)

Classifier : alt.atheism: they do for what have as god this be are not in and it you is that of to the
Commons : alt.atheism: they do for what have as god this be are not in and it you is that of to the
 
Classifier : comp.graphics: be thanks image or can any have this on that you in graphics it is for of and to the
Commons : comp.graphics: for have this be in and it you is that of to the
 
Classifier : comp.os.ms-windows.misc: but files this can on dos with have that in file of you for and is it to windows the
Commons : comp.os.ms-windows.misc: for have this in and it you is that of to the
 
Classifier : comp.sys.ibm.pc.hardware: controller bus this in my on card you that have with scsi for of drive is it and to the
Commons : comp.sys.ibm.pc.hardware: for have this in and it you is that of to the
 
Classifier : comp.sys.mac.hardware: can drive my if this on have in you with that apple for of mac it and is to the
Commons : comp.sys.mac.hardware: for have this in and it you is that of t

Results from the application of the global common to all categories top20 is below:
we go from "for have this be in and it you is that of to the" after 2 categories to "for in and it you is of to the" at the end of all categories.
We can see that some very common terms have still been discarded like "be", or "this". We should extend the top list.

In [61]:
commons_top50 = global_common_top(50, classifier_fetch20_samples, vectorizer_fetch20_samples, corpus_fetch20.target_names)
pprint(commons_top50)

Classifier : alt.atheism: islam which some just say with about religion would who by all there think no atheism can don one on he people or was we an so your but if they do for what have as god this be are not in and it you is that of to the
Commons : alt.atheism: islam which some just say with about religion would who by all there think no atheism can don one on he people or was we an so your but if they do for what have as god this be are not in and it you is that of to the
 
Classifier : comp.graphics: ftp do out some about software need hi at please does looking not am format 3d from as know anyone would me program file are there files with but if be thanks image or can any have this on that you in graphics it is for of and to the
Commons : comp.graphics: some with about would there can on or but if do for have as this be are not in and it you is that of to the
 
Classifier : comp.os.ms-windows.misc: am does cica at get as version know me anyone from will using ftp program problem 

Extending to a top50 we reach the following common terms: "with on or if for have this are in and it you is of to the".
Between top20 and top50 were added "with on or if have this are" which are common terms that should be ignored.

In [67]:
# different words in 2 lists, with list comprehension (faster for big lists)
def dwl(list1, list2):
    return([x for x in (set(list1)^set(list2))])

In [68]:
list1= ["one", "two", "three"]
list2= ["two", "one", "four"]
list3=dwl(list1, list2)
list3

['four', 'three']

In [69]:
# for each filtered result, get top20 that are =/ from global cwl defined as commons_top50 above
def extracted_common_top(n, classifier, vectorizer, categorizer, commons):
    feature_names = np.asarray(vectorizer.get_feature_names())
    for i, category in enumerate(categorizer):
        top = np.argsort(classifier.coef_[i])[-n:]
        print("%s: %s" % (category, " ".join(dwl(feature_names[top], commons))))

In [70]:
extracted_common_top(20, classifier_fetch20_samples, vectorizer_fetch20_samples, corpus_fetch20.target_names, commons_top50)

alt.atheism: do they be as not with if on what that god or
comp.graphics: be that graphics are with any if image can thanks
comp.os.ms-windows.misc: files file that but are if windows can dos or
comp.sys.ibm.pc.hardware: that bus controller are card if drive scsi my or
comp.sys.mac.hardware: apple that mac are drive can my or
comp.windows.x: be motif that an are if server window can or
misc.forsale: offer are condition if me on 00 this please sale shipping new
rec.autos: that cars they if was car my or
rec.motorcycles: that are your if me my dod bike was or
rec.sport.baseball: be his they that baseball are year with if on this team he was or at
rec.sport.hockey: be play they that game are have with if this players hockey team he was or
sci.crypt: be key that as are have not with if clipper they encryption or chip
sci.electronics: be that an there
sci.med: be that but as not if on msg my or
sci.space: be they that as have with if would space nasa was or
soc.religion.christian: be we tha

original top 10 : sci.space: for that it is in and space of to the  
filtered top 20 : sci.space: be they that as have with if would space nasa was or  

As can be seen the filtered top 20 is 2 words longer but much more significant even if some words are still very generic.  
We will see in fextraction2 (next experience file in this batch) how to reach the same (and even much better) results with inverse document frequency vectorizer (we used it without any parameter here, which is not different than a CountVectorizer).

next to do:
1. study overfit vs. performance vs. f1 scores
2. work with tf-idf to improve results, compare the results with the post-classifier treatment discussed above (removing global commons) (tf-idf was used for the vectorizer already in this example file we need to compare with simple tf)
3. study results between one big classifier (whole set) and multiple small classifiers merged together (probably less overfit but F1 score should be looked at) 