In [8]:
import codecs
import pandas as pd 
import numpy as np 
import nltk 
import re
import string
from nltk.stem.snowball import FrenchStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize.texttiling import TextTilingTokenizer
from nltk.collocations import *
import urllib
import requests
import json
import unicodedata
import os
from nltk.stem import WordNetLemmatizer
%matplotlib inline

## Construction d'un classifieur de texte : 

**Motivation :**


Le texte mining / l'extraction des informations clés d'un corpus (groupe de documents textes) est l'une des majeures applications du Data science. 

Il est nécessaire de maitriser les outils du traitement de langage automatique pour répondre à des problématiques business (Analyser les comportements d’internautes, améliorer l'image de marque d'une entreprise...) ou à des problématiques de recherche d'informations (construction d'une base de connaissances, moteurs de recherches…)


**Enjeux : **

- L'extraction d'information nécessite d'abord la récupération du corpus. De plus, le corpus obtenu est souvent bruité; Il contient des mots insignifiants (balises html, stopwords...) qu'il faut éliminer.

- Souvent, pour le domaine business  nous avons besoin d'identifier les noms des entités (nom des marques, des produits..). 

- Une fois le corpus est nettoyé, nous devons définir une représentation (un mapping) des mots des documents.

        

- Définir l'heuristique pour établir un clustering du corpus. 

- Déterminer l'algorithme d'apprentissage à utiliser pour la classification des textes


** Présentation des données:**

- Nous utilisons le dataset 20Newgroups de Jason Rennie : http://qwone.com/~jason/20Newsgroups/ 

In [9]:
os.chdir("/Users/sara/Documents/Projet_Etaonis/Classif_text")

In [10]:
with open("/Users/sara/Documents/Projet_Etaonis/Classif_text/20ng-train-all-terms-2.txt", "r") as f:text_train=f.read()

In [11]:
with open("/Users/sara/Documents/Projet_Etaonis/Classif_text/20ng-test-all-terms-2.txt", "r") as f:text_test=f.read()

- Chaque paragraphe du fichier représente un document. On va ainsi diviser le fichier en paragraphes:

In [12]:
docs_train = re.split('\n{1,}',text_train)

In [13]:
docs_test = re.split('\n{1,}',text_test)

*Nombre Total des docuements : Train & Test *

In [14]:
[len(docs_train),len(docs_test)]

[11294, 7529]

*Exemple:*

Chaque paragraphe est composé d'un mot représentant la classe du document, d'une tabulation puis du texte du document.

In [15]:
docs_train[10]

'alt.atheism\tre pompous ass livesey solntze wpd sgi com jon livesey writes how long does it the motto have to stay around before it becomes the default where s the cutoff point i don t know where the exact cutoff is but it is at least after a few years and surely after years why does the notion of default not take into account changes in population makeup specifically which changes are you talking about are you arguing that the motto is interpreted as offensive by a larger portion of the population now than years ago keith'

*Nous définisson une expression régulière capable de détecter les classes des documents:*

In [16]:
regex = re.compile(r"[A-Za-z]+\.+[A-Za-z]+\.+[A-Za-z]*|[A-Za-z]+\.+[A-Za-z]*")

In [17]:
regex.findall(docs_train[10000])

['talk.politics.mideast']

In [18]:
regex.findall(docs_train[10])

['alt.atheism']

In [19]:
regex.findall(docs_train[950])

['comp.graphics']

**Construction de la base d'apprentissage:**

In [20]:
docs_train[10].split('\t')

['alt.atheism',
 're pompous ass livesey solntze wpd sgi com jon livesey writes how long does it the motto have to stay around before it becomes the default where s the cutoff point i don t know where the exact cutoff is but it is at least after a few years and surely after years why does the notion of default not take into account changes in population makeup specifically which changes are you talking about are you arguing that the motto is interpreted as offensive by a larger portion of the population now than years ago keith']

In [21]:
s = []
for text in docs_train : 
    s.append(text.split('\t'))

In [22]:
s[10]

['alt.atheism',
 're pompous ass livesey solntze wpd sgi com jon livesey writes how long does it the motto have to stay around before it becomes the default where s the cutoff point i don t know where the exact cutoff is but it is at least after a few years and surely after years why does the notion of default not take into account changes in population makeup specifically which changes are you talking about are you arguing that the motto is interpreted as offensive by a larger portion of the population now than years ago keith']

In [23]:
d_train = pd.DataFrame(s, columns=['Classe','Texte'])

In [24]:
d_train = d_train[d_train['Classe']!='']

In [25]:
d_train.head()

Unnamed: 0,Classe,Texte
0,alt.atheism,alt atheism faq atheist resources archive name...
1,alt.atheism,alt atheism faq introduction to atheism archiv...
2,alt.atheism,re gospel dating in article mimsy umd edu mang...
3,alt.atheism,re university violating separation of church s...
4,alt.atheism,re soc motss et al princeton axes matching fun...


- Nombre de classes : 

In [26]:
d_train['Classe'].nunique()

20

- Nombre de documents pour chaque classe:

In [27]:
d_train.groupby('Classe').count()

Unnamed: 0_level_0,Texte
Classe,Unnamed: 1_level_1
alt.atheism,480
comp.graphics,584
comp.os.ms-windows.misc,572
comp.sys.ibm.pc.hardware,590
comp.sys.mac.hardware,578
comp.windows.x,593
misc.forsale,585
rec.autos,594
rec.motorcycles,598
rec.sport.baseball,597


**Construction de la base de test:**

In [28]:
s = []
for text in docs_test : 
    s.append(text.split('\t'))
d_test = pd.DataFrame(s, columns=['Classe','Texte'])

In [29]:
d_test = d_test[d_test['Classe']!='']

In [30]:
d_test.groupby('Classe').count()

Unnamed: 0_level_0,Texte
Classe,Unnamed: 1_level_1
alt.atheism,319
comp.graphics,389
comp.os.ms-windows.misc,394
comp.sys.ibm.pc.hardware,392
comp.sys.mac.hardware,385
comp.windows.x,392
misc.forsale,390
rec.autos,395
rec.motorcycles,398
rec.sport.baseball,397


##Notre approche:

Afin de répondre à notre problématique nous allons traiter le problème selon trois principales étapes : 

   - *1ère étape :* Traitement du corpus 
       - Nettoyage du texte 
       - détection des mots-clés
            
            
   - *2ème étape :* Clustering 
       - Construction de la matrice de vectorisation 
       - définition des metriques
       - algorithmes de clustering 
       
       
   - *3ème étape :* Construction du classifieur 
       - Utilisation des résultats du clustering pour créer les labels 
       - Construction de la base d'apprentissage 
       - Entraiment du modèle : Naive Bayes / Max Entropy / Decision Tree 
       - Pickle : pour sauvegrader le modèle 
       - Test du modèle : cross-valid / précision / rappel 
       
Finalement nous allons comparer les performances des différents modèles 
      


**fzg** 
$\

## Implémentation de la 1ère étape : détection des mots-clés

*Traitement du texte : *
Pour un texte donner

In [31]:
def chunk_mot(phrases):
    """split text to a set of words : bag-of-words"""
    mots = nltk.word_tokenize(phrases) 
    return mots 

def no_punctuation(mots):
    """return the set of words without puntctuation marks """
    regex =  re.compile('[%s]' % re.escape(string.punctuation))
    mots_no_punctuation = []
    for token in mots: 
        new_token = regex.sub(u'', token)
        if not new_token == u'':
            mots_no_punctuation.append(new_token.lower()) #lower pour normaliser le texte
    return mots_no_punctuation

def delete_stop_words(mots):
    """return the set of words without insignifiant words like : le , de, ces, the... 
    """
    stops= set(stopwords.words('french')+stopwords.words('english'))
    mots_no_stopwords = []
    for mot in mots:
        if not mot in stops:
                mots_no_stopwords.append(mot)
    return mots_no_stopwords

def word_stemmer(mots):
    snowball = SnowballStemmer('english') # for english use Porter stem algo
    preprocessed_docs = []
    for mot in mots:
        preprocessed_docs.append(snowball.stem(mot))   
    return preprocessed_docs

def word_lemmatizer(mots): 
    lemmat = WordNetLemmatizer()
    preprocessed_docs = []
    for mot in mots:
        preprocessed_docs.append(lemmat.lemmatize(mot))   
    return preprocessed_docs 
    
def nettoyer_texte(text, lemma = False, Stemm = False ): 
    text_chunk = chunk_mot(text)
    text_punct = no_punctuation(text_chunk)
    text_f = delete_stop_words(text_punct)
    if lemma == True :
        text_f = word_lemmatizer(text_f)
    if Stemm == True : 
        text_f = word_stemmer(text_f)
    #reconstruct the text
    textn = ' '.join(text_f)
    return textn
    

*exemple de nettoyage de texte :* 

In [32]:
nettoyer_texte(d_train['Texte'][11])

're keith schneider stealth poster sandvik newton apple com kent sandvik writes borrow philosophy truly understand color red seen true even experienced color red still might different interpretation wouldn know red certainly couldn judge subjectively objectivity applicable since wanting discuss merits red keith'

In [33]:
nettoyer_texte(d_train['Texte'][11], Stemm = True)

u're keith schneider stealth poster sandvik newton appl com kent sandvik write borrow philosophi truli understand color red seen true even experienc color red still might differ interpret wouldn know red certain couldn judg subject object applic sinc want discuss merit red keith'

** Nettoyage de notre dataset:**

Pour l'analyse et la construction de nos classifieurs, nous allons utiliser la base des textes "Stemmed"  sans stopwords et ponctuations.

In [34]:
d_train['Texte']=d_train['Texte'].apply(lambda x : nettoyer_texte(x, Stemm=True))

In [35]:
d_train.head()

Unnamed: 0,Classe,Texte
0,alt.atheism,alt atheism faq atheist resourc archiv name at...
1,alt.atheism,alt atheism faq introduct atheism archiv name ...
2,alt.atheism,re gospel date articl mimsi umd edu mango cs u...
3,alt.atheism,re univers violat separ church state dmn keple...
4,alt.atheism,re soc motss al princeton axe match fund boy s...


In [36]:
vocab= d_train['Texte'].apply(lambda x : x.split())

## Construction de la matrice : Bag of words 

Bag of words of the training set 

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_df=0.8, min_df=3)
%time
X_train=vectorizer.fit_transform(d_train['Texte'])

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.01 µs


Bag of words of the test set using the same vectorizer 

In [38]:
X_test = vectorizer.transform(d_test['Texte'])

Labels des X_train et X_test 

In [39]:
Y_train = np.array(d_train['Classe'])

In [40]:
Y_test = np.array(d_test['Classe'])

Get the features Name 

In [41]:
features_names = vectorizer.get_feature_names()

In [42]:
feature_names = np.asarray(features_names)

#réduction des features : détection des mot-clés sans perdre de l'information sur la catégorie

Noud allons utiliser deux différentes méthodes de selection et nous allons par la suite la performance de ces dernières sur la classification.

##Information Gain:

définition : 

La formule est donnée par : $$IG(term:t)= -\sum_{i}P_r(C_i)logP_r(C_i) + P_r(t)\sum_{i}P_r(C_i|t)logP_r(C_i|t)+P_r(t^c)\sum_{i}P_r(C_i|t^c)logP_r(C_i|t^c) $$

$P_r(t) = \frac{|D_t|}{|D|}$ where $D_t$ : the training set containing term t and $D$ : the hole train set

$P_r(C_i|t) = \frac{Occurence(c_k assign to t)}{Occurence of t ine the training set}$


Implémentation 

In [None]:
def information_gain1(x, y):
    #Define the entropy of a distribution of categories 
    def _entropy1(categories):
        """compute the entropy of the categories 
        Input : 1-D array of labels
        retun : the value of the entropy"""
        #Count the occurence of each categorie:
        counts = np.bincount(categories)
        #define the proba of a categorie 
        proba = counts[np.nonzero(counts)] / float(len(categories))
        return - np.sum(proba*np.log(proba))
    for feature in x.T:
        information_gain_scores.append(_information_gain(feature, y))
    return information_gain_scores, []
    
    
    def _information_gain1(feat,y):
        #Define the entropy of  categories for a given t : 
        feature_t_indices = np.nonzero(feat)[1]
        #define the proba of the presence of a term t in the set 
        card_t = np.count_nonzero(feat)
        proba_t = np.count_nonzero(feat) / X.shape[0]
        categories = y[feature_t_indices]
        counts = np.bincount(categories)
        occ_t = sum(feat)
        proba = counts / occ_t
        entropy_t = np.sum(proba* np.log(proba))
        
        #Define the entropy of  categories with the absence of t :
        feature_not_t_indices = [i for i in range(X.shape[0]) if i not in feature_t_indices]
        categories_not_t = y[feature_not_t_indices]
        counts1 = np.bincount(categories_not_t)
        proba1 = counts1 / len(categories_not_t)
        entropy_no_t = np.sum(proba1* np.log(proba1))
        
        return _entropy(y) + entrop_t + entropy_no_t
    
    information_gain_scores = []
    for feat in X.T:
        information_gain_scores.append(_information_gain(feat, y))
    return information_gain_scores, []


In [408]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(Y_train)
print(le.classes_)
y=le.transform(Y_train)

['alt.atheism' 'comp.graphics' 'comp.os.ms-windows.misc'
 'comp.sys.ibm.pc.hardware' 'comp.sys.mac.hardware' 'comp.windows.x'
 'misc.forsale' 'rec.autos' 'rec.motorcycles' 'rec.sport.baseball'
 'rec.sport.hockey' 'sci.crypt' 'sci.electronics' 'sci.med' 'sci.space'
 'soc.religion.christian' 'talk.politics.guns' 'talk.politics.mideast'
 'talk.politics.misc' 'talk.religion.misc']


In [409]:
def information_gain(x, y):
    def _entropy(values):
        counts = np.bincount(values)
        probs = counts[np.nonzero(counts)] / float(len(values))
        return - np.sum(probs * np.log(probs))
    def _information_gain(feature, y):
        feature_set_indices = np.nonzero(feature)[1]
        feature_not_set_indices = [i for i in feature_range if i not in feature_set_indices]
        entropy_x_set = _entropy(y[feature_set_indices])
        entropy_x_not_set = _entropy(y[feature_not_set_indices])
        
        proba_t = np.count_nonzero(feature) / x.shape[0]

        return entropy_before - (proba_t * entropy_x_set) + (proba_t * entropy_x_not_set)

    feature_size = x.shape[0]
    feature_range = range(0, feature_size)
    entropy_before = _entropy(y)
    information_gain_scores = []

    for feature in x.T:
        information_gain_scores.append(_information_gain(feature, y))
    return information_gain_scores, []

In [410]:
I=information_gain(X_train, y)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all().

##RandomForest for IG Maximization 

In [434]:
from sklearn.ensemble import RandomForestClassifier
feat_labels = feature_names
forest = RandomForestClassifier(criterion='entropy',
                                n_estimators=1000,
                                   random_state=0,
                                   n_jobs=-1)
%time
forest.fit(X_train, Y_train)
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30,
                        importances[indices[f]],importances[indices[f]]))

CPU times: user 2 µs, sys: 3 µs, total: 5 µs
Wall time: 4.05 µs
 1) aa                             0.009793
 2) aaa                            0.008740
 3) aachen                         0.006956
 4) aad                            0.006900
 5) aamir                          0.006300
 6) aamrl                          0.006043
 7) aardvark                       0.005897
 8) aargh                          0.005760
 9) aarghhhh                       0.005403
10) aario                          0.005377
11) aarnet                         0.005178
12) aaron                          0.005121
13) aau                            0.005079
14) ab                             0.004822
15) abacus                         0.004685
16) abad                           0.004552
17) abandon                        0.004507
18) abat                           0.004157
19) abber                          0.004006
20) abbey                          0.003996
21) abbot                          0.003764
22) abbott  

In [435]:
importances

array([  5.72744752e-05,   3.53083225e-05,   1.67975214e-06, ...,
         1.33396937e-06,   7.71143364e-06,   2.36228472e-06])

In [438]:
indices[:10]

array([15961, 21679, 21879,  2783, 20763,  7846,  1044, 16954,  5423,  3296])

In [443]:
[importances[indices[:10]]
,feature_names[indices[:10]]]

[array([ 0.0097934 ,  0.00873982,  0.00695646,  0.0068996 ,  0.00630029,
         0.00604338,  0.00589707,  0.0057603 ,  0.0054028 ,  0.00537719]),
 array([u're', u'window', u'write', u'car', u'use', u'god', u'articl',
        u'sale', u'dod', u'christian'], 
       dtype='<U78')]

In [460]:
def IG_import(X_train,y_train, feature_names , k, n_estimators):
    """
    this function aims to select the k-best features based on the maximization of the information gain. We use the 
    RandomForest Classifier for that. 
    Arguments : 
        X_train,y_train : the Training set
        feature_names : the names of the features 
        k: the number of feature we want to select 
        n_estimators : the depth of the RandomForest  
    Returns : 
    two arrays : The first array contains the IG value of the best-features. 
                 The second one returns the k-best features.
    """
    forest = RandomForestClassifier(
                                n_estimators=n_estimators,
                                   random_state=0,
                                   n_jobs=-1)
    forest.fit(X_train, Y_train)
    importances = forest.feature_importances_
    indices = np.argsort(importances)[::-1]
    return [importances[indices[:k]],feature_names[indices[:k]]]

In [463]:
IG_import(X_train,Y_train, feature_names , 30, 100)

[array([ 0.00902507,  0.00821174,  0.00746222,  0.00730419,  0.00670511,
         0.00532208,  0.00530269,  0.00463321,  0.00436711,  0.00432745,
         0.00423719,  0.00421498,  0.00418285,  0.00416212,  0.00411486,
         0.00399102,  0.00378767,  0.00377959,  0.0036096 ,  0.00358028,
         0.00336956,  0.00325256,  0.00313999,  0.00306674,  0.00290846,
         0.00285797,  0.00284219,  0.00279825,  0.00276727,  0.00272368]),
 array([u'car', u'bike', u'dod', u'sale', u'window', u'gun', u'space',
        u'clipper', u'team', u're', u'god', u'christian', u'basebal',
        u'encrypt', u'mac', u'hockey', u'isra', u'graphic', u'ride',
        u'game', u'israel', u'motorcycl', u'key', u'nhl', u'use', u'atho',
        u'appl', u'rutger', u'write', u'govern'], 
       dtype='<U78')]

##Chi-2 measure:

The theoric formula is given by : 
the idea is to compute the statistics of independance of a given term and categorie 

We will use the scikit-learn module 

In [221]:
from sklearn.feature_selection import SelectKBest, chi2
#Select the  k-best chi-score term : 
def chi2_n(X_train,y_train,X_test, feature_names , k):
    print("Extracting %d best features by a chi-squared test" %k)
    ch2 = SelectKBest(chi2, k=k)
    X_train = ch2.fit_transform(X_train, y_train)
    X_test = ch2.transform(X_test)
    # keep selected feature names
    feature_names = [feature_names[i] for i
                    in ch2.get_support(indices=True)]
    return [X_train,X_test]

In [222]:
chi2_n(X_train,Y_train,X_test, feature_names , 10000)

Extracting 10000 best features by a chi-squared test


[<11293x10000 sparse matrix of type '<type 'numpy.float64'>'
 	with 738333 stored elements in Compressed Sparse Row format>,
 <7528x10000 sparse matrix of type '<type 'numpy.float64'>'
 	with 307124 stored elements in Compressed Sparse Row format>]

In [223]:
chi2_n(X_train,Y_train,X_test, feature_names , 1000)[1]

Extracting 1000 best features by a chi-squared test


<7528x1000 sparse matrix of type '<type 'numpy.float64'>'
	with 52528 stored elements in Compressed Sparse Row format>

In [224]:
Xtrain = chi2_n(X_train,Y_train,X_test, feature_names , 10000)[0]

Extracting 10000 best features by a chi-squared test


In [225]:
Xtrain.shape

(11293, 10000)

In [228]:
Xtest = chi2_n(X_train,Y_train,X_test, feature_names , 10000)[0]

Extracting 10000 best features by a chi-squared test


##SBS 

In [404]:
from sklearn.base import clone
from itertools import combinations
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score

class SBS():
    def __init__(self, estimator, k_features,
        scoring=accuracy_score,
        test_size=0.25, random_state=1):
        self.scoring = scoring
        self.estimator = clone(estimator)
        self.k_features = k_features
        self.test_size = test_size
        self.random_state = random_state
        
    def fit(self, X, y):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=self.test_size,
                                    random_state=self.random_state)
        dim = X_train.shape[1]
        self.indices_ = tuple(range(dim))
        self.subsets_ = [self.indices_]
        score = self._calc_score(X_train, y_train, X_test, y_test, self.indices_)
        self.scores_ = [score]
        while dim > self.k_features:
            scores = []
            subsets = []
            for p in combinations(self.indices_, r=dim-1000):
                score = self._calc_score(X_train, y_train,
                                            X_test, y_test, p)
                scores.append(score)
                subsets.append(p)
            best = np.argmax(scores)
            self.indices_ = subsets[best]
            self.subsets_.append(self.indices_)
            dim -= 1000
            self.scores_.append(scores[best])
        self.k_score_ = self.scores_[-1]
        return self

    def transform(self, X):
        return X[:, self.indices_]
    
    def _calc_score(self, X_train, y_train,X_test, y_test, indices):
        self.estimator.fit(X_train[:, indices], y_train)
        y_pred = self.estimator.predict(X_test[:, indices])
        score = self.scoring(y_test, y_pred)
        return score

In [405]:
clf = MultinomialNB(alpha=0.9)
sbs = SBS(clf, k_features=20000)
sbs.fit(X_train, y)

KeyboardInterrupt: 

#Tuning the Hyperparmeters 

## Classification : Construction du classifieur 

In [44]:
from time import time
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.learning_curve import learning_curve

def benchmark(clf):
    categories = d_train['Classe'].unique()
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, Y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)
    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(Y_test, pred)
    print("accuracy:   %0.3f" % score)
   
    print("classification report:")
    print(metrics.classification_report(Y_test, pred,
                                        target_names=categories))
￼￼
    plt.plot(train_sizes, test_mean,
                 color='green', linestyle='--',
                 marker='s', markersize=5,
                 label='validation accuracy')
    plt.fill_between(train_sizes,
                         test_mean + test_std,
                         test_mean - test_std,
                         alpha=0.15, color='green')
    plt.grid()
    plt.xlabel('Number of training samples')
    plt.ylabel('Accuracy')
    plt.legend(loc='lower right')
    plt.ylim([0.8, 1.0])
    plt.show()
    print("confusion matrix:")
    print(metrics.confusion_matrix(Y_test, pred))
    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time

SyntaxError: invalid syntax (<ipython-input-44-35080430b3c1>, line 26)

In [None]:
# Train sparse Naive Bayes classifiers
results = []
print('=' * 80)
print("Naive Bayes")
results.append(benchmark(MultinomialNB(alpha=0.9)))

In [None]:
print("learning curve:")
train_sizes, train_scores, test_scores =learning_curve(estimator=MultinomialNB(alpha=0.9),X=X_train,y=Y_train,
                       train_sizes=np.linspace(0.1, 1.0, 10),cv=5,n_jobs=1)
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
plt.plot(train_sizes, train_mean,
color='blue', marker='o',
markersize=5,
label='training accuracy')

In [3]:
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
for clf, name in (
        (MultinomialNB(alpha=0.9),"Multinomial Naive Bayes"),
        (SGDClassifier(alpha=.0001, n_iter=50,penalty="elasticnet"), "Elastic-Net"),
        (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
        (Perceptron(n_iter=50), "Perceptron"),
        (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"),
        (KNeighborsClassifier(n_neighbors=10), "kNN"),
        (RandomForestClassifier(n_estimators=100,criterion='gini'), "Random forest")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))

Multinomial Naive Bayes


NameError: name 'results' is not defined

In [275]:
# Tuning the hyperparameter : 

In [276]:
# Choosing the best classifier 