# TP de Text Mining

In [209]:
import os.path as op
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from glob import glob
import re
from collections import Counter
import math
from sklearn.cross_validation import cross_val_score

In [210]:
# Load data
print("Loading dataset")
filenames_neg = sorted(glob(op.join('data', 'imdb1', 'neg', '*.txt'))) # Assumes that data are located in data/imdb1/neg
filenames_pos = sorted(glob(op.join('data', 'imdb1', 'pos', '*.txt'))) # Assumes that data are located in data/imdb1/neg
texts_neg = [open(f).read() for f in filenames_neg]
texts_pos = [open(f).read() for f in filenames_pos]
texts = texts_neg + texts_pos
y = np.ones(len(texts), dtype=np.int)
y[:len(texts_neg)] = 0.

print("%d documents" % len(texts))

Loading dataset
2000 documents


## 1) Compléter la fonction count_word

In [211]:
def count_words(texts):
    voc = {}
    uniq = set()
    # Determine total number of uniques words in the whole corpus of texts
    for text in texts :
        words = set(re.findall(r"\w+",text.lower()))
        uniq = uniq.union(words)
    counts = np.zeros((len(texts), len(uniq)))
    wordidx = 0
    for textidx,text  in enumerate(texts):
        #print(textidx," / ",len(texts), " documents\r" )
        text = re.findall(r"\w+",text.lower()) # Removes non-words characters and tokenize text
        for word in text:
                # If the word's already in the vocabulary dict, increment the count of the word in the counts matrix
                if word in voc: # for the current document
                    counts[textidx, voc[word]] += 1
                # Else add the new word to the vocabulary dict and set it's index in the counts matrix
                else:
                    voc[word] = wordidx 
                    # increment the count in counts for the current word in the current document
                    counts[textidx, wordidx] += 1 
                    wordidx += 1 # Increment the index for the next new word
    return voc, counts    
        

#### Test de la fonction

In [212]:
vocabulary, X = count_words(texts)

In [213]:
print(X.shape)

(2000, 39696)


La fonction marche correctement

## 2) Expliquer la classification obtenue dans le fichier poldata.readme

Voir le PDF

## 3) implémenter le classifieur Bayésien naïf

In [214]:
class NB(BaseEstimator, ClassifierMixin):
    def __init__(self): # No modification, everything is inherited from BaseEstimator et ClassifierMixin
        self.prior = {}
        self.vocabulary = {}
        self.cond_prob = {}

    def fit(self, X, y):
        vocabulary, counts = count_words(X)
        nDocs = len(counts)
        C = set(y)
        prior = {}
        tct = np.zeros([len(C), len(vocabulary)])
        array_condprob = np.copy(tct)
        cond_prob = {}
        for i,c in enumerate(C):
            # Get lines' index according to the membership of the current class
            ixc = np.where(y == c)[0]
            nDocsInc = len(ixc)
            prior[c] = nDocsInc / nDocs # class frequencies by document
            # Gets all words associated to the current class from counts matrix
            countsByClass = counts[ixc, :] 
            # Sum to get the number of word for each class
            tct[i, :] = np.sum(countsByClass, axis=0)
            # Computes the conditional probability for the word to belong to the current class 
            array_condprob[i, :] = (tct[i, :] + 1) / np.sum(tct[i, :] + 1)
            # Stores this probabilities into a dict with key : class and value : the probability for each word
            cond_prob[c] = array_condprob[i, :]
        self.vocabulary = vocabulary
        self.prior = prior
        self.cond_prob = cond_prob
    
    def extract_tokens(vocabulary, doc):
        tokens = re.findall(r"\w+", doc.lower())
        # New words missing from the training set are left over
        return [token for token in tokens if token in vocabulary.keys()] 
        
    def predict(self, X):
        results = np.zeros(len(X))
        for i,text in enumerate(X):
            # Gets all the word from the current testing doc
            w = NB.extract_tokens(self.vocabulary, text)
            score, maxprob, maxclass, = 0, - math.inf, 0
            for c in self.prior:
                # Initializes the score based on the a priori probability of belonging to a class
                score = np.log(self.prior[c])
                for word in w:
                    # Gets the index of the word in the train's counts matrix
                    index = self.vocabulary[word]
                    # Computes the score of the word by adding to it the log of its conditionnal probability of belonging
                    # to the class from the train's counts matrix
                    score += np.log(self.cond_prob[c][index])    
                if score > maxprob:
                    maxprob, maxclass  = score, c
            # stores for the current document, the predicted class according to the maximum conditionnal probability        
            results[i] = maxclass
        return results
    
    def score(self, X, y):
        return np.mean(self.predict(X) == y)
    

Voir le PDF pour l'interpretation de l'algorithme.
Test du classifieur

In [215]:
nb = NB()
nb.fit(texts[::2], y[::2])
print(nb.score(texts[1::2], y[1::2]))

0.81


Le classifieur fonctionne

## 4) Tester le classifieur avec une validation croisée à 5 passes

Nous utilisons la cross validation de SKLEARN pour tester le classifieur

In [216]:
scores = cross_val_score(nb,texts[1::2], y[1::2], cv = 10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.80 (+/- 0.07)


Les résultats sont plutôts corrects

## 5) Filtrer les stop words

In [174]:
def count_words(texts, stopwords = None):
    voc = {}
    uniq = set()
    # Determine total number of uniques words in the whole corpus of texts
    for text in texts :
        words = re.findall(r"\w+",text.lower())
        words = set([word for word in words if word not in stopwords])
        uniq = uniq.union(words)
    counts = np.zeros((len(texts), len(uniq)))
    wordidx = 0
    for textidx,text  in enumerate(texts):
        text = re.findall(r"\w+",text.lower()) # Removes non-words characters and tokenize text
        text = [word for word in text if word not in stopwords]
        for word in text:
                # If the word's already in the vocabulary dict, increment the count of the word in the counts matrix
                if word in voc: # for the current document
                    counts[textidx, voc[word]] += 1
                # Else add the new word to the vocabulary dict and set it's index in the counts matrix
                else:
                    voc[word] = wordidx 
                    # increment the count in counts for the current word in the current document
                    counts[textidx, wordidx] += 1 
                    wordidx += 1 # Increment the index for the next new word
    return voc, counts    

class NB(BaseEstimator, ClassifierMixin):
    def __init__(self): # No modification, everything is inherited from BaseEstimator et ClassifierMixin
        self.prior = {}
        self.vocabulary = {}
        self.cond_prob = {}

    def fit(self, X, y, stop=None):
        vocabulary, counts = count_words(X,stop)
        nDocs = len(counts)
        C = set(y)
        prior = {}
        tct = np.zeros([len(C), len(vocabulary)])
        array_condprob = np.copy(tct)
        cond_prob = {}
        for i,c in enumerate(C):
            # Get lines' index according to the membership of the current class
            ixc = np.where(y == c)[0]
            nDocsInc = len(ixc)
            prior[c] = nDocsInc / nDocs # class frequencies by document
            # Gets all words associated to the current class from counts matrix
            countsByClass = counts[ixc, :] 
            # Sum to get the number of word for each class
            tct[i, :] = np.sum(countsByClass, axis=0)
            # Computes the conditional probability for the word to belong to the current class 
            array_condprob[i, :] = (tct[i, :] + 1) / np.sum(tct[i, :] + 1)
            # Stores this probabilities into a dict with key : class and value : the probability for each word
            cond_prob[c] = array_condprob[i, :]
        self.vocabulary = vocabulary
        self.prior = prior
        self.cond_prob = cond_prob
    
    def extract_tokens(vocabulary, doc):
        tokens = re.findall(r"\w+", doc.lower())
        # New words missing from the training set are left over
        return [token for token in tokens if token in vocabulary.keys()] 
        
    def predict(self, X):
        results = np.zeros(len(X))
        for i,text in enumerate(X):
            # Gets all the word from the current testing doc
            w = NB.extract_tokens(self.vocabulary, text)
            score, maxprob, maxclass, = 0, - math.inf, 0
            for c in self.prior:
                # Initializes the score based on the a priori probability of belonging to a class
                score = np.log(self.prior[c])
                for word in w:
                    # Gets the index of the word in the train's counts matrix
                    index = self.vocabulary[word]
                    # Computes the score of the word by adding to it the log of its conditionnal probability of belonging
                    # to the class from the train's counts matrix
                    score += np.log(self.cond_prob[c][index])    
                if score > maxprob:
                    maxprob, maxclass  = score, c
            # stores for the current document, the predicted class according to the maximum conditionnal probability        
            results[i] = maxclass
        return results
    
    def score(self, X, y):
        return np.mean(self.predict(X) == y)

    
f = open("data/english.stop")
stop = f.read().splitlines()
f.close()

Retestons les performances

In [199]:
nb = NB()
nb.fit(texts[::2], y[::2], stop)
print(nb.score(texts[1::2], y[1::2]))

0.812


Avec une cross-validation

In [207]:
scores = []
for _ in range(5):
    randidx = np.random.randint(0, len(texts), int(len(texts)/5))
    subset_train = [texts[i] for i in randidx]
    labels_train = [y[i] for i in randidx]
    nb.fit(subset_train, labels_train, stop)
    randidx = np.random.randint(0, len(texts), int(len(texts)/5))
    subset_test = [texts[i] for i in randidx]
    labels_test = [y[i] for i in randidx]
    scores.append(nb.score(subset_test, labels_test))
print(np.mean(scores))
print(np.std(scores))

0.7765
0.0180693109996


Le score obtenu est quasi identique au précédent mais on note qu'il est beaucoup plus stable car la déviation standard est petite.

## Utilisation de Scikit learn

### 1) Utilisation de MultinomialNB

In [217]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

In [238]:
vectorizer = CountVectorizer(lowercase=True, stop_words=stop)
nb = MultinomialNB()
naiveBayes = Pipeline([('CountVectorizer', vectorizer),('MultinomialNB',nb)])
naiveBayes.set_params(CountVectorizer__analyzer = 'char_wb').fit(texts[::2], y[::2])
scores = cross_val_score(naiveBayes,texts[1::2], y[1::2], cv = 5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.61 (+/- 0.15)


In [273]:
naiveBayes.set_params(CountVectorizer__analyzer = 'word').fit(texts[::2], y[::2])
scores = cross_val_score(naiveBayes,texts[1::2], y[1::2], cv = 5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.77 (+/- 0.04)


In [274]:
naiveBayes.set_params(CountVectorizer__ngram_range = (1,2) ).fit(texts[::2], y[::2])
scores = cross_val_score(naiveBayes,texts[1::2], y[1::2], cv = 5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.78 (+/- 0.06)


## 2) Test d'autres algorithmes

In [275]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(stop_words=stop)
svm = LinearSVC()
predictor = Pipeline([('CountVectorizer', vectorizer),('LinearSVC',svm)]).fit(texts[::2], y[::2])
scores = cross_val_score(predictor,texts[1::2], y[1::2], cv = 5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.78 (+/- 0.04)


In [276]:
log = LogisticRegression()
predictor = Pipeline([('CountVectorizer', vectorizer),('LogisticRegression',log)]).fit(texts[::2], y[::2])
scores = cross_val_score(predictor,texts[1::2], y[1::2], cv = 5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.80 (+/- 0.05)


Les résultats sont quasiment les mêmes pour ces deux algorithmes

## 3) Utilisation de NLTK pour raciniser les mots

In [277]:
from nltk import SnowballStemmer
from nltk import word_tokenize
stemmer = SnowballStemmer('english')
# Declaration of a tokenizer to be used with sklearn
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    tokens = word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems

vectorizer = CountVectorizer(tokenizer=tokenize, stop_words=stop)
nb = MultinomialNB()
naiveBayes = Pipeline([('CountVectorizer', vectorizer),('MultinomialNB',nb)])
naiveBayes.set_params(CountVectorizer__analyzer = 'word').fit(texts[::2], y[::2])
scores = cross_val_score(naiveBayes,texts[1::2], y[1::2], cv = 5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))



Accuracy: 0.77 (+/- 0.04)


La précision ne s'améliore pas beaucoup, essayons avec un autre classifieur, le SVM

In [278]:
svm = LinearSVC()
predictor = Pipeline([('CountVectorizer', vectorizer),('LinearSVC',svm)]).fit(texts[::2], y[::2])
scores = cross_val_score(predictor,texts[1::2], y[1::2], cv = 5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.77 (+/- 0.06)


La performance est identique à la précédente

## 4) Utilisation de pos_tag

In [282]:
from nltk import pos_tag



def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed


def pos_tag_tokens(tokens) :
    tagged = pos_tag(tokens)
    filtered = [token[0] for token in tokens if togen[1] in ['NN','VB','JJ','RB','VBP']]
    return filtered


def tokenize(text):
    tokens = word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    tagged = pos_tag_tokens(stems)
    return tagged


svm = LinearSVC()
predictor = Pipeline([('CountVectorizer', vectorizer),('LinearSVC',svm)]).fit(texts[::2], y[::2])
scores = cross_val_score(predictor,texts[1::2], y[1::2], cv = 5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))




Accuracy: 0.77 (+/- 0.06)
