# TP de Text Mining

In [209]:
import os.path as op
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from glob import glob
import re
from collections import Counter
import math
from sklearn.cross_validation import cross_val_score

In [210]:
# Load data
print("Loading dataset")
filenames_neg = sorted(glob(op.join('data', 'imdb1', 'neg', '*.txt'))) # Assumes that data are located in data/imdb1/neg
filenames_pos = sorted(glob(op.join('data', 'imdb1', 'pos', '*.txt'))) # Assumes that data are located in data/imdb1/neg
texts_neg = [open(f).read() for f in filenames_neg]
texts_pos = [open(f).read() for f in filenames_pos]
texts = texts_neg + texts_pos
y = np.ones(len(texts), dtype=np.int)
y[:len(texts_neg)] = 0.

print("%d documents" % len(texts))

Loading dataset
2000 documents


## 1) Compléter la fonction count_word

In [211]:
def count_words(texts):
    voc = {}
    uniq = set()
    # Determine total number of uniques words in the whole corpus of texts
    for text in texts :
        words = set(re.findall(r"\w+",text.lower()))
        uniq = uniq.union(words)
    counts = np.zeros((len(texts), len(uniq)))
    wordidx = 0
    for textidx,text  in enumerate(texts):
        #print(textidx," / ",len(texts), " documents\r" )
        text = re.findall(r"\w+",text.lower()) # Removes non-words characters and tokenize text
        for word in text:
                # If the word's already in the vocabulary dict, increment the count of the word in the counts matrix
                if word in voc: # for the current document
                    counts[textidx, voc[word]] += 1
                # Else add the new word to the vocabulary dict and set it's index in the counts matrix
                else:
                    voc[word] = wordidx 
                    # increment the count in counts for the current word in the current document
                    counts[textidx, wordidx] += 1 
                    wordidx += 1 # Increment the index for the next new word
    return voc, counts    
        

#### Test de la fonction

In [212]:
vocabulary, X = count_words(texts)

In [213]:
print(X.shape)

(2000, 39696)


La fonction marche correctement

## 2) Expliquer la classification obtenue dans le fichier poldata.readme

Voir le PDF

## 3) implémenter le classifieur Bayésien naïf

In [214]:
class NB(BaseEstimator, ClassifierMixin):
    def __init__(self): # No modification, everything is inherited from BaseEstimator et ClassifierMixin
        self.prior = {}
        self.vocabulary = {}
        self.cond_prob = {}

    def fit(self, X, y):
        vocabulary, counts = count_words(X)
        nDocs = len(counts)
        C = set(y)
        prior = {}
        tct = np.zeros([len(C), len(vocabulary)])
        array_condprob = np.copy(tct)
        cond_prob = {}
        for i,c in enumerate(C):
            # Get lines' index according to the membership of the current class
            ixc = np.where(y == c)[0]
            nDocsInc = len(ixc)
            prior[c] = nDocsInc / nDocs # class frequencies by document
            # Gets all words associated to the current class from counts matrix
            countsByClass = counts[ixc, :] 
            # Sum to get the number of word for each class
            tct[i, :] = np.sum(countsByClass, axis=0)
            # Computes the conditional probability for the word to belong to the current class 
            array_condprob[i, :] = (tct[i, :] + 1) / np.sum(tct[i, :] + 1)
            # Stores this probabilities into a dict with key : class and value : the probability for each word
            cond_prob[c] = array_condprob[i, :]
        self.vocabulary = vocabulary
        self.prior = prior
        self.cond_prob = cond_prob
    
    def extract_tokens(vocabulary, doc):
        tokens = re.findall(r"\w+", doc.lower())
        # New words missing from the training set are left over
        return [token for token in tokens if token in vocabulary.keys()] 
        
    def predict(self, X):
        results = np.zeros(len(X))
        for i,text in enumerate(X):
            # Gets all the word from the current testing doc
            w = NB.extract_tokens(self.vocabulary, text)
            score, maxprob, maxclass, = 0, - math.inf, 0
            for c in self.prior:
                # Initializes the score based on the a priori probability of belonging to a class
                score = np.log(self.prior[c])
                for word in w:
                    # Gets the index of the word in the train's counts matrix
                    index = self.vocabulary[word]
                    # Computes the score of the word by adding to it the log of its conditionnal probability of belonging
                    # to the class from the train's counts matrix
                    score += np.log(self.cond_prob[c][index])    
                if score > maxprob:
                    maxprob, maxclass  = score, c
            # stores for the current document, the predicted class according to the maximum conditionnal probability        
            results[i] = maxclass
        return results
    
    def score(self, X, y):
        return np.mean(self.predict(X) == y)
    

Voir le PDF pour l'interpretation de l'algorithme.
Test du classifieur

In [215]:
nb = NB()
nb.fit(texts[::2], y[::2])
print(nb.score(texts[1::2], y[1::2]))

0.81


Le classifieur fonctionne

## 4) Tester le classifieur avec une validation croisée à 5 passes

Nous utilisons la cross validation de SKLEARN pour tester le classifieur

In [216]:
scores = cross_val_score(nb,texts[1::2], y[1::2], cv = 10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.80 (+/- 0.07)


Les résultats sont plutôts corrects

## 5) Filtrer les stop words

In [174]:
def count_words(texts, stopwords = None):
    voc = {}
    uniq = set()
    # Determine total number of uniques words in the whole corpus of texts
    for text in texts :
        words = re.findall(r"\w+",text.lower())
        words = set([word for word in words if word not in stopwords])
        uniq = uniq.union(words)
    counts = np.zeros((len(texts), len(uniq)))
    wordidx = 0
    for textidx,text  in enumerate(texts):
        text = re.findall(r"\w+",text.lower()) # Removes non-words characters and tokenize text
        text = [word for word in text if word not in stopwords]
        for word in text:
                # If the word's already in the vocabulary dict, increment the count of the word in the counts matrix
                if word in voc: # for the current document
                    counts[textidx, voc[word]] += 1
                # Else add the new word to the vocabulary dict and set it's index in the counts matrix
                else:
                    voc[word] = wordidx 
                    # increment the count in counts for the current word in the current document
                    counts[textidx, wordidx] += 1 
                    wordidx += 1 # Increment the index for the next new word
    return voc, counts    

class NB(BaseEstimator, ClassifierMixin):
    def __init__(self): # No modification, everything is inherited from BaseEstimator et ClassifierMixin
        self.prior = {}
        self.vocabulary = {}
        self.cond_prob = {}

    def fit(self, X, y, stop=None):
        vocabulary, counts = count_words(X,stop)
        nDocs = len(counts)
        C = set(y)
        prior = {}
        tct = np.zeros([len(C), len(vocabulary)])
        array_condprob = np.copy(tct)
        cond_prob = {}
        for i,c in enumerate(C):
            # Get lines' index according to the membership of the current class
            ixc = np.where(y == c)[0]
            nDocsInc = len(ixc)
            prior[c] = nDocsInc / nDocs # class frequencies by document
            # Gets all words associated to the current class from counts matrix
            countsByClass = counts[ixc, :] 
            # Sum to get the number of word for each class
            tct[i, :] = np.sum(countsByClass, axis=0)
            # Computes the conditional probability for the word to belong to the current class 
            array_condprob[i, :] = (tct[i, :] + 1) / np.sum(tct[i, :] + 1)
            # Stores this probabilities into a dict with key : class and value : the probability for each word
            cond_prob[c] = array_condprob[i, :]
        self.vocabulary = vocabulary
        self.prior = prior
        self.cond_prob = cond_prob
    
    def extract_tokens(vocabulary, doc):
        tokens = re.findall(r"\w+", doc.lower())
        # New words missing from the training set are left over
        return [token for token in tokens if token in vocabulary.keys()] 
        
    def predict(self, X):
        results = np.zeros(len(X))
        for i,text in enumerate(X):
            # Gets all the word from the current testing doc
            w = NB.extract_tokens(self.vocabulary, text)
            score, maxprob, maxclass, = 0, - math.inf, 0
            for c in self.prior:
                # Initializes the score based on the a priori probability of belonging to a class
                score = np.log(self.prior[c])
                for word in w:
                    # Gets the index of the word in the train's counts matrix
                    index = self.vocabulary[word]
                    # Computes the score of the word by adding to it the log of its conditionnal probability of belonging
                    # to the class from the train's counts matrix
                    score += np.log(self.cond_prob[c][index])    
                if score > maxprob:
                    maxprob, maxclass  = score, c
            # stores for the current document, the predicted class according to the maximum conditionnal probability        
            results[i] = maxclass
        return results
    
    def score(self, X, y):
        return np.mean(self.predict(X) == y)

    
f = open("data/english.stop")
stop = f.read().splitlines()
f.close()

Retestons les performances

In [199]:
nb = NB()
nb.fit(texts[::2], y[::2], stop)
print(nb.score(texts[1::2], y[1::2]))

0.812


Avec une cross-validation

In [207]:
scores = []
for _ in range(5):
    randidx = np.random.randint(0, len(texts), int(len(texts)/5))
    subset_train = [texts[i] for i in randidx]
    labels_train = [y[i] for i in randidx]
    nb.fit(subset_train, labels_train, stop)
    randidx = np.random.randint(0, len(texts), int(len(texts)/5))
    subset_test = [texts[i] for i in randidx]
    labels_test = [y[i] for i in randidx]
    scores.append(nb.score(subset_test, labels_test))
print(np.mean(scores))
print(np.std(scores))

0.7765
0.0180693109996


Le score obtenu est quasi identique au précédent mais on note qu'il est beaucoup plus stable car la déviation standard est petite.

## Utilisation de Scikit learn

### 1) 

In [217]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

In [238]:
vectorizer = CountVectorizer(lowercase=True, stop_words=stop)
nb = MultinomialNB()
naiveBayes = Pipeline([('CountVectorizer', vectorizer),('MultinomialNB',nb)])
naiveBayes.set_params(CountVectorizer__analyzer = 'char_wb').fit(texts[::2], y[::2])
scores = cross_val_score(naiveBayes,texts[1::2], y[1::2], cv = 10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.61 (+/- 0.15)


In [239]:
naiveBayes.set_params(CountVectorizer__analyzer = 'word').fit(texts[::2], y[::2])
scores = cross_val_score(naiveBayes,texts[1::2], y[1::2], cv = 5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.78 (+/- 0.04)


In [240]:
naiveBayes.set_params(CountVectorizer__ngram_range = (1,2) ).fit(texts[::2], y[::2])
scores = cross_val_score(naiveBayes,texts[1::2], y[1::2], cv = 10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.78 (+/- 0.08)


## Test d'autres algorithmes

In [241]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

svm = LinearSVC()
svm.fit(texts[::2], y[::2])


ValueError: could not convert string to float: 'plot : two teen couples go to a church party , drink and then drive . \nthey get into an accident . \none of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . \nwhat\'s the deal ? \nwatch the movie and " sorta " find out . . . \ncritique : a mind-fuck movie for the teen generation that touches on a very cool idea , but presents it in a very bad package . \nwhich is what makes this review an even harder one to write , since i generally applaud films which attempt to break the mold , mess with your head and such ( lost highway & memento ) , but there are good and bad ways of making all types of films , and these folks just didn\'t snag this one correctly . \nthey seem to have taken this pretty neat concept , but executed it terribly . \nso what are the problems with the movie ? \nwell , its main problem is that it\'s simply too jumbled . \nit starts off " normal " but then downshifts into this " fantasy " world in which you , as an audience member , have no idea what\'s going on . \nthere are dreams , there are characters coming back from the dead , there are others who look like the dead , there are strange apparitions , there are disappearances , there are a looooot of chase scenes , there are tons of weird things that happen , and most of it is simply not explained . \nnow i personally don\'t mind trying to unravel a film every now and then , but when all it does is give me the same clue over and over again , i get kind of fed up after a while , which is this film\'s biggest problem . \nit\'s obviously got this big secret to hide , but it seems to want to hide it completely until its final five minutes . \nand do they make things entertaining , thrilling or even engaging , in the meantime ? \nnot really . \nthe sad part is that the arrow and i both dig on flicks like this , so we actually figured most of it out by the half-way point , so all of the strangeness after that did start to make a little bit of sense , but it still didn\'t the make the film all that more entertaining . \ni guess the bottom line with movies like this is that you should always make sure that the audience is " into it " even before they are given the secret password to enter your world of understanding . \ni mean , showing melissa sagemiller running away from visions for about 20 minutes throughout the movie is just plain lazy ! ! \nokay , we get it . . . there \nare people chasing her and we don\'t know who they are . \ndo we really need to see it over and over again ? \nhow about giving us different scenes offering further insight into all of the strangeness going down in the movie ? \napparently , the studio took this film away from its director and chopped it up themselves , and it shows . \nthere might\'ve been a pretty decent teen mind-fuck movie in here somewhere , but i guess " the suits " decided that turning it into a music video with little edge , would make more sense . \nthe actors are pretty good for the most part , although wes bentley just seemed to be playing the exact same character that he did in american beauty , only in a new neighborhood . \nbut my biggest kudos go out to sagemiller , who holds her own throughout the entire film , and actually has you feeling her character\'s unraveling . \noverall , the film doesn\'t stick because it doesn\'t entertain , it\'s confusing , it rarely excites and it feels pretty redundant for most of its runtime , despite a pretty cool ending and explanation to all of the craziness that came before it . \noh , and by the way , this is not a horror or teen slasher flick . . . it\'s \njust packaged to look that way because someone is apparently assuming that the genre is still hot with the kids . \nit also wrapped production two years ago and has been sitting on the shelves ever since . \nwhatever . . . skip \nit ! \nwhere\'s joblo coming from ? \na nightmare of elm street 3 ( 7/10 ) - blair witch 2 ( 7/10 ) - the crow ( 9/10 ) - the crow : salvation ( 4/10 ) - lost highway ( 10/10 ) - memento ( 10/10 ) - the others ( 9/10 ) - stir of echoes ( 8/10 ) \n'