# Classification de documents d'opinions

# Pré-traitements

On commence par importer les données :

In [372]:
import re
import nltk
import json
import pandas
import warnings
import unicodedata
import contractions
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [373]:
# nltk.download("stopwords")
# nltk.download("punkt")
# nltk.download("wordnet")

## Import Classifiers

In [374]:
import sklearn
from sklearn.svm import SVC
from unidecode import unidecode
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

### One time installed tools

In [375]:
# pip install contractions
# pip install treetaggerwrapper
# pip install unidecode
# pip install treetaggerwrapper

### Constants

In [376]:
exceptionStopWords = ['no', 'not', 'nor', 'down', 'up', 'on', 'too', 'out']

newStopWords = set(stopwords.words('english')).difference(exceptionStopWords)

In [394]:
movieComments = pandas.read_csv('data/dataset.csv', sep = '\t', header = None, encoding = "utf8")
movieComments['lables'] = pandas.read_csv('data/labels.csv', sep = '\t', header = None, encoding = "utf8")
movieComments.columns = ['comments','lables']

In [407]:
movieComments = movieComments[0:100]

In [409]:
movieComments

Unnamed: 0,comments,lables
0,try finish film three time god awful ....,-1
1,get watch dvd home . love westerns hus...,-1
2,oh worst reunion movie ever see . say ...,-1
3,sure one woah attractions epcot open si...,-1
4,worst movie ever see billy zane in . ...,-1
...,...,...
95,film japanese woman obsession calligraphy...,-1
96,say right away check spoilers box give ...,-1
97,movie let down decidedly hard . great ...,-1
98,bad act bad light bad plot quality por...,-1


Pré-traitements choisis :

1- Supression de caractères non ASCII

2- Suppression des contractions

3- Passage en minuscule

4- Supression de la ponctuation

5- Suppressions des stopwords

6- Remplacement des nombres

7- Lemmatisation

## Remove special caracters

In [410]:
def removeSpecialCaracters(movieComments):
    for index, comment in movieComments.iterrows():
        comment = comment['comments']
        result = re.sub('[^a-zA-Z\n\.]', ' ', comment)
        comment = re.sub(' +', ' ', result)

        # Removing non ASCII characters
        comment = unicodedata.normalize('NFKD', comment).encode("ascii", "ignore").decode("utf-8", 'ignore')
        
        movieComments.loc[index, 'comments'] = comment    
    return movieComments

In [411]:
movieComments = removeSpecialCaracters(movieComments)
movieComments

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Unnamed: 0,comments,lables
0,try finish film three time god awful . case p...,-1
1,get watch dvd home . love westerns husband re...,-1
2,oh worst reunion movie ever see . say lot . a...,-1
3,sure one woah attractions epcot open silly . ...,-1
4,worst movie ever see billy zane in . understa...,-1
...,...,...
95,film japanese woman obsession calligraphy on ...,-1
96,say right away check spoilers box give commen...,-1
97,movie let down decidedly hard . great concept...,-1
98,bad act bad light bad plot quality porno movi...,-1


In [412]:
def removeContractions(movieComments):
    for index, comment in movieComments.iterrows():
        comment = comment['comments']
        comment = contractions.fix(comment, slang = True)
        movieComments.loc[index, 'comments'] = comment 
    return movieComments


In [413]:
movieComments = removeContractions(movieComments)
movieComments

Unnamed: 0,comments,lables
0,try finish film three time god awful . case p...,-1
1,get watch dvd home . love westerns husband re...,-1
2,oh worst reunion movie ever see . say lot . a...,-1
3,sure one woah attractions epcot open silly . ...,-1
4,worst movie ever see billy zane in . understa...,-1
...,...,...
95,film japanese woman obsession calligraphy on ...,-1
96,say right away check spoilers box give commen...,-1
97,movie let down decidedly hard . great concept...,-1
98,bad act bad light bad plot quality porno movi...,-1


In [414]:
def removeStopWords(movieComments):
    for index, comment in movieComments.iterrows():
        comment = comment['comments']

        removedStopWords =  [word for word in comment.split() if word.lower() not in newStopWords]
        #print(removedStopWords)
        #set(tokenizedText).difference(stopwords.words('english'))
        #comment = [word for word in comment if word not in stopwords.words('english')]


        comment = "".join([" " + i for i in removedStopWords])
        movieComments.loc[index, 'comments'] = comment 

        #print(index, comment)
    return movieComments

In [415]:
movieComments = removeStopWords(movieComments)
movieComments

Unnamed: 0,comments,lables
0,try finish film three time god awful . case p...,-1
1,get watch dvd home . love westerns husband re...,-1
2,oh worst reunion movie ever see . say lot . a...,-1
3,sure one woah attractions epcot open silly . ...,-1
4,worst movie ever see billy zane . understand ...,-1
...,...,...
95,film japanese woman obsession calligraphy on ...,-1
96,say right away check spoilers box give commen...,-1
97,movie let down decidedly hard . great concept...,-1
98,bad act bad light bad plot quality porno movi...,-1


In [416]:
def normilizeComments(movieComments):
    for index, comment in movieComments.iterrows():
        comment = comment['comments']

        # Putting all words in lowercase
        comment = [word.lower() for word in comment.split()]
        comment = " ".join(comment)
        movieComments.loc[index, 'comments'] = comment 
    return movieComments

In [417]:
movieComments = normilizeComments(movieComments)
movieComments

Unnamed: 0,comments,lables
0,try finish film three time god awful . case po...,-1
1,get watch dvd home . love westerns husband ren...,-1
2,oh worst reunion movie ever see . say lot . as...,-1
3,sure one woah attractions epcot open silly . f...,-1
4,worst movie ever see billy zane . understand m...,-1
...,...,...
95,film japanese woman obsession calligraphy on s...,-1
96,say right away check spoilers box give comment...,-1
97,movie let down decidedly hard . great concept ...,-1
98,bad act bad light bad plot quality porno movie...,-1


In [418]:
def tokenizedText(movieComments):
    for index, comment in movieComments.iterrows():
        comment = comment['comments']

        # Tokenizing
        tokenizedText = word_tokenize(comment)
        
        # Lemmatization
        lemmatizer = WordNetLemmatizer()
        comment = [lemmatizer.lemmatize(word, pos = 'v') for word in tokenizedText]
    
        # Turning back tokens into a string
        comment = " ".join([" " + i for i in comment])
        movieComments.loc[index, 'comments'] = comment 

    return movieComments

In [419]:
movieComments = tokenizedText(movieComments)
movieComments

Unnamed: 0,comments,lables
0,try finish film three time god awful ....,-1
1,get watch dvd home . love westerns hus...,-1
2,oh worst reunion movie ever see . say ...,-1
3,sure one woah attractions epcot open si...,-1
4,worst movie ever see billy zane . unde...,-1
...,...,...
95,film japanese woman obsession calligraphy...,-1
96,say right away check spoilers box give ...,-1
97,movie let down decidedly hard . great ...,-1
98,bad act bad light bad plot quality por...,-1


In [420]:
# pip install arff

import arff
arff.dump('dataset_tokenized_lemmatized.arff'
      , movieComments.values
      , relation='movieComments'
      , names=movieComments.columns)


In [5]:
def clean_text(commentString):   
    # Removing non ASCII characters
    commentString = unicodedata.normalize('NFKD', commentString).encode("ascii", "ignore").decode("utf-8", 'ignore')

    # Removing contractions
    commentString = contractions.fix(commentString, slang = True)

    # Tokenizing
    tokenizedText = word_tokenize(commentString)

    # Putting all words in lowercase
    tokenizedText = [word.lower() for word in tokenizedText]

    # Deleting ponctuations
    tokenizedText = [word for word in tokenizedText if word.isalpha()]

    # Removing stop words
    tokenizedText = [word for word in tokenizedText if not word in OurStopWords]
    
    # Converting numbers
    #inflectEngine = inflect.engine()
    #newWords = []
    #for word in tokenizedText:
     #   if word.isdigit():
     #       newWords.append(inflectEngine.number_to_words(word))
    #    else:
    #        newWords.append(word)
    #tokenizedText = newWords

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    commentString = [lemmatizer.lemmatize(word, pos = 'v') for word in tokenizedText]

    # Turning back tokens into a string
    commentString = "".join([" " + i for i in tokenizedText]).strip()
    
    return commentString

print('hhhh')

hhhh


# Classifieurs

In [6]:
warnings.filterwarnings("ignore", category = FutureWarning)
print('hoho')

hoho


Définition des variables d'apprentissage et des variables à prédire

In [7]:
movieCommentsArray = movieComments.values
data = movieCommentsArray[:, 0] # X
print(data)
movieCommentsLabelsArray = movieCommentsLabels.values
dataLabels = movieCommentsLabelsArray[:, 0] # Y
print(dataLabels)

 'I just got through watching this DVD at home. We love Westerns, so my husband rented it. He started apologizing to me half way through. The saddles, costumes, accents--everything was off. The part that made me so mad is where the guy didn\'t shoot the "collector" with his bow and arrow as he was taking the fat guy\'s soul. His only excuse was "he only had 2 arrows left." We watched it all the way through, and, as someone else said...too many bad things to single out any one reason why it sucked. I mean, the fact that the boy happened to snatch the evil stone from the collector on the same month and day it was found, what\'s the point of that? And why were there a grave yard where everyone died on April 25 but the people whose souls were taken by the collector were still up walking around? If you want a movie to make fun of after a few beers, this may be your movie. However, if you want a real Western, you will hate this movie.'
 'Oh my, this was the worst reunion movie I have ever se

In [8]:
#cleanText = [clean_text(comment) for comment in data]
cleanText=[]
for comment in data:
  
  cleanText.append(clean_text(comment))

#cleanText = pandas.DataFrame(cleanText) 

print("cleanText")

cleanText


Vectorisation avec TF-IDF

In [9]:
# list of text documents
# create the transform
vectorizer = CountVectorizer()
# tokenize and build vocab
vectorizer.fit(cleanText)
# summarize
print(vectorizer.vocabulary_)
# encode document
vectors = vectorizer.transform(cleanText)
# summarize encoded vector
print(vectors.shape)




(10000, 48359)


In [16]:
print(type(vectors))
vocabulary = vectorizer.vocabulary_
for key,value in vocabulary.items():
    vocabulary[key] = int(value)
    
print(type(vocabulary))
with open('vectorizer.vocabulary_.json', 'w') as dataFile:
    json.dump(vocabulary, dataFile)


<class 'scipy.sparse.csr.csr_matrix'>
<class 'dict'>


In [17]:
ls

 [0m[01;34mcontent[0m/
 dataVectorized.json
 [01;34mMassiBelaid[0m/
 movieanalyser.ipynb
 movieanalyser_modified.ipynb
 Projet_NLP_Fouille_de_donnees.ipynb
'rapport fdd M2.pdf'
'Screenshot 2020-11-28 at 01.17.59.png'
'Screenshot 2020-11-28 at 01.18.07.png'
'Screenshot 2020-11-28 at 01.18.23.png'
'Screenshot 2020-11-28 at 01.18.31.png'
'Screenshot 2020-11-28 at 01.18.36.png'
'Screenshot 2020-11-28 at 01.19.08.png'
'Screenshot 2020-11-28 at 01.19.15.png'
'Screenshot 2020-11-28 at 01.19.22.png'
'Screenshot 2020-11-28 at 12.08.33.png'
 vectorizer.vocabulary_.json


In [17]:
import numpy as np

with open('dataVectorized.json', 'w') as dataFile:
  for row in cleanText:
      np.savetxt(dataFile, row)

ValueError: Expected 1D or 2D array, got 0D array instead

In [15]:


#vectorizer = TfidfVectorizer(preprocessor = cleanText, ngram_range = (1, 3), min_df = 0.01, max_df = 0.9, sublinear_tf = False, smooth_idf = True)
# vectorizer = TfidfVectorizer().fit_transform(data)


data = vectors.toarray()

Découpage des données en jeu d'apprentissage (70%) et jeu de test (30%)

In [16]:
trainingSize = 0.7
testingSize = 0.3

trainingData, testingData, trainingDataLabels, testingDataLabels = train_test_split(data, dataLabels, train_size = trainingSize, test_size = testingSize)
# X_train,    X_test,      Y_train,            Y_test

ValueError: Found input variables with inconsistent numbers of samples: [100, 10000]

# Sans grid search

Classifieurs SVC et Random forest, avec leurs paramètres par défaut

In [1]:
models = []
models.append(("SVC", SVC()))
models.append(("Random forest", RandomForestClassifier()))

for name, model in models:
    kFold = KFold(n_splits = 20, shuffle = True, random_state = 10)
    crossVal = cross_val_score(model, data, dataLabels, cv = kFold, scoring = "accuracy")
    print(name, ": ", crossVal.mean(), " (", crossVal.std(), ") \n")

NameError: ignored

# Avec grid search

Définition des classifieurs et leurs paramètres

In [21]:
classifiers = {
    'RandomForestClassifier': RandomForestClassifier(),
    'SVC': SVC()
}

parameters = {
    'RandomForestClassifier': [
        #TODO: ajouter les paramètres à tester
    ],
    
    'SVC': [
        {'C': [ 1, 2]},
        {'kernel': ['linear']},
        {'degree': [3]},
        {'class_weight': ['balanced']},
        {'probability' : ['True', 'False']},
        {'decision_function_shape': ['ovo', 'ovr']},
        {'random_state': [0, 1 , 5, 10]}
    ]
}

Recherche du meilleur classifieur entre SVC et Random Forest, et de ses meilleurs paramètres

In [22]:
class Model:
    def __init__(self, classifier, parameters, score):
        self.classifier = classifier
        self.parameters = parameters
        self.score = score

    def __repr__(self):
        return repr((self.classifier, self.parameters, self.score))

results = []
for key, value in classifiers.items():
    gridSearch = GridSearchCV(
        estimator = value,
        param_grid = parameters[key],
        scoring = "accuracy",
        cv = 5,
        n_jobs = -1,
        iid = True)

    gridSearch.fit(trainingData, trainingDataLabels)

    result = Model(key, gridSearch.best_score_, gridSearch.best_estimator_)
    results.append(result)

results = sorted(results, key = lambda result: result.score, reverse = True)

print("Results from best to worst: \n")
for result in results:
    print ("Classifier: ", result.parameters,
    " with score %0.2f " %result.score, '\n')

ValueError: ignored

Utilisation d'une pipeline pour sauvegarder le meilleur modèle

In [23]:
from sklearn.pipeline import Pipeline

classifier = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                 decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
                 kernel='linear', max_iter=-1, probability=False, random_state=None,
                 shrinking=True, tol=0.001, verbose=False)

pipeline = Pipeline([
    ("vectorizer", vectorizer),
    ("classifier", classifier)
])

pipeline.fit(trainingData, trainingDataLabels)

result = pipeline.predict(testingData)
print('\nAccuracy: ', accuracy_score(result, testingDataLabels),'\n')

matrix = confusion_matrix(testingDataLabels, result)
print ('\nMatrice de confusion: \n', matrix, "\n")

print ('\n', classification_report(testingDataLabels, result), "\n")

NameError: ignored

Sauvegarde dans un fichier pickle

In [None]:
import pickle

pickle.dump(pipeline, open('groupeE.pkl', 'wb'))

# Résultats du challenge

In [24]:
import pickle
import pandas
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

clf_loaded = pickle.load(open('groupeE.pkl', 'rb'))

movieComments = pandas.read_csv('Data/test_data.csv', sep = '\t', header = None, encoding = "utf8")
movieCommentsLabels = pandas.read_csv('Data/test_labels.csv', sep = '\t', header = None, encoding = "utf8")

movieCommentsArray = movieComments.values
data = movieCommentsArray[:, 0] # X

movieCommentsLabelsArray = movieCommentsLabels.values
dataLabels = movieCommentsLabelsArray[:, 0] # Y

result = clf_loaded.predict(data)

print("Accuracy:", accuracy_score(result, dataLabels),'\n')

matrix = confusion_matrix(movieCommentsLabelsArray[:, 0], result)
print('\nMatrice de confusion: \n', matrix, "\n")

print('\n', classification_report(dataLabels, result), "\n")


FileNotFoundError: ignored