# Importer bibliothèques

In [None]:
#Panda
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.model_selection import GridSearchCV


#Tokenize
from nltk.tokenize import word_tokenize

#String
import string

#Stopword
from nltk.corpus import stopwords
stop=set(stopwords.words('english'))

#Lemmatize
import nltk
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

# Importer Data

In [None]:
data['label']=data['label'].astype("category")

In [None]:
data=pd.read_csv("../input/imdb-dataset-sentiment-analysis-in-csv-format/Train.csv")
data.head()

In [None]:
# Class counts of the full training set.
# NOTE(review): not being the last expression of the cell, this result is not
# displayed by the notebook -- wrap it in print() if the output is wanted.
data.label.value_counts()
# Keep a 2000-row random subsample. The seed is fixed (same value as the
# train/test splits further down) so that reruns pick the same rows.
data=data.sample(2000, random_state=0)


### Diagramme de la colonne Label

In [None]:
# Bar chart of the number of reviews per label. The Series is passed as `x=`
# because recent seaborn releases treat the first positional argument as
# `data`, not as the variable to count.
sns.countplot(x=data.label)

### Taille de la Dataframe

In [None]:
# (rows, columns) of the sampled DataFrame
data.shape

# Nettoyage
## (définition des fonctions)

### Remove Punctuation

In [None]:
#Punct
import string
string.punctuation

def remove_punct(text):
    """Return ``text`` with punctuation-only tokens removed.

    The text is tokenized with ``word_tokenize``; every token made up solely
    of characters from ``string.punctuation`` is dropped and the remaining
    tokens are joined with single spaces.

    Args:
        text: String to clean.

    Returns:
        The space-joined tokens that contain at least one non-punctuation
        character.
    """
    punct_chars = set(string.punctuation)
    # Test character by character: a substring test against
    # string.punctuation lets multi-character tokens such as "..." or "--"
    # through, because they are not contiguous substrings of that constant.
    kept = [
        tok for tok in word_tokenize(text)
        if not all(ch in punct_chars for ch in tok)
    ]
    return " ".join(kept)


### Remove Stopwords

In [None]:
#Stop Words

def remove_stopword(text):
    """Return ``text`` without the tokens contained in the global ``stop`` set.

    Args:
        text: String to filter; it is split with ``word_tokenize``.

    Returns:
        The surviving tokens joined with single spaces.
    """
    kept_tokens = [token for token in word_tokenize(text) if token not in stop]
    return " ".join(kept_tokens)

### Lemmatization

In [None]:
#Lemmatization

def lemm(text):
    """Lemmatize every token of ``text`` with the global ``lemmatizer``.

    Args:
        text: String to process; it is split with ``word_tokenize``.

    Returns:
        The lemmatized tokens joined with single spaces.
    """
    lemmas = [lemmatizer.lemmatize(token) for token in word_tokenize(text)]
    return " ".join(lemmas)

# Nettoyage
## (Application)

In [None]:
# Drop rows with missing values
data.dropna(inplace=True)

# Drop duplicated rows, then renumber the rows 0..n-1. Sampling and dropping
# leave gaps in the index, while the vectorised corpus built from this frame
# is addressed by position; a fresh index keeps labels and positions aligned.
data = data.drop_duplicates().reset_index(drop=True)


In [None]:
# Convert every review to lower case

data['text'] = data['text'].str.lower()


In [None]:
# Clean every review in three passes, in this order:
# punctuation removal, stop-word removal, lemmatization.
data['text'] = (
    data.text
    .apply(remove_punct)
    .apply(remove_stopword)
    .apply(lemm)
)


# Bag of words

In [None]:
# Cleaned reviews as an array, the input of both vectorizers
corpus = data['text'].values

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer limited to 500 features
bw_vect = CountVectorizer(max_features=500)

# Tokenize the corpus and learn the vocabulary
bw_fit = bw_vect.fit(corpus)

# Turn each review into its term-count vector
bw_corpus = bw_fit.transform(corpus)

In [None]:
# (documents, features) of the bag-of-words matrix
bw_corpus.shape

In [None]:
# Vocabulary learned by the bag-of-words vectorizer.
# get_feature_names() was removed from scikit-learn; get_feature_names_out()
# is its replacement (requires scikit-learn >= 1.0).
bw_fit.get_feature_names_out()

In [None]:
# Dense view of the bag-of-words matrix
bw_corpus.toarray()

In [None]:
# Bag-of-words matrix as a DataFrame, one column per vocabulary term.
# get_feature_names_out() replaces the removed get_feature_names()
# (requires scikit-learn >= 1.0).
bw_data=pd.DataFrame(bw_corpus.toarray(),columns=bw_fit.get_feature_names_out())
bw_data

# TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF vectorizer limited to 500 features
tf_vect = TfidfVectorizer(max_features=500)

# Learn the vocabulary of the corpus with the vectorizer configured above
tfidf_fit = tf_vect.fit(corpus)

# Turn each review into its TF-IDF vector
tfidf_corpus = tfidf_fit.transform(corpus)

In [None]:
# Vocabulary learned by the TF-IDF vectorizer.
# get_feature_names() was removed from scikit-learn; get_feature_names_out()
# is its replacement (requires scikit-learn >= 1.0).
tfidf_fit.get_feature_names_out()

In [None]:
# TF-IDF matrix as a DataFrame, one column per vocabulary term.
# get_feature_names_out() replaces the removed get_feature_names()
# (requires scikit-learn >= 1.0).
tfidf_data=pd.DataFrame(tfidf_corpus.toarray(),columns = tfidf_fit.get_feature_names_out())
tfidf_data

# Arbre BW

In [None]:
from sklearn.model_selection import train_test_split

# Target: the label column; features: the bag-of-words table
Y = data['label']
Xbw = bw_data

# Hold out 30% of the rows for testing, with a fixed seed
X_trainbw, X_testbw, Y_train, Y_test = train_test_split(
    Xbw, Y, test_size=0.3, random_state=0
)

In [None]:
# Number of training rows per label
Y_train.value_counts()

In [None]:
from sklearn import tree

# Decision tree with default settings, trained on the bag-of-words features
tree_model = tree.DecisionTreeClassifier().fit(X_trainbw, Y_train)

In [None]:
# Draw the tree fitted on the bag-of-words features
import matplotlib.pyplot as plt
plt.figure(figsize=(15,10))
# Display names for the two classes, also reused as heatmap tick labels.
# NOTE(review): plot_tree matches class_names to the classes in ascending
# order; if label 0 means "negative" in this dataset the list is reversed --
# confirm against the data.
names = ['Positif', 'Négatif']
tree.plot_tree(tree_model,feature_names = Xbw.columns, class_names=names, filled = True)

In [None]:
# Predicted labels for the held-out bag-of-words rows
Y_predictbw=tree_model.predict(X_testbw)

In [None]:
# Compute and print the confusion matrix.
# Predictions are passed first and the test labels second (scikit-learn
# documents the order as y_true, y_pred), so rows are indexed by the predicted
# label and columns by the test label; the heatmap axis labels in the next
# cell follow that orientation.
from sklearn.metrics import (accuracy_score, confusion_matrix)
mat = confusion_matrix(Y_predictbw, Y_test)
print(mat)

In [None]:
# Annotated heatmap of the confusion matrix: columns = test labels,
# rows = predictions
#plt.figure(figsize=(15,5))
sns.heatmap(mat, annot=True,  xticklabels=names, yticklabels=names)
plt.xlabel('Test')
plt.ylabel('Prédiction')

In [None]:
# Accuracy of the bag-of-words tree on the test split
a_CART = accuracy_score(Y_test,Y_predictbw)
print("La précision du modèle CART est de : ",a_CART)

# Arbre TFIDF

In [None]:
# Same 70/30 split and seed as for the bag-of-words features, this time on the
# TF-IDF table; Y_train and Y_test are reassigned here.
Xtfidf = tfidf_data
X_traintfidf, X_testtfidf, Y_train, Y_test = train_test_split(Xtfidf, Y, test_size=0.3, random_state=0)

In [None]:
# Number of training rows per label
Y_train.value_counts()

In [None]:
# Decision tree with default settings, trained on the TF-IDF features
# (rebinds the tree_model name used for the bag-of-words tree)
tree_model = tree.DecisionTreeClassifier().fit(X_traintfidf, Y_train)

In [None]:
# Draw the tree fitted on the TF-IDF features
import matplotlib.pyplot as plt
plt.figure(figsize=(15,10))
# Display names for the two classes, also reused as heatmap tick labels.
# NOTE(review): plot_tree matches class_names to the classes in ascending
# order; if label 0 means "negative" in this dataset the list is reversed --
# confirm against the data.
names = ['Positif', 'Négatif']
tree.plot_tree(tree_model,feature_names = Xtfidf.columns, class_names=names, filled = True)

In [None]:
# Predicted labels for the held-out TF-IDF rows
Y_predicttf=tree_model.predict(X_testtfidf)

In [None]:
# Compute and print the confusion matrix.
# Predictions are passed first and the test labels second (scikit-learn
# documents the order as y_true, y_pred), so rows are indexed by the predicted
# label and columns by the test label; the heatmap axis labels in the next
# cell follow that orientation.
from sklearn.metrics import (accuracy_score, confusion_matrix)
mat = confusion_matrix(Y_predicttf, Y_test)
print(mat)

In [None]:
# Annotated heatmap of the confusion matrix: columns = test labels,
# rows = predictions
#plt.figure(figsize=(15,5))
sns.heatmap(mat, annot=True,  xticklabels=names, yticklabels=names)
plt.xlabel('Test')
plt.ylabel('Prédiction')

In [None]:
# Accuracy of the TF-IDF tree on the test split (overwrites a_CART)
a_CART = accuracy_score(Y_test,Y_predicttf)
print("La précision du modèle CART est de : ",a_CART)

# Grid TF-IDF

In [None]:
# Candidate tree depths: 10, 15, ..., 35 (the stop value 40 is excluded)
depths = np.arange(10, 40,5)
param_grid = [{'max_depth':depths}]

In [None]:
# 10-fold cross-validated grid search over max_depth, scored on accuracy,
# run on the TF-IDF training split
grid_tree = GridSearchCV(estimator=tree.DecisionTreeClassifier(),param_grid=param_grid,scoring='accuracy',cv=10)
grid_tree.fit(X_traintfidf, Y_train)
# Estimator selected by the search
best_model_tree = grid_tree.best_estimator_

In [None]:
# Predictions of the selected tree on the TF-IDF test split
Y_grid = best_model_tree.predict(X_testtfidf)

# Accuracy of those predictions
accuracy_score(Y_test, Y_grid)

In [None]:
# Print the depth selected by the grid search
print("La meilleure profondeur est : ")
print(grid_tree.best_params_)

# Forêt d'arbre

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with default settings, trained on the TF-IDF features
Rf_model = RandomForestClassifier().fit(X_traintfidf, Y_train)

In [None]:
# Random-forest predictions for the held-out TF-IDF rows
Y_predict=Rf_model.predict(X_testtfidf)

In [None]:
# Confusion matrix of the random forest. Predictions are passed first, so rows
# are indexed by the predicted label and columns by the test label, which is
# what the axis labels below state.
mat = confusion_matrix(Y_predict, Y_test)
sns.heatmap(mat, annot=True,  xticklabels=names, yticklabels=names)
plt.xlabel('Test')
plt.ylabel('Prédiction')

In [None]:
# Accuracy of the random forest on the test split.
# NOTE(review): the result is stored in a_CART, the name used earlier for the
# decision-tree scores, which it overwrites.
a_CART = accuracy_score(Y_test,Y_predict)
print("La précision du score du modèle RF est de : ",a_CART)

# Fonction Classer

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Sample comment used to try out classer()
commentaire = "good episode"

In [None]:
def classer(commentaire):
    """Return the corpus review most similar to ``commentaire``.

    The comment is vectorized with the fitted ``tfidf_fit`` and compared by
    cosine similarity with every row of ``tfidf_corpus``; the text of the
    best-scoring row of ``data`` is returned.

    Args:
        commentaire: Raw comment string.

    Returns:
        The ``text`` value of the most similar review in ``data``.
    """
    # Vectorize the comment with the vocabulary learned on the corpus
    tfidf_commentaire = tfidf_fit.transform([commentaire])

    # One row of similarities: the comment against every corpus document
    cm = cosine_similarity(tfidf_commentaire, tfidf_corpus)
    pos = np.argmax(cm[0])
    # `pos` is a row position in the corpus, not an index label, so it must be
    # looked up positionally: the frame's index has gaps after sampling and
    # dropping rows.
    return data.text.iloc[pos]

In [None]:
# Try the lookup on the sample comment
classer(commentaire)

In [None]:
# Interactive loop: read a comment, answer with the closest review found by
# classer(), and stop when the user types "exit".
while True:
    text = input("Input: ")
    if text == "exit":
        print("Response: Exiting.....")
        break
    # Look up what the user just typed (the loop used to pass the fixed
    # sample comment and ignore the input).
    print("Response:", classer(text))