In [None]:
# Import pandas (as pd) for data handling and seaborn (as sns) for plotting.
import pandas as pd
import seaborn as sns

# Load the sms-spam-collection-dataset CSV into a DataFrame (read as latin-1).
sms = pd.read_csv(
    "/kaggle/input/sms-spam-collection-dataset/spam.csv",
    encoding="latin-1",
)
# Show the first rows of the table.
sms.head()

In [None]:
# Display the table's dimensions to check them against the assignment instructions.
sms.shape

In [None]:
# Drop the three unnamed columns ('Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4').
unnamed_columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
sms = sms.drop(columns=unnamed_columns)

In [None]:
# Rename columns 'v1' and 'v2' to 'Label' and 'Message', then show the result.
# The renamed frame is reassigned rather than using `inplace`: the original call
# passed the set {True} for that flag, which is not a boolean.
sms = sms.rename(columns={'v1': 'Label', 'v2': 'Message'})
sms.head()

In [None]:
# Store the labels as a pandas categorical (the original author notes this is optional).
sms["Label"] = sms["Label"].astype("category")
# Bar chart of the number of messages per label.
# The column is passed through the `x` keyword: recent seaborn releases no longer
# interpret the first positional argument as the x variable.
sns.countplot(x=sms["Label"])
sms.head()

In [None]:
# Display the DataFrame's dimensions.
sms.shape

In [None]:
# Convert every message to lowercase.
sms['Message']=sms['Message'].str.lower()

In [None]:
#Importation de la fonction word_tokenize de la bibliothèque nltk.tokenize
from nltk.tokenize import word_tokenize

In [None]:
# Import the standard `string` module and display its punctuation characters.
import string
string.punctuation

In [None]:
#Retrait des ponctuations des lignes du tableau
def remove_punct(text):
    """Return `text` with its punctuation-only tokens removed.

    The text is split with `word_tokenize` and the tokens that are kept are
    joined back together with single spaces.

    A token is dropped when every one of its characters belongs to
    `string.punctuation`. The previous check, `token in string.punctuation`,
    was a substring test against the punctuation string, so multi-character
    tokens such as "..." or "?!" were kept.

    Args:
        text: The message to clean (a string).

    Returns:
        The remaining tokens joined by single spaces.
    """
    punct_chars = set(string.punctuation)
    kept = [
        token
        for token in word_tokenize(text)
        # Keep the token as soon as it holds one non-punctuation character.
        if not all(char in punct_chars for char in token)
    ]
    return " ".join(kept)

In [None]:
# Apply remove_punct to every message, then display the column.
sms["Message"] = sms["Message"].apply(remove_punct)
sms["Message"]

In [None]:
from nltk.corpus import stopwords
# Build a set from NLTK's English stop-word list (nothing is downloaded in this cell).
stop=set(stopwords.words('english'))

In [None]:
#Définition de la fonction stopword
def remove_stopword(text):
    """Return `text` without the tokens found in the module-level `stop` collection.

    The text is split with `word_tokenize`; tokens that are not in `stop`
    are joined back together with single spaces.

    Args:
        text: The message to filter (a string).

    Returns:
        The remaining tokens joined by single spaces.
    """
    kept_tokens = [word for word in word_tokenize(text) if word not in stop]
    return " ".join(kept_tokens)

In [None]:
# Apply remove_stopword to every message, then display the column.
sms["Message"] = sms["Message"].apply(remove_stopword)
sms["Message"]

In [None]:
# Import WordNetLemmatizer from nltk.stem.
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()
# A single instance is created under a short name to keep later calls simple.

In [None]:
def lemm(text):
    """Return `text` with every token replaced by `lemmatizer.lemmatize(token)`.

    The text is split with `word_tokenize`; the lemmatised tokens are joined
    back together with single spaces.

    Args:
        text: The message to lemmatise (a string).

    Returns:
        The lemmatised tokens joined by single spaces.
    """
    lemmas = map(lemmatizer.lemmatize, word_tokenize(text))
    return " ".join(lemmas)

In [None]:
# Lemmatise every message, then preview the table.
sms["Message"] = sms["Message"].apply(lemm)
sms.head()

Méthode Bag of words :

In [None]:
# Extract the values of the preprocessed 'Message' column: this is the corpus to vectorise.
corpus=sms['Message'].values
corpus

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectoriser capped at 200 features.
bw_vect = CountVectorizer(max_features=200)
# Tokenise the corpus and build the vocabulary.
# NOTE(review): the vocabulary is fitted on the whole corpus, i.e. before the
# train/test split performed later in the notebook.
bw_fit=bw_vect.fit(corpus)
# Turn each message into its vector of word counts.
bw_corpus = bw_fit.transform(corpus)

In [None]:
# Column names are the vocabulary learned by the vectoriser.
# get_feature_names() has been removed from recent scikit-learn releases in favour
# of get_feature_names_out(); the old name is only used when the new one is missing.
if hasattr(bw_fit, "get_feature_names_out"):
    bw_columns = bw_fit.get_feature_names_out()
else:
    bw_columns = bw_fit.get_feature_names()
# Dense table with one row per message and one column per vocabulary word.
cv_data = pd.DataFrame(bw_corpus.toarray(), columns=bw_columns)
cv_data

In [None]:
from sklearn.model_selection import train_test_split
# Features: the bag-of-words table; target: the message label.
X, Y = cv_data, sms["Label"]
# Split train / test data: 30% of the rows are held out, with a fixed random_state.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=0,
)


In [None]:
from sklearn import tree
# Decision tree limited to a depth of 15 (change max_depth to experiment),
# fitted on the training split.
tree_model = tree.DecisionTreeClassifier(max_depth=15).fit(X_train, Y_train)

In [None]:
# Display the dimensions of the training features.
X_train.shape

In [None]:
import matplotlib.pyplot as plt
plt.figure(figsize=(15,10))
# class_names must follow the order of the classifier's classes_ (sorted labels).
# The previous hand-written ['spam', 'ham'] list was in the opposite order, so
# the nodes were captioned with the wrong class.
names = [str(label) for label in tree_model.classes_]
# feature_names is passed as a plain list: some scikit-learn releases reject a
# pandas Index for this parameter.
tree.plot_tree(tree_model,
               feature_names=list(X.columns),
               class_names=names,
               filled=True)

In [None]:
# Predict the label of every row of the test split.
Y_predict=tree_model.predict(X_test)

In [None]:
# Display the shape of the prediction array.
Y_predict.shape

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix 
# The predictions are passed first and the test labels second.
# NOTE(review): scikit-learn documents the argument order as (y_true, y_pred), so
# rows here presumably correspond to predictions and columns to test labels — confirm.
mat = confusion_matrix(Y_predict, Y_test)
print(mat)

In [None]:
# Tick labels are taken from the fitted tree's classes_ instead of the hand-written
# ['spam', 'ham'] list: confusion_matrix orders its rows and columns by sorted
# label, which is also the order of classes_, so the old list swapped the two captions.
names = [str(label) for label in tree_model.classes_]
# fmt="d" prints the counts as plain integers rather than in the default
# two-significant-digit format.
sns.heatmap(mat, annot=True, fmt="d", xticklabels=names, yticklabels=names)
# `mat` was built with the predictions as first argument, hence the axis titles.
plt.xlabel('Test')
plt.ylabel('Predicted')

In [None]:
# Accuracy: number of correct answers divided by the total number of answers.
a_CART = accuracy_score(Y_test,Y_predict)
print("L'accuracy score du modèle CART est de : ",a_CART)

In [None]:
from sklearn.model_selection import GridSearchCV
import numpy as np
# Candidate depths: every second value from 1 up to (but excluding) 21.
depths = np.arange(1, 21, 2)
param_grid = [{'max_depth': depths}]
# Cross-validated (cv=10) search over max_depth, scored on accuracy.
grid_tree = GridSearchCV(
    estimator=tree.DecisionTreeClassifier(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
)
grid_tree.fit(X_train, Y_train)
# Keep the estimator that the search selected as best.
best_model_tree = grid_tree.best_estimator_


In [None]:
# Display the best parameter combination found by the grid search.
grid_tree.best_params_

Méthode TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Initialise the vectoriser's parameters (capped at 200 features).
tf_vect = TfidfVectorizer(max_features=200)
# Learn the vectoriser's vocabulary from the corpus, using the parameters set above.
# NOTE(review): the vocabulary is fitted on the whole corpus, i.e. before the
# train/test split performed later in the notebook.
tfidf_fit=tf_vect.fit(corpus)
# Vectorise the corpus.
tfidf_corpus= tfidf_fit.transform(corpus)

In [None]:
# Column names are the vocabulary learned by the TF-IDF vectoriser.
# get_feature_names() has been removed from recent scikit-learn releases in favour
# of get_feature_names_out(); the old name is only used when the new one is missing.
if hasattr(tfidf_fit, "get_feature_names_out"):
    tfidf_columns = tfidf_fit.get_feature_names_out()
else:
    tfidf_columns = tfidf_fit.get_feature_names()
# Dense table with one row per message and one column per vocabulary word.
tfidf_data = pd.DataFrame(tfidf_corpus.toarray(), columns=tfidf_columns)
tfidf_data

In [None]:
from sklearn.model_selection import train_test_split
# Features: the TF-IDF table; target: the message label.
X, Y = tfidf_data, sms["Label"]
# Split train / test data: 30% of the rows are held out, with a fixed random_state.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=0,
)

In [None]:
from sklearn import tree
# Decision tree limited to a depth of 15 (change max_depth to experiment),
# fitted on the training split.
tree_model = tree.DecisionTreeClassifier(max_depth=15).fit(X_train, Y_train)

In [None]:
import matplotlib.pyplot as plt
plt.figure(figsize=(15,10))
# class_names must follow the order of the classifier's classes_ (sorted labels).
# The previous hand-written ['spam', 'ham'] list was in the opposite order, so
# the nodes were captioned with the wrong class.
names = [str(label) for label in tree_model.classes_]
# feature_names is passed as a plain list: some scikit-learn releases reject a
# pandas Index for this parameter.
tree.plot_tree(tree_model,
               feature_names=list(X.columns),
               class_names=names,
               filled=True)

In [None]:
# Predict the label of every row of the test split.
Y_predict=tree_model.predict(X_test)
from sklearn.metrics import accuracy_score, confusion_matrix 
# The predictions are passed first and the test labels second.
# NOTE(review): scikit-learn documents the argument order as (y_true, y_pred), so
# rows here presumably correspond to predictions and columns to test labels — confirm.
mat = confusion_matrix(Y_predict, Y_test)
print(mat)

In [None]:
# Tick labels are taken from the fitted tree's classes_ instead of the hand-written
# ['spam', 'ham'] list: confusion_matrix orders its rows and columns by sorted
# label, which is also the order of classes_, so the old list swapped the two captions.
names = [str(label) for label in tree_model.classes_]
# fmt="d" prints the counts as plain integers rather than in the default
# two-significant-digit format.
sns.heatmap(mat, annot=True, fmt="d", xticklabels=names, yticklabels=names)
# `mat` was built with the predictions as first argument, hence the axis titles.
plt.xlabel('Test')
plt.ylabel('Predicted')

In [None]:
# Accuracy: number of correct answers divided by the total number of answers.
a_CART = accuracy_score(Y_test,Y_predict)
print("L'accuracy score du modèle CART est de : ",a_CART)

In [None]:
from sklearn.model_selection import GridSearchCV
import numpy as np
# Candidate depths: every fifth value from 10 up to (but excluding) 40.
depths = np.arange(10, 40, 5)
param_grid = [{'max_depth': depths}]
# Cross-validated (cv=10) search over max_depth, scored on accuracy.
grid_tree = GridSearchCV(
    estimator=tree.DecisionTreeClassifier(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
)
grid_tree.fit(X_train, Y_train)
# Keep the estimator that the search selected as best, then show its parameters.
best_model_tree = grid_tree.best_estimator_
grid_tree.best_params_

In [None]:
# Decision tree limited to a depth of 20 (change max_depth to experiment),
# fitted on the training split.
tree_model = tree.DecisionTreeClassifier(max_depth=20).fit(X_train, Y_train)

In [None]:

plt.figure(figsize=(15,10))
# class_names must follow the order of the classifier's classes_ (sorted labels).
# The previous hand-written ['spam', 'ham'] list was in the opposite order, so
# the nodes were captioned with the wrong class.
names = [str(label) for label in tree_model.classes_]
# feature_names is passed as a plain list: some scikit-learn releases reject a
# pandas Index for this parameter.
tree.plot_tree(tree_model,
               feature_names=list(X.columns),
               class_names=names,
               filled=True)

In [None]:
def classer(text):
    """Classify a raw message with `best_model_tree`.

    The text is lowercased, passed through `remove_punct`, `remove_stopword`
    and `lemm` (in that order), vectorised with `tfidf_fit.transform` as a
    one-element list, and handed to `best_model_tree.predict`.

    Args:
        text: The raw message (a string).

    Returns:
        The value returned by `best_model_tree.predict` for that single message.
    """
    cleaned = text.lower()
    # Same preprocessing steps, in the same order, as applied to the training messages.
    for step in (remove_punct, remove_stopword, lemm):
        cleaned = step(cleaned)
    features = tfidf_fit.transform([cleaned])
    return best_model_tree.predict(features)

In [None]:
# Example: classify one hand-written message.
classer('i need your money')

In [None]:
# Interactive loop: classify each typed message until the user enters "exit".
while True:
    text = input("Input: ")
    if text != "exit":
        print("Response:", classer(text))
        continue
    # The sentinel was typed: say goodbye and leave the loop.
    print("Response: Exiting.....")
    break