In [None]:
import string

import nltk
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
# Read the SMS spam CSV; the latin-1 encoding is requested explicitly.
sms = pd.read_csv(
    "/kaggle/input/sms-spam-collection-dataset/spam.csv",
    encoding="latin-1",
)
sms.head()

In [None]:
# Drop the three unnamed columns. Rebinding the result (instead of inplace=True)
# together with errors='ignore' keeps the cell re-runnable: running it a second
# time no longer raises KeyError for columns that are already gone.
sms = sms.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
sms.head()

In [None]:
# Give the two remaining columns descriptive names.
sms = sms.rename({"v1": "label", "v2": "message"}, axis="columns")

In [None]:
# Preview the first rows to check the renamed columns.
sms.head()

In [None]:
# Bar chart of the number of messages per label. The Series is passed as x=
# because seaborn >= 0.12 interprets the first positional argument as `data`.
sns.countplot(x=sms['label'])

In [None]:
# (rows, columns) of the frame.
sms.shape

In [None]:
# Convert every message to lowercase.
sms['message']=sms['message'].str.lower()

In [None]:
def remove_punct(text):
    """Remove punctuation tokens from a text.

    The text is split with NLTK's ``word_tokenize``; every token made up solely
    of characters from ``string.punctuation`` is dropped, and the remaining
    tokens are joined back with single spaces.

    Args:
        text: The message to clean.

    Returns:
        The space-joined tokens that are not pure punctuation.
    """
    punct_chars = set(string.punctuation)
    # Check every character of the token. `token in string.punctuation` is a
    # substring test, which lets multi-character tokens such as "..." or "!!"
    # slip through because they are not contiguous in that constant.
    kept = [
        token
        for token in word_tokenize(text)
        if not all(char in punct_chars for char in token)
    ]
    return " ".join(kept)

# Replace each message with its punctuation-free version.
sms['message']=sms.message.apply(remove_punct)

In [None]:
# NLTK's English stop-word list, stored as a set; read by remove_stopword.
stop=set(stopwords.words('english'))

def remove_stopword(text):
    """Drop the tokens of `text` that belong to the module-level `stop` set.

    Args:
        text: The message to filter.

    Returns:
        The tokens produced by ``word_tokenize`` that are not in `stop`,
        joined with single spaces.
    """
    kept_tokens = [token for token in word_tokenize(text) if token not in stop]
    return " ".join(kept_tokens)

# Replace each message with its stop-word-free version.
sms['message']=sms.message.apply(remove_stopword)

In [None]:
# Single WordNet lemmatizer instance, used by lemm().
lemmatizer=WordNetLemmatizer()

def lemm(text):
    """Lemmatize every token of `text` with the module-level `lemmatizer`.

    Args:
        text: The message to lemmatize.

    Returns:
        The lemmatized tokens, in their original order, joined with single spaces.
    """
    return " ".join(lemmatizer.lemmatize(token) for token in word_tokenize(text))

# Replace each message with its lemmatized version.
sms.message=sms.message.apply(lemm)

## Bag of words

## 

In [None]:
# CountVectorizer is already imported in the notebook's import cell.
corpus = sms['message'].values
bw_vect = CountVectorizer()
# Tokenize the corpus and build the vocabulary.
bw_fit = bw_vect.fit(corpus)
# Turn every message into a vector of token counts.
bw_corpus = bw_fit.transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement.
bw_sms = pd.DataFrame(bw_corpus.toarray(), columns=bw_fit.get_feature_names_out())
# Show only the first rows rather than the repr of the whole matrix.
bw_sms.head()

## TF IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Configure the vectorizer (vocabulary capped at 500 features).
tf_vect = TfidfVectorizer(max_features=500)
# Learn the vocabulary from the corpus.
tfidf_fit = tf_vect.fit(corpus)
# Vectorize the corpus.
tfidf_corpus = tfidf_fit.transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement.
tfidf_sms = pd.DataFrame(tfidf_corpus.toarray(), columns=tfidf_fit.get_feature_names_out())
# Show only the first rows rather than the repr of the whole matrix.
tfidf_sms.head()

## 1ère méthode : avec TFIDF

## Séparation train / test

In [None]:
from sklearn.model_selection import train_test_split

# Features: the TF-IDF matrix. Target: the label column.
Xtfidf = tfidf_sms
Y = sms.label
# Hold out 30% of the rows for testing, with a fixed seed.
X_traintfidf, X_testtfidf, Y_train, Y_test = train_test_split(
    Xtfidf, Y, test_size=0.3, random_state=0
)

## Arbre de décision

In [None]:
from sklearn import tree
# Depth-2 decision tree fitted on the TF-IDF training split. The unused default
# instantiation was removed, and random_state is fixed so that repeated runs
# build the same tree.
tree_model = tree.DecisionTreeClassifier(max_depth=2, random_state=0)
tree_model = tree_model.fit(X_traintfidf, Y_train)

In [None]:
import matplotlib.pyplot as plt
plt.figure(figsize=(15,10))
# Display names must follow the order of tree_model.classes_ (the sorted labels),
# so they are derived from it instead of being hard-coded in a fixed order.
names = ['spam' if label == 'spam' else 'non spam' for label in tree_model.classes_]
# feature_names is passed as a plain list; some scikit-learn releases reject a
# pandas Index here.
tree.plot_tree(tree_model, feature_names=list(Xtfidf.columns),
               class_names=names,
               filled=True)

In [None]:
# Predict labels for the TF-IDF test split with the fitted tree.
Y_predicttfidf=tree_model.predict(X_testtfidf)

## Évaluation de l'arbre

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix 
# The predictions are passed as the first argument, so the rows of `mat`
# correspond to predicted labels and the columns to the test labels.
mat = confusion_matrix(Y_predicttfidf, Y_test)
print(mat)

In [None]:
# fmt='d' prints the integer counts in full; the default annotation format
# ('.2g') abbreviates larger counts to scientific notation.
sns.heatmap(mat, annot=True, fmt='d', xticklabels=names, yticklabels=names)
plt.xlabel('Test')
plt.ylabel('Predicted')

## 2ème méthode : avec Bag of words

## Séparation train / test

In [None]:
from sklearn.model_selection import train_test_split

# Features: the bag-of-words matrix. Target: the label column.
Xbw = bw_sms
Y = sms.label
# Hold out 30% of the rows for testing, with a fixed seed.
X_trainbw, X_testbw, Y_train, Y_test = train_test_split(
    Xbw, Y, test_size=0.3, random_state=0
)

## Arbre de décision

In [None]:
from sklearn import tree
# Depth-2 decision tree fitted on the bag-of-words training split. The unused
# default instantiation was removed, and random_state is fixed so that repeated
# runs build the same tree.
tree_model = tree.DecisionTreeClassifier(max_depth=2, random_state=0)
tree_model = tree_model.fit(X_trainbw, Y_train)

In [None]:
import matplotlib.pyplot as plt
plt.figure(figsize=(15,10))
# Display names must follow the order of tree_model.classes_ (the sorted labels),
# so they are derived from it instead of being hard-coded in a fixed order.
names = ['spam' if label == 'spam' else 'non spam' for label in tree_model.classes_]
# feature_names is passed as a plain list; some scikit-learn releases reject a
# pandas Index here.
tree.plot_tree(tree_model, feature_names=list(Xbw.columns),
               class_names=names,
               filled=True)

In [None]:
# Predict labels for the bag-of-words test split with the fitted tree.
Y_predictbw=tree_model.predict(X_testbw)

## Évaluation de l'arbre

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix 
# The predictions are passed as the first argument, so the rows of `mat`
# correspond to predicted labels and the columns to the test labels.
mat = confusion_matrix(Y_predictbw, Y_test)
print(mat)

In [None]:
# fmt='d' prints the integer counts in full; the default annotation format
# ('.2g') abbreviates larger counts to scientific notation.
sns.heatmap(mat, annot=True, fmt='d', xticklabels=names, yticklabels=names)
plt.xlabel('Test')
plt.ylabel('Predicted')

## Suite avec la méthode TFIDF

## Gridsearch

In [None]:
from sklearn.model_selection import GridSearchCV
import numpy as np

# Candidate maximum depths: 10, 15, ..., 35.
depths = np.arange(10, 40, 5)
param_grid = [{'max_depth': depths}]
# 10-fold cross-validated search over max_depth, scored on accuracy.
grid_tree = GridSearchCV(
    estimator=tree.DecisionTreeClassifier(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
)
grid_tree.fit(X_traintfidf, Y_train)
# Predict the test split with the best estimator found by the search.
best_model_tree = grid_tree.best_estimator_
Y_grid = best_model_tree.predict(X_testtfidf)

In [None]:
# Predictions are passed first: rows are predicted labels, columns are test labels.
mat = confusion_matrix(Y_grid, Y_test)
# fmt='d' prints the integer counts in full; the default annotation format
# ('.2g') abbreviates larger counts to scientific notation.
sns.heatmap(mat, annot=True, fmt='d', xticklabels=names, yticklabels=names)
plt.xlabel('Test')
plt.ylabel('Predicted')

## Forêt d'arbres

In [None]:
from sklearn.ensemble import RandomForestClassifier
# random_state is fixed so that the forest, and therefore the reported score,
# is the same on every run.
Rf_model = RandomForestClassifier(random_state=0)
Rf_model = Rf_model.fit(X_traintfidf, Y_train)
# NOTE: this rebinds Y_predicttfidf, which previously held the decision tree's predictions.
Y_predicttfidf = Rf_model.predict(X_testtfidf)
a_CART = accuracy_score(Y_test, Y_predicttfidf)
print("L'accuracy score du modèle RF est de : ",a_CART)
# Predictions are passed first: rows are predicted labels, columns are test labels.
mat = confusion_matrix(Y_predicttfidf, Y_test)
# fmt='d' prints the integer counts in full; the default annotation format
# ('.2g') abbreviates larger counts to scientific notation.
sns.heatmap(mat, annot=True, fmt='d', xticklabels=names, yticklabels=names)
plt.xlabel('Test')
plt.ylabel('Predicted')

## Classer

In [None]:
def reponse(text):
    """Classify a message by returning the label of its most similar SMS.

    The text receives the same preprocessing as the corpus (lowercasing,
    punctuation removal, stop-word removal, lemmatization), is projected with
    the fitted TF-IDF vectorizer, and is compared with every corpus row by
    cosine similarity.

    Args:
        text: The raw message to classify.

    Returns:
        The `label` value of the corpus message with the highest cosine
        similarity to `text`.
    """
    text = text.lower()
    # NOTE(review): this substitution is unrelated to SMS spam and is kept only
    # to preserve the original preprocessing; confirm whether it can be removed.
    text = text.replace('covid-19', 'coronavirus')
    text = remove_punct(text)
    text = remove_stopword(text)
    text = lemm(text)
    tfidf_text = tfidf_fit.transform([text])

    similarities = cosine_similarity(tfidf_text, tfidf_corpus)
    pos = np.argmax(similarities[0])
    # The original looked up `data.answers`, which is not defined anywhere in the
    # notebook. Rows of tfidf_corpus line up positionally with the rows of `sms`,
    # so the label of the nearest message is read by position.
    return sms['label'].iloc[pos]

## Code à tester

In [None]:
# Keep prompting until the user types "exit"; every other input is classified.
for text in iter(lambda: str(input("Input: ")), "exit"):
    print("Response:", reponse(text))
print("Response: Exiting.....")