# 1. Import Dataset

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw dataset, decoding it as ISO-8859-1 (Latin-1).
# NOTE(review): presumably the default UTF-8 decoding fails on this file - confirm.
df = pd.read_csv("./src/spam.csv",encoding="ISO-8859-1")

In [None]:
# Preview the first rows to see the raw column layout.
df.head()

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# Count of missing values per column.
df.isnull().sum()

# 2. Data cleaning

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Drop the three unnamed columns in place; only the first two columns are kept.
df.drop(columns=["Unnamed: 2","Unnamed: 3","Unnamed: 4"],inplace=True)

In [None]:
# Give the two remaining columns descriptive names (label first, text second).
df.columns = ["target","message"]

In [None]:
df.sample(4)

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace the label column with integer codes. The fitted encoder is kept so the
# codes can be mapped back; later cells treat 0 as ham and 1 as spam.
encoder = LabelEncoder()
df["target"] = encoder.fit_transform(df["target"])

In [None]:
df.sample(4)

In [None]:
# Number of fully duplicated rows.
df.duplicated().sum()

In [None]:
# Keep the first occurrence of each duplicated row. The index is not reset,
# so row labels are no longer contiguous after this cell.
df = df.drop_duplicates(keep = "first")

In [None]:
df.head()

# 3. Data Analysis

In [None]:
# Absolute number of messages per class.
df["target"].value_counts()

In [None]:
import matplotlib.pyplot as plt
# Pie chart of the class balance, with percentages shown to three decimals.
# NOTE(review): value_counts() orders classes by frequency, so the hard-coded
# labels are only right while class 0 (ham) is the more frequent one - confirm.
plt.pie(df["target"].value_counts(),labels=["ham","spam"],autopct="%.3f")
plt.show()

In [None]:
## !pip install nltk
import nltk

In [None]:
## extracting number of characters
df["characters"] = df["message"].apply(len)

In [None]:
## extracting number of words
# Counts the tokens returned by NLTK's word tokenizer for each message.
# NOTE(review): word_tokenize / sent_tokenize need NLTK's tokenizer data; no
# nltk.download(...) call is visible in this notebook - confirm it is installed.
df["words"] = df["message"].apply(lambda x : len(nltk.word_tokenize(x)))

In [None]:
## extracting number of sentences
df["sentences"] = df["message"].apply(lambda x : len(nltk.sent_tokenize(x)))

In [None]:
df.sample(5)

In [None]:
# ham messages (target == 0): summary statistics
df[df["target"] == 0].describe()

In [None]:
# spam messages (target == 1): summary statistics
df[df["target"] == 1].describe()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Character-count distribution: ham (blue) and spam (red) drawn on the same axes.
plt.figure(figsize=(15,10))
sns.histplot(df[df["target"] == 0]["characters"],color="blue")
sns.histplot(df[df["target"] == 1]["characters"],color="red")
plt.show()

In [None]:
# Word-count distribution: ham (blue) and spam (red) drawn on the same axes.
plt.figure(figsize=(15,10))
sns.histplot(df[df["target"] == 0]["words"],color="blue")
sns.histplot(df[df["target"] == 1]["words"],color="red")
plt.show()

In [None]:
# Sentence-count distribution: ham (blue) and spam (red) drawn on the same axes.
plt.figure(figsize=(10,10))
sns.histplot(df[df["target"] == 0]["sentences"],color="blue")
sns.histplot(df[df["target"] == 1]["sentences"],color="red")
plt.show()

In [None]:
# Pairwise relationships between the columns of df, coloured by class.
# pairplot is a figure-level seaborn function that builds its own figure, so no
# plt.figure(...) call precedes it: that would only emit an extra, empty figure.
sns.pairplot(df,hue="target")
plt.show()

# 4. Data preprocessing

In [None]:
df.sample(4)

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import string

stemmer = PorterStemmer()

# English stop words, loaded once. Building the list inside the token loop would
# rebuild it and scan it linearly for every single token of every message.
_ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))


# function to apply text preprocessing
def text_transform(text):
    """Normalise a raw message into a space-separated string of stemmed tokens.

    Steps: lower-case the text, split it with ``nltk.word_tokenize``, keep only
    tokens that are alphanumeric and are not English stop words, and stem each
    kept token with the module-level Porter ``stemmer``.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The kept, stemmed tokens joined by single spaces ("" if none remain).
    """
    tokens = nltk.word_tokenize(text.lower())
    # isalnum() already rejects pure punctuation tokens, so no separate
    # string.punctuation test is needed.
    return " ".join(
        stemmer.stem(token)
        for token in tokens
        if token.isalnum() and token not in _ENGLISH_STOPWORDS
    )


In [None]:
# Store the cleaned, stemmed version of every message in a new column.
df["transformed text"] = df["message"].apply(lambda x : text_transform(x))

In [None]:
df.sample(5)

In [None]:
# representation using wordcloud
from wordcloud import WordCloud
# One generator instance, reused for both classes below.
wc = WordCloud(height=500,width=500,min_font_size=5,background_color="white")

In [None]:
# wordcloud of ham
# All ham messages are concatenated into one space-separated string first.
plt.imshow(wc.generate(df[df["target"] == 0]["transformed text"].str.cat(sep=" "))) 
plt.show()

In [None]:
# wordcloud of spam
# All spam messages are concatenated into one space-separated string first.
plt.imshow(wc.generate(df[df["target"] == 1]["transformed text"].str.cat(sep=" "))) 
plt.show()

In [None]:
def count_message(lst):
    """Flatten an iterable of messages into one list of their words.

    Each message is split on whitespace and the words are returned in the
    order they appear, duplicates included. Despite the name, the result is
    the word list itself, not a count.
    """
    tokens = []
    for message in lst:
        tokens.extend(message.split())
    return tokens

In [None]:
# Every word (with repeats) of the cleaned spam and ham messages respectively.
spam_lst = count_message(df[df["target"] == 1]["transformed text"].tolist())
ham_lst = count_message(df[df["target"] == 0]["transformed text"].tolist())

In [None]:
print("Total words in ham : ",len(ham_lst))
print("Total words in spam : ",len(spam_lst))

In [None]:
from collections import Counter
# The ten most frequent words per class as (word, count) rows.
spam_df = pd.DataFrame(Counter(spam_lst).most_common(10))
ham_df = pd.DataFrame(Counter(ham_lst).most_common(10))

In [None]:
spam_df.head(10)

In [None]:
ham_df.head(10)

# 5. Text conversion

###  Bag of words

In [None]:
## https://pages.github.rpi.edu/kuruzj/website_introml_rpi/notebooks/08-intro-nlp/03-scikit-learn-text.html

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

In [None]:
# Fit on every cleaned message and convert the result to a dense array
# (one row per message).
bag_of_words = vectorizer.fit_transform(df["transformed text"]).toarray()

In [None]:
bag_of_words

### tf-idf encoding

In [None]:
## https://pages.github.rpi.edu/kuruzj/website_introml_rpi/notebooks/08-intro-nlp/03-scikit-learn-text.html

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Rebinds `vectorizer`: the previously bound vectorizer object is no longer
# reachable through this name after this cell.
vectorizer = TfidfVectorizer()

In [None]:
# Fit on every cleaned message and convert the result to a dense array
# (one row per message).
Tf_idf = vectorizer.fit_transform(df["transformed text"]).toarray()

In [None]:
Tf_idf

### word2vec encoding

In [None]:
## https://youtu.be/hQwFeIupNP0
## https://www.hackersrealm.net/post/word2vec-python#:~:text=Word2Vec%20is%20a%20popular%20technique,can%20be%20used%20in%20python.

In [None]:
from gensim.models import Word2Vec
import gensim

In [None]:
# Untrained Word2Vec model: 100-dimensional vectors, every word kept
# (min_count = 1), context window of 2, two worker threads.
model = Word2Vec(vector_size = 100 , min_count = 1 , window = 2 , workers= 2)

In [None]:
# Temporary column holding each cleaned message as a list of tokens.
df["temp"] = df["transformed text"].apply(lambda x : x.split())

In [None]:
# Build the vocabulary from the token lists, then train on the same corpus
# using the corpus size and epoch count stored on the model.
model.build_vocab(df["temp"])
model.train(df["temp"], total_examples=model.corpus_count, epochs=model.epochs)

In [None]:
# The token lists were only needed for training.
df = df.drop(["temp"],axis=1)

In [None]:
# Persist the trained model; it is loaded back from this path further down.
model.save("./src/word2vec.model")

# 6. Training models

## Classification training

In [None]:
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.neighbors import KNeighborsClassifier as KNN 
from sklearn.svm import SVC
from sklearn.naive_bayes import  GaussianNB , MultinomialNB , BernoulliNB
from sklearn.ensemble import ExtraTreesClassifier as ETC , RandomForestClassifier as RF

from sklearn.metrics import accuracy_score , precision_score , confusion_matrix
from sklearn.model_selection import train_test_split as tt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

In [None]:
# Earlier variant (scaling + PCA in front of each classifier). It is a bare
# string literal, so none of it is executed.
'''
knn = Pipeline([('Scalar 1',StandardScaler()) , ('PCA 1' , PCA(n_components=6)) , ('KNN Classifier' , KNN())])
dtc = Pipeline([('Scalar 2',StandardScaler()) , ('PCA 2' , PCA(n_components=6)) , ('DTC Classifier' , DTC())])
svc = Pipeline([('Scalar 3',StandardScaler()) , ('PCA 3' , PCA(n_components=6)) , ('SVC Classifier' , SVC())])
gnb = Pipeline([('Scalar 4',StandardScaler()) , ('PCA 4' , PCA(n_components=6)) , ('Gaussian Classifier' , GaussianNB())])
mnb = Pipeline([('Scalar 5',StandardScaler()) , ('PCA 5' , PCA(n_components=6)) , ('Multinomial Classifier' , MultinomialNB())])
bnb = Pipeline([('Scalar 6',StandardScaler()) , ('PCA 6' , PCA(n_components=6)) , ('Bernoulli Classifier' , BernoulliNB())])
'''

In [None]:
# One single-step pipeline per classifier, all with default hyper-parameters.
knn = Pipeline([('KNN Classifier' , KNN())])
dtc = Pipeline([('DTC Classifier' , DTC())])
svc = Pipeline([('SVC Classifier' , SVC())])
etc = Pipeline([('etc Classifier' , ETC())])
rf = Pipeline([('rf Classifier' ,RF())])
gnb = Pipeline([('Gaussian Classifier' , GaussianNB())])
mnb = Pipeline([('Multinomial Classifier' , MultinomialNB())])
bnb = Pipeline([('Bernoulli Classifier' , BernoulliNB())])

In [None]:
# The order of this list must match the keys of pipe_dict below: the evaluation
# cells look up names with pipe_dict[i] while enumerating this list.
pipelines = [knn,dtc,svc,gnb,mnb,bnb,etc,rf]

In [None]:
# Position in `pipelines` -> short display name.
pipe_dict = {
    0 : "KNN",
    1 : "DTC", 
    2 : "SVC", 
    3 : "GNB", 
    4 : "MNB", 
    5 : "BNB",
    6 : "ETC",
    7 : "RF"
    }

# Results table: one row per pipeline; score columns are added by later cells.
model_dict = pd.DataFrame.from_dict(pipe_dict,orient="index",columns=["models"])

In [None]:
df.head(5)

### Training with bag of words

In [None]:
x_train , x_test , y_train , y_test = tt(bag_of_words,df["target"],test_size = 0.3,random_state=3)

In [None]:
for pipe in pipelines:
    try:
        pipe.fit(x_train,y_train)
    except:
        pass

In [None]:
accuracy , precision = [] , []
for i , model in enumerate(pipelines):
    try:
        x_pred = model.predict(x_test)
        accuracy.append(accuracy_score(x_pred,y_test))
        precision.append(precision_score(x_pred,y_test))
        print(f"{pipe_dict[i]} confusion matrix : \n",confusion_matrix(x_pred,y_test))
        ## print(f"{pipe_dict[i]} Test Accuracy : {accuracy_score(x_pred,y_test)}")
    except:
        accuracy.append(0)
        precision.append(0)

In [None]:
model_dict["Bag of words Accuracy"] = pd.DataFrame(accuracy,columns=["Bag of words accuracy"])
model_dict["Bag of words Precision"] = pd.DataFrame(precision,columns=["Bag of words precision"])

In [None]:
model_dict

### Training with tf-idf encoded data

In [None]:
x_train , x_test , y_train , y_test = tt(Tf_idf,df["target"],test_size = 0.3,random_state=3)

In [None]:
for pipe in pipelines:
    try:
        pipe.fit(x_train,y_train)
    except:
        pass

In [None]:
accuracy , precision = [] , []
for i , model in enumerate(pipelines):
    try:
        x_pred = model.predict(x_test)
        accuracy.append(accuracy_score(x_pred,y_test))
        precision.append(precision_score(x_pred,y_test))
        print(f"{pipe_dict[i]} confusion matrix : \n",confusion_matrix(x_pred,y_test))
        ## print(f"{pipe_dict[i]} Test Accuracy : {accuracy_score(x_pred,y_test)}")
    except:
        accuracy.append(0)
        precision.append(0)

In [None]:
model_dict["Tf-idf Accuracy"] = pd.DataFrame(accuracy,columns=["Tf-idf accuracy"])
model_dict["Tf-idf Precision"] = pd.DataFrame(precision,columns=["Tf-idf precision"])

In [None]:
model_dict ## bag of words -> mnb

### Training on word2vec encoded text

In [None]:
# Load the Word2Vec model from the path it was saved to by the training cell.
word2vec = Word2Vec.load("./src/word2vec.model")

In [None]:
def encode_text(text):
    """Encode a message as the average of its known word vectors.

    The text is split on whitespace; every token present in the loaded
    ``word2vec`` model contributes its vector. Returns an array of length
    ``word2vec.vector_size``: the mean of the contributing vectors, or all
    zeros when no token of the message is in the model's vocabulary.
    """
    vectors = word2vec.wv
    total = np.zeros(word2vec.vector_size)
    known = 0

    for token in text.split():
        if token not in vectors:
            continue
        total += vectors[token]
        known += 1

    # Nothing to average: return the zero vector unchanged.
    if known == 0:
        return total

    total /= known
    return total

In [None]:
x = []
for it in df["transformed text"]:
    x.append(encode_text(it))

In [None]:
x_train , x_test , y_train , y_test = tt(x,df["target"],test_size = 0.3,random_state=3)

In [None]:
for pipe in pipelines:
    try:
        pipe.fit(x_train,y_train)
    except:
        pass

In [None]:
accuracy , precision = [] , []
for i , model in enumerate(pipelines):
    try:
        x_pred = model.predict(x_test)
        accuracy.append(accuracy_score(x_pred,y_test))
        precision.append(precision_score(x_pred,y_test))
        print(f"{pipe_dict[i]} confusion matrix : \n",confusion_matrix(x_pred,y_test))
        ## print(f"{pipe_dict[i]} Test Accuracy : {accuracy_score(x_pred,y_test)}")
    except:
        accuracy.append(0)
        precision.append(0)

In [None]:
model_dict["Word2Vec Accuracy"] = pd.DataFrame(accuracy,columns=["Word2Vec accuracy"])
model_dict["Word2Vec Precision"] = pd.DataFrame(precision,columns=["Word2Vec precision"])

In [None]:
model_dict

### Multinomial naive Bayes trained with bag of words gives the highest accuracy and precision

In [None]:
df.head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import  MultinomialNB

vectorizer = CountVectorizer()
classifier = MultinomialNB()

In [None]:
from sklearn.model_selection import train_test_split as tt
x = vectorizer.fit_transform(df["transformed text"]).toarray()

In [None]:
x_train, x_test, y_train, y_test = tt(x,df["target"],test_size=0.3,random_state=3)

In [None]:
classifier.fit(x_train,y_train)
x_pred = classifier.predict(x_test)

In [None]:
from sklearn.metrics import accuracy_score , precision_score
accuracy_score(x_pred,y_test),precision_score(x_pred,y_test)

In [None]:
import pickle

In [None]:
# Persist the fitted vectorizer and the trained classifier as separate pickles.
with open("./src/vectorizer.pkl","wb") as f:
    pickle.dump(vectorizer,f)

with open("./src/classifier.pkl","wb") as f:
    pickle.dump(classifier,f)

In [None]:
def textPreprocessor(text):
    """Normalise a raw message into a space-separated string of stemmed tokens.

    Steps: lower-case the text, split it with ``nltk.word_tokenize``, keep only
    tokens that are alphanumeric and are not English stop words, and stem each
    kept token with a Porter stemmer.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The kept, stemmed tokens joined by single spaces ("" if none remain).
    """
    # Imports are local so the function does not rely on module-level names.
    import nltk
    from nltk.corpus import stopwords
    from nltk.stem.porter import PorterStemmer

    stemmer = PorterStemmer()
    # Build the stop-word set once per call; rebuilding the list inside the
    # token loop would reload it and scan it linearly for every token.
    stop_words = set(stopwords.words("english"))

    tokens = nltk.word_tokenize(text.lower())
    # isalnum() already rejects pure punctuation tokens, so no separate
    # string.punctuation test is needed.
    return " ".join(
        stemmer.stem(token)
        for token in tokens
        if token.isalnum() and token not in stop_words
    )


In [None]:
# NOTE(review): pickle stores a plain function by reference (its module and
# name), not its code. Loading this file therefore only works where a function
# with the same qualified name is importable - confirm the consuming app
# defines it, or have the app import the function instead of unpickling it.
with open("./src/textPreprocessor.pkl","wb") as f:
    pickle.dump(textPreprocessor,f)

In [None]:
! streamlit run app.py