# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the Dataset

In [None]:
# Relative path to the CSV (the ../input/ layout looks like a Kaggle dataset mount — confirm).
path="../input/sms-spam-collection-dataset/spam.csv"
# Decode as latin-1; presumably the file is not valid UTF-8 — confirm.
data=pd.read_csv(path,encoding='latin-1')
data.head()

# Information about data

In [None]:
# Column names, dtypes and non-null counts of the raw frame.
data.info()

In [None]:
# Summary statistics of the raw frame.
data.describe()

# Data Preprocessing

In [None]:
# Keep only the label (v1) and message (v2) columns under descriptive names.
# rename() returns a new frame, so `data` itself is left untouched.
column_names={'v1':'Class','v2':'sms'}
df=data[["v1",'v2']].rename(columns=column_names)
df.head()

# Data Visualization

In [None]:
# Bar chart of the number of messages per class.
sns.countplot(x=df.Class)

### The dataset is imbalanced 

# Text preprocessing

In [None]:
import spacy
# The 'en' shortcut link was removed in spaCy v3; load the English pipeline by
# its package name, which also works on spaCy v2 installs.
nlp=spacy.load('en_core_web_sm')

## Tokenization

In [None]:
# Run the spaCy pipeline over every message. nlp.pipe processes the texts in
# batches and yields one Doc per input, in order, which is considerably faster
# than calling nlp() once per row.
df["tokens"]=list(nlp.pipe(df.sms))
df.head()

# Lemmatization after removing stopwords and punctuation

In [None]:
def stopword(txt):
    """Return the lower-cased lemmas of the content tokens in ``txt``.

    Parameters
    ----------
    txt : iterable of tokens
        Each token must expose ``is_stop``, ``is_punct`` and ``lemma_``
        (the notebook passes the spaCy documents stored in ``df.tokens``).

    Returns
    -------
    list of str
        Stripped, lower-cased lemmas of every token that is neither a stop
        word nor punctuation. Tokens whose lemma is empty after stripping
        (e.g. whitespace-only tokens) are skipped.
    """
    lemmas=[]
    for token in txt:
        if token.is_stop or token.is_punct:
            continue
        lemma=token.lemma_.strip().lower()
        # Whitespace-only tokens strip down to '' and would otherwise be kept
        # as empty "words" in the lemma list.
        if lemma:
            lemmas.append(lemma)
    return lemmas
# Lemma list for every tokenized message (stop words and punctuation removed).
df['Lmnt_text']=df['tokens'].map(stopword)
df.head()

# Corpus of Lemmatized text

In [None]:
def final_corpus(lmt):
    """Join the lemmas in ``lmt`` into a single space-separated string."""
    separator=' '
    corpus=separator.join(lmt)
    return corpus
# One cleaned, space-joined string per message; this column feeds the vectorizers.
df['final_corpus']=df['Lmnt_text'].map(final_corpus)
df.head()

# Word Cloud

In [None]:
from wordcloud import WordCloud

# Concatenate the cleaned text of every ham message into one string.
text=" ".join(df.loc[df.Class == 'ham','final_corpus'].astype(str))
wordcloud=WordCloud(
    width=1200,
    height=1200,
    background_color='white',
    contour_width=1,
    contour_color='green',
    min_font_size=20,
).generate(text)
plt.figure(figsize=[15,20])
plt.title("HAM WORD CLOUD")
plt.axis("off")
plt.imshow(wordcloud,interpolation='bilinear')

# Frequently occurring words in Ham messages

In [None]:
from collections import Counter
# `text` is the ham corpus string assembled for the word cloud; count its
# whitespace-separated words and show the ten most frequent.
f=Counter(text.split())
print(f.most_common(10))

In [None]:
from wordcloud import WordCloud

# Concatenate the cleaned text of every spam message into one string.
text=" ".join(df.loc[df.Class == 'spam','final_corpus'].astype(str))
wordcloud=WordCloud(
    width=1200,
    height=1200,
    background_color='black',
    contour_width=1,
    contour_color='green',
    min_font_size=20,
).generate(text)
plt.figure(figsize=[15,20])
plt.title("SPAM WORD CLOUD")
plt.axis("off")
plt.imshow(wordcloud,interpolation='bilinear')

# Frequently occurring words in Spam messages

In [None]:
from collections import Counter
# `text` now holds the spam corpus string from the cell above; count its
# whitespace-separated words and show the ten most frequent.
f=Counter(text.split())
print(f.most_common(10))

# Vectorization

# TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE(review): use_idf=False switches off the inverse-document-frequency
# weighting, so these are normalised term frequencies rather than full TF-IDF
# — confirm this is intended, since results are labelled "tfidf".
tfidf = TfidfVectorizer(use_idf=False)
# Dense feature matrix for the whole corpus (fitted before the train/test split).
X= tfidf.fit_transform(df.final_corpus).toarray()

# CountVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Raw token counts; `cv` is reused later by prediction() to encode new text.
cv = CountVectorizer()
# Dense count matrix for the whole corpus (fitted before the train/test split).
x = cv.fit_transform(df.final_corpus).toarray()

# Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split
# Suffix 1 = TfidfVectorizer features (X), suffix 2 = CountVectorizer features (x).
# Both splits hold out 30% and share the same seed.
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, df.Class, test_size=0.30, random_state=42)
X_train2, X_test2, y_train2, y_test2 = train_test_split(x, df.Class, test_size=0.30, random_state=42)

# Model Building

In [None]:
# Empty results table; each model cell below adds one row per vectorizer.
Model=pd.DataFrame({"Model":[],"Accuracy":[],"Vectorizer":[]})
Model

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit on each feature set and record the held-out accuracy.
# DataFrame.append was removed in pandas 2.0, so rows are added with pd.concat.
lr=LogisticRegression().fit(X_train1,y_train1)
score=lr.score(X_test1,y_test1)
Model=pd.concat([Model,pd.DataFrame([{"Model":"Logistic Regression","Accuracy":score,"Vectorizer":"tfidf"}])],ignore_index=True)
lr=LogisticRegression().fit(X_train2,y_train2)
score=lr.score(X_test2,y_test2)
Model=pd.concat([Model,pd.DataFrame([{"Model":"Logistic Regression","Accuracy":score,"Vectorizer":"CountVector"}])],ignore_index=True)
Model

# Decision Tree

In [None]:
from sklearn import tree
# Fit on each feature set and record the held-out accuracy.
# DataFrame.append was removed in pandas 2.0, so rows are added with pd.concat.
dt= tree.DecisionTreeClassifier().fit(X_train1, y_train1)
score=dt.score(X_test1,y_test1)
Model=pd.concat([Model,pd.DataFrame([{"Model":"Decision Tree","Accuracy":score,"Vectorizer":"tfidf"}])],ignore_index=True)
dt=tree.DecisionTreeClassifier().fit(X_train2,y_train2)
score=dt.score(X_test2,y_test2)
Model=pd.concat([Model,pd.DataFrame([{"Model":"Decision Tree","Accuracy":score,"Vectorizer":"CountVector"}])],ignore_index=True)
Model

# Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Fit on each feature set and record the held-out accuracy.
# DataFrame.append was removed in pandas 2.0, so rows are added with pd.concat.
rf= RandomForestClassifier(random_state=42).fit(X_train1, y_train1)
score=rf.score(X_test1,y_test1)
Model=pd.concat([Model,pd.DataFrame([{"Model":"Random Forest","Accuracy":score,"Vectorizer":"tfidf"}])],ignore_index=True)
rf=RandomForestClassifier(random_state=42).fit(X_train2,y_train2)
score=rf.score(X_test2,y_test2)
Model=pd.concat([Model,pd.DataFrame([{"Model":"Random Forest","Accuracy":score,"Vectorizer":"CountVector"}])],ignore_index=True)
Model

# K-nearest Neighbours

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Try odd neighbour counts 1..19 on the tfidf features and print each accuracy.
# NOTE(review): k is being compared on the test split, not a separate validation set.
for r in range(1,20,2):
    knn = KNeighborsClassifier(n_neighbors=r).fit(X_train1, y_train1)
    print(r,knn.score(X_test1,y_test1))

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Fit a 3-neighbour classifier on each feature set and record the held-out accuracy.
# DataFrame.append was removed in pandas 2.0, so rows are added with pd.concat.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train1, y_train1)
score=knn.score(X_test1,y_test1)
Model=pd.concat([Model,pd.DataFrame([{"Model":"KNN","Accuracy":score,"Vectorizer":"tfidf"}])],ignore_index=True)
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train2,y_train2)
score=knn.score(X_test2,y_test2)
Model=pd.concat([Model,pd.DataFrame([{"Model":"KNN","Accuracy":score,"Vectorizer":"CountVector"}])],ignore_index=True)
Model


# Naive Bayes

## For tfidf

In [None]:
from sklearn.naive_bayes import MultinomialNB,GaussianNB,ComplementNB,BernoulliNB
# Each name below holds a test-set accuracy on the tfidf features, not a fitted estimator.
mnb = MultinomialNB().fit(X_train1, y_train1).score(X_test1,y_test1)
gnb = GaussianNB().fit(X_train1,y_train1).score(X_test1,y_test1)
cnb = ComplementNB().fit(X_train1,y_train1).score(X_test1,y_test1)
bnb = BernoulliNB().fit(X_train1,y_train1).score(X_test1,y_test1)

In [None]:
# Side-by-side accuracies of the four Naive Bayes variants on the tfidf features.
naive_bayes=pd.DataFrame({"Classifier":["GaussianNB","MultinomialNB","ComplementNB","BernoulliNB"],
                          "Score":[gnb,mnb,cnb,bnb]})
# Only the MultinomialNB score goes into the results table.
# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat.
Model=pd.concat([Model,pd.DataFrame([{"Model":"Naive Bayes","Accuracy":mnb,"Vectorizer":"tfidf"}])],ignore_index=True)
naive_bayes

## For CountVector

In [None]:
from sklearn.naive_bayes import MultinomialNB,GaussianNB,ComplementNB,BernoulliNB
# Overwrites mnb/gnb/cnb/bnb with the accuracies on the CountVectorizer features.
mnb = MultinomialNB().fit(X_train2, y_train2).score(X_test2,y_test2)
gnb = GaussianNB().fit(X_train2,y_train2).score(X_test2,y_test2)
cnb = ComplementNB().fit(X_train2,y_train2).score(X_test2,y_test2)
bnb = BernoulliNB().fit(X_train2,y_train2).score(X_test2,y_test2)

In [None]:
# Side-by-side accuracies of the four Naive Bayes variants on the CountVectorizer features.
naive_bayes1=pd.DataFrame({"Classifier":["GaussianNB","MultinomialNB","ComplementNB","BernoulliNB"],
                          "Score":[gnb,mnb,cnb,bnb]})
naive_bayes1

## Multinomial Naive Bayes yields better results

In [None]:
# `mnb` is the MultinomialNB accuracy on the CountVectorizer features at this point.
# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat.
Model=pd.concat([Model,pd.DataFrame([{"Model":"Naive Bayes","Accuracy":mnb,"Vectorizer":"CountVector"}])],ignore_index=True)
Model

# Support Vector Machine

In [None]:
from sklearn import svm
# Print the test accuracy of each SVC kernel on the tfidf features.
for r in ['linear' , 'poly', 'rbf', 'sigmoid']:
    sv=svm.SVC(kernel=r).fit(X_train1,y_train1).score(X_test1,y_test1)
    print(r,sv)

In [None]:
from sklearn import svm
# Print the test accuracy of each SVC kernel on the CountVectorizer features.
for r in ['linear' , 'poly', 'rbf', 'sigmoid']:
    sv=svm.SVC(kernel=r).fit(X_train2,y_train2).score(X_test2,y_test2)
    print(r,sv)

In [None]:
from sklearn import svm
# Fit a linear-kernel SVC on each feature set and record the held-out accuracy.
# DataFrame.append was removed in pandas 2.0, so rows are added with pd.concat.
sv=svm.SVC(kernel='linear').fit(X_train1,y_train1)
score=sv.score(X_test1,y_test1)
Model=pd.concat([Model,pd.DataFrame([{"Model":"SVM","Accuracy":score,"Vectorizer":"tfidf"}])],ignore_index=True)
sv = svm.SVC(kernel="linear").fit(X_train2,y_train2)
score=sv.score(X_test2,y_test2)
Model=pd.concat([Model,pd.DataFrame([{"Model":"SVM","Accuracy":score,"Vectorizer":"CountVector"}])],ignore_index=True)
Model

# Stochastic Gradient Descent

In [None]:
from sklearn.linear_model import SGDClassifier
# Print the test accuracy of each loss on the tfidf features.
# NOTE(review): loss='log' was renamed 'log_loss' in scikit-learn 1.1 and the old
# spelling was later removed — switch to 'log_loss' when running on a recent release.
for r in [ 'hinge', 'log', 'modified_huber']:
    sgd=SGDClassifier(loss=r).fit(X_train1,y_train1).score(X_test1,y_test1)
    print(r,sgd)

In [None]:
from sklearn.linear_model import SGDClassifier
# Print the test accuracy of each loss on the CountVectorizer features.
# NOTE(review): loss='log' was renamed 'log_loss' in scikit-learn 1.1 and the old
# spelling was later removed — switch to 'log_loss' when running on a recent release.
for r in [ 'hinge', 'log', 'modified_huber']:
    sgd=SGDClassifier(loss=r).fit(X_train2,y_train2).score(X_test2,y_test2)
    print(r,sgd)

In [None]:
from sklearn.linear_model import SGDClassifier
# Fit a hinge-loss SGD classifier on each feature set and record the held-out accuracy.
# SGD shuffles the training data, so a fixed seed keeps the recorded scores
# reproducible across runs (same seed as the train/test split).
# DataFrame.append was removed in pandas 2.0, so rows are added with pd.concat.
sgd=SGDClassifier(loss='hinge',random_state=42).fit(X_train1,y_train1)
score=sgd.score(X_test1,y_test1)
Model=pd.concat([Model,pd.DataFrame([{"Model":"Stochastic Gradient Descent","Accuracy":score,"Vectorizer":"tfidf"}])],ignore_index=True)
sgd = SGDClassifier(loss="hinge",random_state=42).fit(X_train2,y_train2)
score=sgd.score(X_test2,y_test2)
Model=pd.concat([Model,pd.DataFrame([{"Model":"Stochastic Gradient Descent","Accuracy":score,"Vectorizer":"CountVector"}])],ignore_index=True)
Model


# XGBoost 

In [None]:
import xgboost as xgb
# Recent XGBoost releases require integer class labels and no longer support
# use_label_encoder, so the string labels are encoded explicitly (ham=0, spam=1).
# Accuracy is unaffected because the test labels use the same encoding.
label_codes={"ham":0,"spam":1}
# DataFrame.append was removed in pandas 2.0, so rows are added with pd.concat.
xg=xgb.XGBClassifier(objective="binary:logistic", random_state=42).fit(X_train1, y_train1.map(label_codes))
score=xg.score(X_test1,y_test1.map(label_codes))
Model=pd.concat([Model,pd.DataFrame([{"Model":"XGBoost","Accuracy":score,"Vectorizer":"tfidf"}])],ignore_index=True)
xg=xgb.XGBClassifier(objective="binary:logistic", random_state=42).fit(X_train2, y_train2.map(label_codes))
score=xg.score(X_test2,y_test2.map(label_codes))
Model=pd.concat([Model,pd.DataFrame([{"Model":"XGBoost","Accuracy":score,"Vectorizer":"CountVector"}])],ignore_index=True)
Model


# Artificial neural network 

In [None]:
from tensorflow import keras
from tensorflow.keras import layers

# Small feed-forward network: two 6-unit ReLU hidden layers and one sigmoid output unit.
model=keras.Sequential([
            layers.Dense(units=6, activation='relu'), 
            layers.Dense(units=6, activation='relu'),  
            layers.Dense(units=1, activation='sigmoid')])  

In [None]:
# Binary cross-entropy loss with the Adamax optimizer; binary accuracy is tracked per epoch.
model.compile(
optimizer='adamax',
    loss='binary_crossentropy',
    metrics=['binary_accuracy']
)

In [None]:
# Early stopping: 10 epochs of patience, improvements below 0.01 are ignored,
# and the best weights seen are restored at the end.
stop=keras.callbacks.EarlyStopping(
patience=10,
min_delta=0.01,
restore_best_weights=True)

# Train on the CountVectorizer features with labels mapped to ham=0 / spam=1.
# NOTE(review): the test split is used as validation data, so early stopping sees the test set.
history=model.fit(
    X_train2,y_train2.replace({"ham":0,"spam":1}),
    validation_data=(X_test2,y_test2.replace({"ham":0,"spam":1})),
    batch_size=32,
    epochs=100,
    callbacks=[stop])

In [None]:
# Layer-by-layer overview of the trained network.
model.summary()

In [None]:
# Per-epoch training history as a frame: plot loss and accuracy curves for
# training vs. validation, then report the best validation figures.
history_df = pd.DataFrame(history.history)
history_df.loc[:, ['loss', 'val_loss']].plot()
history_df.loc[:, ['binary_accuracy', 'val_binary_accuracy']].plot()
print(("Best Validation Loss: {:0.4f}" +\
      "\nBest Validation Accuracy: {:0.4f}")\
      .format(history_df['val_loss'].min(), 
              history_df['val_binary_accuracy'].max()))

In [None]:
# Record the best validation accuracy of the network trained on CountVectorizer features.
# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat.
Model=pd.concat([Model,pd.DataFrame([{"Model":"Artificial Neural Network","Accuracy":history_df['val_binary_accuracy'].max(),"Vectorizer":"CountVector"}])],ignore_index=True)
Model

In [None]:
from tensorflow import keras
from tensorflow.keras import layers

# Same architecture and training setup as the previous network, rebuilt from
# scratch and trained on the tfidf features instead.
model=keras.Sequential([
            layers.Dense(units=6, activation='relu'), 
            layers.Dense(units=6, activation='relu'),  
            layers.Dense(units=1, activation='sigmoid')])
model.compile(
optimizer='adamax',
    loss='binary_crossentropy',
    metrics=['binary_accuracy'])
# Early stopping: 10 epochs of patience, improvements below 0.01 are ignored,
# and the best weights seen are restored at the end.
stop=keras.callbacks.EarlyStopping(
patience=10,
min_delta=0.01,
restore_best_weights=True)

# NOTE(review): the test split is used as validation data, so early stopping sees the test set.
history=model.fit(
    X_train1,y_train1.replace({"ham":0,"spam":1}),
    validation_data=(X_test1,y_test1.replace({"ham":0,"spam":1})),
    batch_size=32,
    epochs=100,
    callbacks=[stop])

In [None]:
# Per-epoch history of the tfidf-trained network: plot loss and accuracy curves
# for training vs. validation, then report the best validation figures.
history_df = pd.DataFrame(history.history)
history_df.loc[:, ['loss', 'val_loss']].plot()
history_df.loc[:, ['binary_accuracy', 'val_binary_accuracy']].plot()
print(("Best Validation Loss: {:0.4f}" +\
      "\nBest Validation Accuracy: {:0.4f}")\
      .format(history_df['val_loss'].min(), 
              history_df['val_binary_accuracy'].max()))

In [None]:
# Record the best validation accuracy of the network trained on tfidf features.
# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat.
Model=pd.concat([Model,pd.DataFrame([{"Model":"Artificial Neural Network","Accuracy":history_df['val_binary_accuracy'].max(),"Vectorizer":"tfidf"}])],ignore_index=True)
Model

# Selection of State of The Art Model

In [None]:
# Rank every (model, vectorizer) pair from highest to lowest accuracy.
Model.sort_values(by=['Accuracy'],ascending=False)

In [None]:
# Same scores indexed by vectorizer, then model, for side-by-side reading.
Model.groupby(["Vectorizer",'Model']).min()

# The Stochastic Gradient Descent classifier with count vectorization of the corpus yields the highest accuracy

# Building State Of The Art Model

In [None]:
from sklearn.linear_model import SGDClassifier
# SGD shuffles the training data, so an unseeded fit gives a slightly different
# model on every run. Fixing the seed (same value as the train/test split) makes
# the saved model and its reported score reproducible.
final_model=SGDClassifier(loss='hinge',random_state=42)
final_model.fit(X_train2,y_train2)
final_model.score(X_test2,y_test2)

# Confusion Matrix

In [None]:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 of the final model on the CountVectorizer test split.
print(classification_report(y_test2, final_model.predict(X_test2)))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay,confusion_matrix
# plot_confusion_matrix was removed in scikit-learn 1.2. Computing the matrix and
# wrapping it in ConfusionMatrixDisplay gives the same figure on old and new releases.
class_labels=['ham','spam']
cm=confusion_matrix(y_test2,final_model.predict(X_test2),labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=class_labels).plot(cmap=plt.cm.Blues)
disp.ax_.set_title('SGDClassifier')
print(disp.confusion_matrix)
plt.show()

# Save the trained model for future use

In [None]:
import pickle
filename = 'finalized_model.sav'
# A context manager guarantees the file is flushed and closed once the dump finishes.
# NOTE(review): only the classifier is saved; the fitted CountVectorizer `cv` is
# still needed to turn new text into features and is not persisted here.
with open(filename, 'wb') as model_file:
    pickle.dump(final_model, model_file)

# Building the pipeline for the prediction

# Load the Model

In [None]:
# Close the file handle as soon as the model is read back.
# pickle.load executes code embedded in the file — only load files you created or trust.
with open(filename, 'rb') as model_file:
    load_model = pickle.load(model_file)

In [None]:
import spacy
# The 'en' shortcut link was removed in spaCy v3; load the pipeline by package name.
nlp=spacy.load('en_core_web_sm')
def prediction(text):
    """Classify one SMS message with the reloaded model.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    The class label predicted for the message (the first element of the
    model's ``predict`` output).

    Notes
    -----
    The text goes through the same helpers used to build the training corpus
    (``stopword`` then ``final_corpus``) so training and inference cannot
    drift apart, and is encoded with the fitted CountVectorizer ``cv``.
    """
    corpus=final_corpus(stopword(nlp(text)))
    f_vct = cv.transform([corpus]).toarray()
    # Use the model read back from disk so the saved artefact is what gets exercised.
    pred=load_model.predict(f_vct)[0]
    return pred
    

# Prediction

In [None]:
# Smoke test of the prediction pipeline on a single hand-written message.
text="Free tones Hope you enjoyed your new content"
output=prediction(text)
print(f"The sms is {output}")