In [None]:
# Basic Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import os
import missingno
from collections import Counter
import wordcloud
import emoji
import warnings
warnings.filterwarnings("ignore")
import string
import nltk
import re

In [None]:
# print(os.getcwd())

In [None]:
# os.chdir("G://Excel")

In [None]:
df = pd.read_csv("../input/sms-spam-collection-dataset/spam.csv", encoding = 'latin-1')

In [None]:
df.head()

In [None]:
df.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis = 1, inplace = True)

In [None]:
df.head()

In [None]:
df.columns = ["class", "message"]

In [None]:
df.head()

In [None]:
df["len"] = [len(text) for text in df.message]

In [None]:
df.groupby("class").describe()

It seems that messages in the spam class are a bit longer than those in the ham class. There are also fewer spam messages than ham messages, which clearly indicates a class imbalance in our data. We need to address this imbalance, and we will cover it in our later pre-processing steps.

In [None]:
# Share of each class as a percentage of all rows.
df["class"].value_counts()/df["class"].shape[0]*100

In [None]:
df.head()

In [None]:
# English stop words from NLTK's stopwords corpus; consulted by remove_stopwords.
stopword_list = nltk.corpus.stopwords.words("english")

In [None]:
# Short alias for NLTK's word tokenizer; used by the token-based helpers below.
tokener = nltk.word_tokenize

In [None]:
# For every message, find its single most frequent token and that token's count.
top_tokens = [
    Counter(tokener(message)).most_common(1)[0]
    for message in df["message"]
]

common_words = {
    "word": [token for token, _ in top_tokens],
    "count": [frequency for _, frequency in top_tokens],
}

In [None]:
common_words_df = pd.concat([df, pd.DataFrame(common_words)], axis = 1)

In [None]:
words_cloud = wordcloud.WordCloud().generate_from_text(' '.join(common_words_df["word"]))

In [None]:
plt.figure(figsize=(10,6))
plt.imshow(words_cloud)

In [None]:
df = df.drop("len", axis = 1)

In [None]:
df.head()

In [None]:
df.message = df.message.apply(str.lower)

In [None]:
def remove_whitespace(text):
    """Replace each whitespace character in ``text`` with a single plain space.

    Every character listed in ``string.whitespace`` (space, tab, newline,
    carriage return, vertical tab, form feed) is substituted one-for-one, so
    runs of whitespace keep their length; collapsing runs is left to
    ``remove_extraspace``.

    Parameters
    ----------
    text : str
        Text to normalise.

    Returns
    -------
    str
        ``text`` with every whitespace character turned into ``" "``.
    """
    # The characters must sit inside a character class: used bare as a
    # pattern, string.whitespace only matches that exact six-character
    # sequence and never matches a lone tab or newline.
    whitespace_class = "[" + string.whitespace + "]"

    clean_words = re.sub(whitespace_class, " ", text)

    return clean_words

# Normalise whitespace characters in every message.
df["message"] = df["message"].apply(remove_whitespace)

In [None]:
def remove_punch(text):
    """Return ``str(text)`` with every ``string.punctuation`` character deleted.

    Characters are removed outright (not replaced by a space), so
    ``"don't"`` becomes ``"dont"``.
    """
    punctuation_chars = set(string.punctuation)

    return "".join(ch for ch in str(text) if ch not in punctuation_chars)

df.message = df.message.apply(remove_punch)

In [None]:
df.message[4]

In [None]:
def remove_stopwords(text):
    """Tokenize ``text`` and return the tokens that are not stop words.

    Tokens come from ``tokener``; any token found in ``stopword_list`` is
    dropped and the rest are joined back together with single spaces.
    """
    kept_tokens = [token for token in tokener(text) if token not in stopword_list]

    return ' '.join(kept_tokens)

In [None]:
df.message = df.message.apply(remove_stopwords)

In [None]:
ham_class = df[df["class"] == "ham"]
spam_class = df[df["class"] == "spam"]

In [None]:
ham_text = ' '.join(ham_class.message)
spam_text = ' '.join(spam_class.message)

In [None]:
ham_wordcloud = wordcloud.WordCloud().generate_from_text(ham_text)
spam_wordcloud = wordcloud.WordCloud().generate_from_text(spam_text)

In [None]:
plt.figure(figsize=(16,12))
plt.subplot(121)
plt.title("Word Cloud of Ham Class After Pre-Processing")
plt.imshow(ham_wordcloud)
plt.subplot(122)
plt.title("Word Cloud of Spam Class After Pre-Processing")
plt.imshow(spam_wordcloud)

In [None]:
def remove_digits(text):
    """Return ``str(text)`` with the ASCII digits ``0``-``9`` removed."""
    return "".join(ch for ch in str(text) if ch not in string.digits)

In [None]:
df.message = df.message.apply(remove_digits)

In [None]:
df.message[2]

In [None]:
remove_whitespace(df.message[2])

In [None]:
def remove_extraspace(text):
    """Collapse every run of whitespace in ``text`` into one space.

    Leading and trailing whitespace is collapsed too, not stripped.
    """
    whitespace_run = re.compile(r"\s+")

    return whitespace_run.sub(" ", text)

In [None]:
df.message = df.message.apply(remove_extraspace)

In [None]:
lm = nltk.WordNetLemmatizer()

In [None]:
def lemma(text):
    """Lemmatize ``text`` token by token.

    The text is split with ``tokener``, each token is passed through
    ``lm.lemmatize``, and the results are joined with single spaces.
    """
    return ' '.join(lm.lemmatize(token) for token in tokener(text))

In [None]:
df["lemma_message"] = df.message.apply(lemma)

In [None]:
df.head(5)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
final_df = df.iloc[:, [0, 2]]

In [None]:
X = df.iloc[:, 1]
y = df.iloc[:, 0]

In [None]:
from sklearn.preprocessing import LabelBinarizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
lb = LabelBinarizer()

In [None]:
scaled_y = lb.fit_transform(y)

In [None]:
tf_idf = TfidfVectorizer(ngram_range=(1,3))

In [None]:
scaled_x = tf_idf.fit_transform(X)

In [None]:
import sklearn
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier, PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
# 10-fold stratified splitter shared by every model comparison below.
kfold = StratifiedKFold(n_splits=10)

In [None]:
def evaluate_model(y_train, y_test, y_train_predict, y_test_predict):
    """Print a train-versus-test metric report for one set of predictions.

    The report covers, in order: accuracy and error rate (1 - accuracy),
    F1, log loss, the classification report, F-beta with ``beta = 0.5``
    and the confusion matrix. Nothing is returned; everything is printed.

    Parameters
    ----------
    y_train, y_test
        True labels of the train and test parts.
    y_train_predict, y_test_predict
        Predictions for the train and test parts.

    NOTE(review): the call sites in this notebook pass hard ``.predict()``
    labels, so the log-loss figures are computed from class labels rather
    than probabilities. Confirm that this is intended.
    """
    metrics = sklearn.metrics

    def on_both_parts(metric, **options):
        # Score the train part first, then the test part.
        train_value = metric(y_train, y_train_predict, **options)
        test_value = metric(y_test, y_test_predict, **options)
        return train_value, test_value

    # --- accuracy and error rate ---
    print("*******Accuracy*******\n")
    train_accuracy, test_accuracy = on_both_parts(metrics.accuracy_score)
    print("Train Accuracy: %s" % (train_accuracy,))
    print("Test Accuracy: %s" % (test_accuracy,))

    train_error, test_error = 1 - train_accuracy, 1 - test_accuracy
    print("\n")
    print("Train Error: %s" % (train_error,))
    print("Test Error: %s" % (test_error,))

    # --- F1 ---
    print("\n******* F1-Score **********")
    train_f1_score, test_f1_score = on_both_parts(metrics.f1_score)
    print("\n")
    print("Train F1-Score: %s" % (train_f1_score,))
    print("Test F1-Score: %s" % (test_f1_score,))

    # --- log loss ---
    print("\n********* Log Loss ***********")
    train_log_loss, test_log_loss = on_both_parts(metrics.log_loss)
    print("\n")
    print("Train Log Loss: %s" % (train_log_loss,))
    print("Test Log Loss: %s" % (test_log_loss,))

    # --- classification report, shown as a DataFrame ---
    print("\n********* Classification Report *********")
    train_report, test_report = on_both_parts(metrics.classification_report, output_dict = True)
    train_cf_report = pd.DataFrame(train_report)
    test_cf_report = pd.DataFrame(test_report)
    print("\n")
    print("Train Classification Report:")
    print(train_cf_report)
    print("Test Classification Report:")
    print(test_cf_report)

    # --- F-beta (beta = 0.5) ---
    print("\n********* F-Beta Score ********")
    train_fbeta_score, test_fbeta_score = on_both_parts(metrics.fbeta_score, beta = 0.5)
    print("\n")
    print("Train FBeta Score: %s" % (train_fbeta_score,))
    print("Test FBeta Score: %s" % (test_fbeta_score,))

    # --- confusion matrix ---
    print("\n******** Confustion Matrix *********")
    train_conf_mtx, test_conf_mtx = on_both_parts(metrics.confusion_matrix)
    print("\n")
    print("Train Confusion Matrix:")
    print(train_conf_mtx)
    print("Test Confusion Matrix:")
    print(test_conf_mtx)

## Logistic Regression

In [None]:
lr_train_accuracy = []
lr_test_accuracy = []
lr_train_f1_score = []
lr_test_f1_score = []

for train_idx, test_idx in kfold.split(scaled_x, scaled_y):
    x_train, x_test, y_train, y_test = scaled_x[train_idx], scaled_x[test_idx], scaled_y[train_idx], scaled_y[test_idx]
    lr = LogisticRegression(random_state = 42)
    lr.fit(x_train, y_train)
    train_predict = lr.predict(x_train)
    test_predict = lr.predict(x_test)
    
    train_accu = sklearn.metrics.accuracy_score(y_train, train_predict)
    test_accu = sklearn.metrics.accuracy_score(y_test, test_predict)
    
    train_f1 = sklearn.metrics.f1_score(y_train, train_predict)
    test_f1 = sklearn.metrics.f1_score(y_test, test_predict)
    
    lr_train_accuracy.append(train_accu)
    lr_test_accuracy.append(test_accu)
    lr_train_f1_score.append(train_f1)
    lr_test_f1_score.append(test_f1)

In [None]:
plt.figure(figsize=(16,4))
plt.subplot(121)
plt.title("Train Accuracy v/s Test Accuracy: %s" %(np.mean(lr_test_accuracy)))
plt.plot(lr_train_accuracy, marker = "o")
plt.plot(lr_test_accuracy, marker = "o", linestyle = "--")
plt.legend(["Train Accuracy", "Test Accuracy"])
plt.subplot(122)
plt.title("Train F1 Score v/s Test F1 Score: %s" %(np.mean(lr_test_f1_score)))
plt.plot(lr_train_f1_score, marker = "o", color = "r")
plt.plot(lr_test_f1_score,  marker = "o", linestyle = "--", color = "orange")
plt.legend(["Train F1-Score", "Test F1-Score"])

## K-Nearest Neighbor

In [None]:
knn_train_accuracy = []
knn_test_accuracy = []
knn_train_f1_score = []
knn_test_f1_score = []

for train_idx, test_idx in kfold.split(scaled_x, scaled_y):
    x_train, x_test, y_train, y_test = scaled_x[train_idx], scaled_x[test_idx], scaled_y[train_idx], scaled_y[test_idx]
    knn = KNeighborsClassifier()
    knn.fit(x_train, y_train)
    train_predict = knn.predict(x_train)
    test_predict = knn.predict(x_test)
    
    train_accu = sklearn.metrics.accuracy_score(y_train, train_predict)
    test_accu = sklearn.metrics.accuracy_score(y_test, test_predict)
    
    train_f1 = sklearn.metrics.f1_score(y_train, train_predict)
    test_f1 = sklearn.metrics.f1_score(y_test, test_predict)
    
    knn_train_accuracy.append(train_accu)
    knn_test_accuracy.append(test_accu)
    knn_train_f1_score.append(train_f1)
    knn_test_f1_score.append(test_f1)

In [None]:
plt.figure(figsize=(16,4))
plt.subplot(121)
plt.title("Train Accuracy v/s Test Accuracy: %s" %(np.mean(knn_test_accuracy)))
plt.plot(knn_train_accuracy, marker = "o")
plt.plot(knn_test_accuracy, marker = "o", linestyle = "--")
plt.legend(["Train Accuracy", "Test Accuracy"])
plt.subplot(122)
plt.title("Train F1 Score v/s Test F1 Score: %s" %(np.mean(knn_test_f1_score)))
plt.plot(knn_train_f1_score, marker = "o", color = "r")
plt.plot(knn_test_f1_score,  marker = "o", linestyle = "--", color = "orange")
plt.legend(["Train F1-Score", "Test F1-Score"])

## Decision Trees

In [None]:
dt_train_accuracy = []
dt_test_accuracy = []
dt_train_f1_score = []
dt_test_f1_score = []

for train_idx, test_idx in kfold.split(scaled_x, scaled_y):
    x_train, x_test, y_train, y_test = scaled_x[train_idx], scaled_x[test_idx], scaled_y[train_idx], scaled_y[test_idx]
    dt = DecisionTreeClassifier(random_state=42)
    dt.fit(x_train, y_train)
    train_predict = dt.predict(x_train)
    test_predict = dt.predict(x_test)
    
    train_accu = sklearn.metrics.accuracy_score(y_train, train_predict)
    test_accu = sklearn.metrics.accuracy_score(y_test, test_predict)
    
    train_f1 = sklearn.metrics.f1_score(y_train, train_predict)
    test_f1 = sklearn.metrics.f1_score(y_test, test_predict)
    
    dt_train_accuracy.append(train_accu)
    dt_test_accuracy.append(test_accu)
    dt_train_f1_score.append(train_f1)
    dt_test_f1_score.append(test_f1)

In [None]:
plt.figure(figsize=(16,4))
plt.subplot(121)
plt.title("Train Accuracy v/s Test Accuracy: %s" %(np.mean(dt_test_accuracy)))
plt.plot(dt_train_accuracy, marker = "o")
plt.plot(dt_test_accuracy, marker = "o", linestyle = "--")
plt.legend(["Train Accuracy", "Test Accuracy"])
plt.subplot(122)
plt.title("Train F1 Score v/s Test F1 Score: %s" %(np.mean(dt_test_f1_score)))
plt.plot(dt_train_f1_score, marker = "o", color = "r")
plt.plot(dt_test_f1_score,  marker = "o", linestyle = "--", color = "orange")
plt.legend(["Train F1-Score", "Test F1-Score"])

## Light GBM Classifier

In [None]:
lgbm_train_accuracy = []
lgbm_test_accuracy = []
lgbm_train_f1_score = []
lgbm_test_f1_score = []

for train_idx, test_idx in kfold.split(scaled_x, scaled_y):
    x_train, x_test, y_train, y_test = scaled_x[train_idx], scaled_x[test_idx], scaled_y[train_idx], scaled_y[test_idx]
    lgbm = LGBMClassifier(random_state=42)
    lgbm.fit(x_train, y_train)
    train_predict = lgbm.predict(x_train)
    test_predict = lgbm.predict(x_test)
    
    train_accu = sklearn.metrics.accuracy_score(y_train, train_predict)
    test_accu = sklearn.metrics.accuracy_score(y_test, test_predict)
    
    train_f1 = sklearn.metrics.f1_score(y_train, train_predict)
    test_f1 = sklearn.metrics.f1_score(y_test, test_predict)
    
    lgbm_train_accuracy.append(train_accu)
    lgbm_test_accuracy.append(test_accu)
    lgbm_train_f1_score.append(train_f1)
    lgbm_test_f1_score.append(test_f1)

In [None]:
plt.figure(figsize=(16,4))
plt.subplot(121)
plt.title("Train Accuracy v/s Test Accuracy: %s" %(np.mean(lgbm_test_accuracy)))
plt.plot(lgbm_train_accuracy, marker = "o")
plt.plot(lgbm_test_accuracy, marker = "o", linestyle = "--")
plt.legend(["Train Accuracy", "Test Accuracy"])
plt.subplot(122)
plt.title("Train F1 Score v/s Test F1 Score: %s" %(np.mean(lgbm_test_f1_score)))
plt.plot(lgbm_train_f1_score, marker = "o", color = "r")
plt.plot(lgbm_test_f1_score,  marker = "o", linestyle = "--", color = "orange")
plt.legend(["Train F1-Score", "Test F1-Score"])

## XGBoost Classifier

In [None]:
xgb_train_accuracy = []
xgb_test_accuracy = []
xgb_train_f1_score = []
xgb_test_f1_score = []

for train_idx, test_idx in kfold.split(scaled_x, scaled_y):
    x_train, x_test, y_train, y_test = scaled_x[train_idx], scaled_x[test_idx], scaled_y[train_idx], scaled_y[test_idx]
    xgb = XGBClassifier(random_state=42)
    xgb.fit(x_train, y_train)
    train_predict = xgb.predict(x_train)
    test_predict = xgb.predict(x_test)
    
    train_accu = sklearn.metrics.accuracy_score(y_train, train_predict)
    test_accu = sklearn.metrics.accuracy_score(y_test, test_predict)
    
    train_f1 = sklearn.metrics.f1_score(y_train, train_predict)
    test_f1 = sklearn.metrics.f1_score(y_test, test_predict)
    
    xgb_train_accuracy.append(train_accu)
    xgb_test_accuracy.append(test_accu)
    xgb_train_f1_score.append(train_f1)
    xgb_test_f1_score.append(test_f1)

In [None]:
plt.figure(figsize=(16,4))
plt.subplot(121)
plt.title("Train Accuracy v/s Test Accuracy: %s" %(np.mean(xgb_test_accuracy)))
plt.plot(xgb_train_accuracy, marker = "o")
plt.plot(xgb_test_accuracy, marker = "o", linestyle = "--")
plt.legend(["Train Accuracy", "Test Accuracy"])
plt.subplot(122)
plt.title("Train F1 Score v/s Test F1 Score: %s" %(np.mean(xgb_test_f1_score)))
plt.plot(xgb_train_f1_score, marker = "o", color = "r")
plt.plot(xgb_test_f1_score,  marker = "o", linestyle = "--", color = "orange")
plt.legend(["Train F1-Score", "Test F1-Score"])

## Multinomial Naive Bayes

In [None]:
multinom_train_accuracy = []
multinom_test_accuracy = []
multinom_train_f1_score = []
multinom_test_f1_score = []

for train_idx, test_idx in kfold.split(scaled_x, scaled_y):
    x_train, x_test, y_train, y_test = scaled_x[train_idx], scaled_x[test_idx], scaled_y[train_idx], scaled_y[test_idx]
    multinom = MultinomialNB()
    multinom.fit(x_train, y_train)
    train_predict = multinom.predict(x_train)
    test_predict = multinom.predict(x_test)
    
    train_accu = sklearn.metrics.accuracy_score(y_train, train_predict)
    test_accu = sklearn.metrics.accuracy_score(y_test, test_predict)
    
    train_f1 = sklearn.metrics.f1_score(y_train, train_predict)
    test_f1 = sklearn.metrics.f1_score(y_test, test_predict)
    
    multinom_train_accuracy.append(train_accu)
    multinom_test_accuracy.append(test_accu)
    multinom_train_f1_score.append(train_f1)
    multinom_test_f1_score.append(test_f1)

In [None]:
plt.figure(figsize=(16,4))
plt.subplot(121)
plt.title("Train Accuracy v/s Test Accuracy: %s" %(np.mean(multinom_test_accuracy)))
plt.plot(multinom_train_accuracy, marker = "o")
plt.plot(multinom_test_accuracy, marker = "o", linestyle = "--")
plt.legend(["Train Accuracy", "Test Accuracy"])
plt.subplot(122)
plt.title("Train F1 Score v/s Test F1 Score: %s" %(np.mean(multinom_test_f1_score)))
plt.plot(multinom_train_f1_score, marker = "o", color = "r")
plt.plot(multinom_test_f1_score,  marker = "o", linestyle = "--", color = "orange")
plt.legend(["Train F1-Score", "Test F1-Score"])

## SVM Classifier - Radial Bias Function

In [None]:
svc_classif_train_accuracy = []
svc_classif_test_accuracy = []
svc_classif_train_f1_score = []
svc_classif_test_f1_score = []

for train_idx, test_idx in kfold.split(scaled_x, scaled_y):
    x_train, x_test, y_train, y_test = scaled_x[train_idx], scaled_x[test_idx], scaled_y[train_idx], scaled_y[test_idx]
    svc_classif = SVC(random_state=42)
    svc_classif.fit(x_train, y_train)
    train_predict = svc_classif.predict(x_train)
    test_predict = svc_classif.predict(x_test)
    
    train_accu = sklearn.metrics.accuracy_score(y_train, train_predict)
    test_accu = sklearn.metrics.accuracy_score(y_test, test_predict)
    
    train_f1 = sklearn.metrics.f1_score(y_train, train_predict)
    test_f1 = sklearn.metrics.f1_score(y_test, test_predict)
    
    svc_classif_train_accuracy.append(train_accu)
    svc_classif_test_accuracy.append(test_accu)
    svc_classif_train_f1_score.append(train_f1)
    svc_classif_test_f1_score.append(test_f1)

In [None]:
plt.figure(figsize=(16,4))
plt.subplot(121)
plt.title("Train Accuracy v/s Test Accuracy: %s" %(np.mean(svc_classif_test_accuracy)))
plt.plot(svc_classif_train_accuracy, marker = "o")
plt.plot(svc_classif_test_accuracy, marker = "o", linestyle = "--")
plt.legend(["Train Accuracy", "Test Accuracy"])
plt.subplot(122)
plt.title("Train F1 Score v/s Test F1 Score: %s" %(np.mean(svc_classif_test_f1_score)))
plt.plot(svc_classif_train_f1_score, marker = "o", color = "r")
plt.plot(svc_classif_test_f1_score,  marker = "o", linestyle = "--", color = "orange")
plt.legend(["Train F1-Score", "Test F1-Score"])

## SGD Classifier

In [None]:
sgd_train_accuracy = []
sgd_test_accuracy = []
sgd_train_f1_score = []
sgd_test_f1_score = []

for train_idx, test_idx in kfold.split(scaled_x, scaled_y):
    x_train, x_test, y_train, y_test = scaled_x[train_idx], scaled_x[test_idx], scaled_y[train_idx], scaled_y[test_idx]
    sgd = SGDClassifier(random_state=42)
    sgd.fit(x_train, y_train)
    train_predict = sgd.predict(x_train)
    test_predict = sgd.predict(x_test)
    
    train_accu = sklearn.metrics.accuracy_score(y_train, train_predict)
    test_accu = sklearn.metrics.accuracy_score(y_test, test_predict)
    
    train_f1 = sklearn.metrics.f1_score(y_train, train_predict)
    test_f1 = sklearn.metrics.f1_score(y_test, test_predict)
    
    sgd_train_accuracy.append(train_accu)
    sgd_test_accuracy.append(test_accu)
    sgd_train_f1_score.append(train_f1)
    sgd_test_f1_score.append(test_f1)

In [None]:
plt.figure(figsize=(16,4))
plt.subplot(121)
plt.title("Train Accuracy v/s Test Accuracy: %s" %(np.mean(sgd_test_accuracy)))
plt.plot(sgd_train_accuracy, marker = "o")
plt.plot(sgd_test_accuracy, marker = "o", linestyle = "--")
plt.legend(["Train Accuracy", "Test Accuracy"])
plt.subplot(122)
plt.title("Train F1 Score v/s Test F1 Score: %s" %(np.mean(sgd_test_f1_score)))
plt.plot(sgd_train_f1_score, marker = "o", color = "r")
plt.plot(sgd_test_f1_score,  marker = "o", linestyle = "--", color = "orange")
plt.legend(["Train F1-Score", "Test F1-Score"])

## Passive Aggressive Classifier

In [None]:
pa_classif_train_accuracy = []
pa_classif_test_accuracy = []
pa_classif_train_f1_score = []
pa_classif_test_f1_score = []

for train_idx, test_idx in kfold.split(scaled_x, scaled_y):
    x_train, x_test, y_train, y_test = scaled_x[train_idx], scaled_x[test_idx], scaled_y[train_idx], scaled_y[test_idx]
    pa_classif = PassiveAggressiveClassifier(random_state=42)
    pa_classif.fit(x_train, y_train)
    train_predict = pa_classif.predict(x_train)
    test_predict = pa_classif.predict(x_test)
    
    train_accu = sklearn.metrics.accuracy_score(y_train, train_predict)
    test_accu = sklearn.metrics.accuracy_score(y_test, test_predict)
    
    train_f1 = sklearn.metrics.f1_score(y_train, train_predict)
    test_f1 = sklearn.metrics.f1_score(y_test, test_predict)
    
    pa_classif_train_accuracy.append(train_accu)
    pa_classif_test_accuracy.append(test_accu)
    pa_classif_train_f1_score.append(train_f1)
    pa_classif_test_f1_score.append(test_f1)

In [None]:
plt.figure(figsize=(16,4))
plt.subplot(121)
plt.title("Train Accuracy v/s Test Accuracy: %s" %(np.mean(pa_classif_test_accuracy)))
plt.plot(pa_classif_train_accuracy, marker = "o")
plt.plot(pa_classif_test_accuracy, marker = "o", linestyle = "--")
plt.legend(["Train Accuracy", "Test Accuracy"])
plt.subplot(122)
plt.title("Train F1 Score v/s Test F1 Score: %s" %(np.mean(pa_classif_test_f1_score)))
plt.plot(pa_classif_train_f1_score, marker = "o", color = "r")
plt.plot(pa_classif_test_f1_score,  marker = "o", linestyle = "--", color = "orange")
plt.legend(["Train F1-Score", "Test F1-Score"])

## Ridge Classifier

In [None]:
ridge_classif_train_accuracy = []
ridge_classif_test_accuracy = []
ridge_classif_train_f1_score = []
ridge_classif_test_f1_score = []

for train_idx, test_idx in kfold.split(scaled_x, scaled_y):
    x_train, x_test, y_train, y_test = scaled_x[train_idx], scaled_x[test_idx], scaled_y[train_idx], scaled_y[test_idx]
    ridge_classif = RidgeClassifier(random_state=42)
    ridge_classif.fit(x_train, y_train)
    train_predict = ridge_classif.predict(x_train)
    test_predict = ridge_classif.predict(x_test)
    
    train_accu = sklearn.metrics.accuracy_score(y_train, train_predict)
    test_accu = sklearn.metrics.accuracy_score(y_test, test_predict)
    
    train_f1 = sklearn.metrics.f1_score(y_train, train_predict)
    test_f1 = sklearn.metrics.f1_score(y_test, test_predict)
    
    ridge_classif_train_accuracy.append(train_accu)
    ridge_classif_test_accuracy.append(test_accu)
    ridge_classif_train_f1_score.append(train_f1)
    ridge_classif_test_f1_score.append(test_f1)

In [None]:
plt.figure(figsize=(16,4))
plt.subplot(121)
plt.title("Train Accuracy v/s Test Accuracy: %s" %(np.mean(ridge_classif_test_accuracy)))
plt.plot(ridge_classif_train_accuracy, marker = "o")
plt.plot(ridge_classif_test_accuracy, marker = "o", linestyle = "--")
plt.legend(["Train Accuracy", "Test Accuracy"])
plt.subplot(122)
plt.title("Train F1 Score v/s Test F1 Score: %s" %(np.mean(ridge_classif_test_f1_score)))
plt.plot(ridge_classif_train_f1_score, marker = "o", color = "r")
plt.plot(ridge_classif_test_f1_score,  marker = "o", linestyle = "--", color = "orange")
plt.legend(["Train F1-Score", "Test F1-Score"])

In [None]:
# One row per model, one column per CV fold. Every table must take the Ridge
# row from the list that matches the table's own metric and split; mixing in
# Ridge's train scores makes Ridge look better on the test tables than it is.
model_names = ["Logistic", "KNN", "DecisionTree", "LightGBM", "XGB", "Multinom Bayes", "SVM", "SGD", "Passive Classifier", "Ridge"]

train_accuracy_score = pd.DataFrame(
    [lr_train_accuracy, knn_train_accuracy, dt_train_accuracy, lgbm_train_accuracy, xgb_train_accuracy,
     multinom_train_accuracy, svc_classif_train_accuracy, sgd_train_accuracy, pa_classif_train_accuracy,
     ridge_classif_train_accuracy],
    index = model_names,
)
test_accuracy_score = pd.DataFrame(
    [lr_test_accuracy, knn_test_accuracy, dt_test_accuracy, lgbm_test_accuracy, xgb_test_accuracy,
     multinom_test_accuracy, svc_classif_test_accuracy, sgd_test_accuracy, pa_classif_test_accuracy,
     ridge_classif_test_accuracy],
    index = model_names,
)
train_f1_score = pd.DataFrame(
    [lr_train_f1_score, knn_train_f1_score, dt_train_f1_score, lgbm_train_f1_score, xgb_train_f1_score,
     multinom_train_f1_score, svc_classif_train_f1_score, sgd_train_f1_score, pa_classif_train_f1_score,
     ridge_classif_train_f1_score],
    index = model_names,
)
test_f1_score = pd.DataFrame(
    [lr_test_f1_score, knn_test_f1_score, dt_test_f1_score, lgbm_test_f1_score, xgb_test_f1_score,
     multinom_test_f1_score, svc_classif_test_f1_score, sgd_test_f1_score, pa_classif_test_f1_score,
     ridge_classif_test_f1_score],
    index = model_names,
)

In [None]:
from IPython.display import display, HTML

In [None]:
# Stylesheet wrapped in a <style> tag and rendered as this cell's HTML output.
# NOTE(review): `col` is not a valid `flex-direction` value (valid values are
# `row` / `column` and their `-reverse` forms), so browsers ignore this rule.
# Confirm the intended layout before changing it.
CSS = """
.output {
    flex-direction: col;
}
"""

HTML('<style>{}</style>'.format(CSS))

In [None]:
# Show the four per-fold score tables (rows: models, columns: folds) with a
# blue gradient; each display call is given its own display_id.
display(train_accuracy_score.style.background_gradient(cmap = "Blues"), display_id = 'train_acc')
display(test_accuracy_score.style.background_gradient(cmap = "Blues"), display_id = 'test_acc')
display(train_f1_score.style.background_gradient(cmap = "Blues"), display_id = 'train_f1_score')
display(test_f1_score.style.background_gradient(cmap = "Blues"), display_id = 'test_f1_score')

In [None]:
model_avg_scores = {'model': [], "train_accuracy": [], "test_accuracy": [], "train_f1": [], "test_f1": []}

for idx1, row1 in train_accuracy_score.iterrows():
    model_avg_scores["model"].append(idx1)
    model_avg_scores['train_accuracy'].append(np.mean(row1))
    
for idx2, row2 in test_accuracy_score.iterrows():
    model_avg_scores['test_accuracy'].append(np.mean(row2))
    
for idx3, row3 in train_f1_score.iterrows():
    model_avg_scores['train_f1'].append(np.mean(row3))
    
for idx4, row4 in test_f1_score.iterrows():
    model_avg_scores['test_f1'].append(np.mean(row4))

In [None]:
model_score_df = pd.DataFrame(model_avg_scores)

In [None]:
# One horizontal bar chart per averaged metric, laid out on a 2x2 grid:
# (subplot position, score column, panel title, bar colour).
score_panels = [
    (221, "train_accuracy", "Model's Train Accuracy Score", "skyblue"),
    (222, "test_accuracy", "Model's Test Accuracy Score", "g"),
    (223, "train_f1", "Model's Train F1 Score", "skyblue"),
    (224, "test_f1", "Model's Test F1 Score", "g"),
]

plt.figure(figsize=(18,14))
for position, column, title, color in score_panels:
    plt.subplot(position)
    plt.title(title)
    sns.barplot(x = model_score_df[column], y = model_score_df["model"], color = color)
    # Rotation must be numeric (or 'vertical'/'horizontal'); the string "90"
    # is rejected by current Matplotlib.
    plt.xticks(rotation = 90)
    # Write the rounded score at the end of each bar.
    for idx, val in enumerate(model_score_df[column]):
        plt.text(val, idx, round(float(val),3))
plt.tight_layout()
plt.show()

As you can see above, the KNN algorithm is the worst model when it comes to handling the imbalanced dataset. The accuracy of KNN looks fairly decent, but its F1 score shows that the model is not performing well. So in the presence of class imbalance, accuracy is not an ideal metric to look at; I have included it only to compare the two metrics on an imbalanced dataset.

If I have to choose 2-3 algorithms for hyperparameter optimization, I prefer the Ridge Classifier and the Passive Aggressive Classifier for the next step.

In [None]:
# Averaged scores per model, shaded with the reversed "Blues" colormap.
model_score_df.style.background_gradient(cmap = "Blues_r")

## Hyperparameter Optimization 

In this section we perform hyperparameter optimization on both models to see whether we can improve their performance further by searching over their parameters.

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score

### Passive Aggressive Classifier Hyperparameter Tuning with Balanced Weight

In [None]:
# pa_weights = np.linspace(0.0,0.99,200)
# [{0:x, 1:1.0-x} for x in pa_weights]

In [None]:
# Candidate values sampled by the randomized search for the
# Passive-Aggressive classifier; class_weight is fixed to "balanced".
pa_param_grid = dict(
    C=[0.0001, 0.001, 0.01, 1.0, 10, 100],
    fit_intercept=[True, False],
    max_iter=[100, 500, 1000],
    tol=[0.0001, 0.001, 0.005, 0.01, 0.05, 0.1, 1.0, 0.0005],
    shuffle=[True, False],
    class_weight=["balanced"],
)

In [None]:
pa_classif = PassiveAggressiveClassifier(random_state=42, verbose = 0)

In [None]:
random_search = RandomizedSearchCV(estimator = pa_classif, param_distributions = pa_param_grid, scoring = "f1", cv = 10, random_state = 42)

In [None]:
x_train, x_test, y_train, y_test = train_test_split(scaled_x, scaled_y, test_size = 0.30, random_state = 42)

In [None]:
random_search.fit(x_train, y_train)

In [None]:
random_search.best_score_

In [None]:
random_search.best_estimator_

In [None]:
plt.figure(figsize=(10,5))
plt.title("Passive Classifier Mean Test Score Plot")
plt.plot(random_search.cv_results_["mean_test_score"], marker = "o")
plt.xlabel("Cross Validation")
plt.ylabel("Mean Test Score")

In [None]:
random_search.cv_results_["mean_test_score"]

In [None]:
y_train_predict = random_search.predict(x_train)

In [None]:
y_test_predict = random_search.predict(x_test)

In [None]:
evaluate_model(y_train, y_test, y_train_predict, y_test_predict)

### Ridge Classifier Hyperparameter Tuning

In [None]:
# Candidate values sampled by the randomized search for RidgeClassifier;
# class_weight is fixed to 'balanced'.
# NOTE(review): `normalize` is only accepted by older scikit-learn releases;
# confirm the installed version still supports it.
ridge_params = {
    'alpha': [1.0,0.1,0.001,0.0001,0.5,0.005,0.0005,10,25,50,75],
    'fit_intercept': [True, False],
    'normalize': [True, False],
    'tol': [0.001,0.01,0.1,0.005,0.05,0.5,0.2,0.02,0.002],
    'class_weight': ['balanced'],
    # None is the documented "no explicit limit" value for max_iter. False is
    # not a valid setting: where it is not rejected outright it behaves like
    # 0 iterations.
    'max_iter': [100,500,1000, None]
}

In [None]:
ridge_classifier_hp = RidgeClassifier(random_state = 0)

In [None]:
ridge_random_search = RandomizedSearchCV(estimator=ridge_classifier_hp, param_distributions=ridge_params, cv = 10, scoring = 'f1', random_state = 42, verbose = 0)

In [None]:
ridge_random_search.fit(x_train, y_train)

In [None]:
ridge_random_search.best_estimator_

In [None]:
ridge_random_search.best_score_

In [None]:
ridge_random_search.cv_results_["mean_test_score"]

In [None]:
plt.figure(figsize=(10,5))
plt.title("Ridge Classifier Mean Test Score Plot")
plt.plot(ridge_random_search.cv_results_["mean_test_score"], marker = "o")
plt.xlabel("Cross Validation")
plt.ylabel("Mean Test Score")

In [None]:
ridge_ytrain_predict = ridge_random_search.predict(x_train)
ridge_ytest_predict = ridge_random_search.predict(x_test)

In [None]:
evaluate_model(y_train, y_test, ridge_ytrain_predict, ridge_ytest_predict)

Based on the hyperparameter tuning, the Ridge Classifier performs slightly better at prediction and at reducing false positives, so I will pick the Ridge Classifier for my final model. Next, we perform a grid search with cross-validation around the parameters we got from the randomized search.

### Grid Search Cross Validation - Ridge Classifier

In [None]:
# RidgeClassifier(alpha=75, class_weight='balanced', max_iter=500, normalize=True,
#                 random_state=0, tol=0.01)

In [None]:
# Grid searched around the values found by the randomized search.
grid_params = dict(
    alpha=[25, 50, 60, 70, 75, 80, 85],
    max_iter=[400, 420, 450, 480, 500, 550, 600],
    tol=[0.01, 0.02, 0.03, 0.04, 0.05],
)

In [None]:
ridge = RidgeClassifier(random_state=0, class_weight='balanced', normalize=True)

In [None]:
grid_search = GridSearchCV(estimator=ridge, param_grid = grid_params, scoring = 'f1', cv = 10)

In [None]:
best_model = grid_search.fit(x_train, y_train)

In [None]:
best_model.best_estimator_

In [None]:
best_model.best_params_

In [None]:
best_model.best_score_

In [None]:
plt.figure(figsize=(10,6))
plt.title("Ridge Mean Test Score")
plt.plot(best_model.cv_results_["mean_test_score"])
plt.xlabel("Cross Validation")
plt.ylabel("Mean Test Score")

In [None]:
train_predict = best_model.predict(x_train)
test_predict = best_model.predict(x_test)

In [None]:
evaluate_model(y_train, y_test, train_predict, test_predict)

## Final Model

In [None]:
f_ridge =  RidgeClassifier(alpha=75, class_weight='balanced', max_iter=500, normalize=True, random_state=0, tol=0.01)

In [None]:
f_ridge.fit(x_train, y_train)

In [None]:
train_final_prediction = f_ridge.predict(x_train)

In [None]:
test_final_prediction = f_ridge.predict(x_test)

In [None]:
train_predict_df = pd.DataFrame(train_final_prediction)
test_predict_df = pd.DataFrame(test_final_prediction)

In [None]:
final_prediction = train_predict_df.append(test_predict_df).reset_index().drop("index", axis = 1)

In [None]:
final_df = df.iloc[:, :2]

In [None]:
final_df["class_predict"] = final_prediction

In [None]:
class_mapping = {0: "ham", 1:"spam"}

In [None]:
final_df.class_predict = final_df.class_predict.map(class_mapping)

In [None]:
# Class counts side by side: actual labels (left) and predicted labels (right).
# The column is passed as `x=` because the first positional parameter of
# countplot is `data` in current seaborn releases.
plt.figure(figsize=(12,6))
plt.subplot(121)
plt.title("Actual Class")
sns.countplot(x = final_df["class"])
plt.subplot(122)
plt.title("Predicted Class")
sns.countplot(x = final_df["class_predict"])
plt.tight_layout()

In [None]:
# Write label, message and predicted label to CSV, without the row index.
final_df.to_csv("spam_submission.csv", index = False)

### I hope you liked this kernel..!!!!