In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Read the CSV at ../input/spam-filter/emails.csv into a DataFrame.
emails = pd.read_csv(filepath_or_buffer="../input/spam-filter/emails.csv")
# A bare trailing expression lets the notebook render the frame.
emails

In [None]:
# Quick statistical summary of the frame (pandas' default `describe()` output).
emails.describe()

In [None]:
# Column names, dtypes, non-null counts and memory usage of the frame.
emails.info()

In [None]:
# Class balance: how many rows carry each distinct value of the `spam` column.
emails['spam'].value_counts()

In [None]:
# Count missing values per column. An element-wise boolean frame
# (`notnull()`) cannot be inspected by eye, and the text-cleaning cells below
# call string methods on every `text` entry, so any null would break them.
emails.isnull().sum()

In [None]:
# Lower-case the text in place so tokens differing only by case are merged.
# NOTE: this overwrites the original `text` column; re-load the CSV to recover it.
emails["text"] = emails["text"].str.lower()
emails.head()

In [None]:
import string
PUNCT_TO_REMOVE = string.punctuation
# Build the deletion table once: it is identical for every e-mail, so
# re-creating it inside the function on each call is loop-invariant work.
_PUNCT_TABLE = str.maketrans('', '', PUNCT_TO_REMOVE)


def remove_punctuation(text):
    """Return ``text`` with all ASCII punctuation characters deleted.

    Characters listed in ``string.punctuation`` are removed outright (not
    replaced by a space), so ``"don't"`` becomes ``"dont"``.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        ``text`` without punctuation characters.
    """
    return text.translate(_PUNCT_TABLE)

# Strip punctuation from every e-mail; the function is passed directly
# instead of being wrapped in a pass-through lambda.
emails["text"] = emails["text"].apply(remove_punctuation)
emails['text']

In [None]:
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))
STOPWORDS.add('subject')
# Punctuation has already been deleted from the text by the previous cell, so
# apostrophe-bearing stopwords such as "don't" now appear as "dont" and would
# never match. Add the punctuation-free variant of every stopword as well.
STOPWORDS |= {
    word.translate(str.maketrans('', '', string.punctuation))
    for word in STOPWORDS
}


def remove_stopwords(text):
    """Return ``text`` with every stopword token removed.

    The input is coerced with ``str()``, split on whitespace, filtered against
    the module-level ``STOPWORDS`` set, and re-joined with single spaces.

    Parameters
    ----------
    text : object
        Value to clean; converted to ``str`` before tokenising.

    Returns
    -------
    str
        The remaining tokens joined by a single space.
    """
    return " ".join(word for word in str(text).split() if word not in STOPWORDS)

# Drop stopwords from every e-mail; the function is passed directly
# instead of being wrapped in a pass-through lambda.
emails["text"] = emails["text"].apply(remove_stopwords)
emails['text']


In [None]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()


def lemmatize_words(text):
    """Lemmatize each whitespace-separated token of ``text``.

    Every token is passed to the shared ``WordNetLemmatizer`` instance with
    its default arguments, and the results are re-joined with single spaces.
    """
    tokens = text.split()
    lemmas = map(lemmatizer.lemmatize, tokens)
    return " ".join(lemmas)

# Lemmatize every e-mail; the function is passed directly
# instead of being wrapped in a pass-through lambda.
emails["text"] = emails["text"].apply(lemmatize_words)
emails.head()

In [None]:
# Features: the cleaned e-mail text. Target: the `spam` column.
X = emails['text']
y = emails['spam']

## Splitting data

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the e-mails for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Size of each split (one text entry per e-mail at this point).
print(X_train.shape, X_test.shape, sep="\n")

## TF-IDF vectorization of data

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Learn the vocabulary and IDF weights on the training text only, then reuse
# the fitted vectorizer on the test text so no test information leaks in.
# NOTE: X_train / X_test are rebound from text to TF-IDF matrices here, so
# this cell cannot be re-run without first re-running the split cell.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [None]:
# Shapes after vectorization: (documents, vocabulary terms) for each split.
print(X_train.shape, X_test.shape, sep="\n")

## Multinomial Naive Bayes for classifying data

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Baseline classifier with scikit-learn's default hyperparameters.
clf = MultinomialNB()
clf.fit(X_train, y_train)

In [None]:
# Predictions on the TRAINING split (used for the train metrics below).
y_pred = clf.predict(X_train)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
# Baseline model, training split: accuracy first, then F1.
trainacc = accuracy_score(y_true=y_train, y_pred=y_pred)
trainf1 = f1_score(y_true=y_train, y_pred=y_pred)
print(trainacc, trainf1, sep="\n")

In [None]:
# Predictions on the held-out TEST split.
y_pred_test = clf.predict(X_test)

In [None]:
# Baseline model, test split: accuracy first, then F1.
testacc = accuracy_score(y_true=y_test, y_pred=y_pred_test)
f1test = f1_score(y_true=y_test, y_pred=y_pred_test)
print(testacc, f1test, sep="\n")

## Confusion matrix for train data

In [None]:
import seaborn as sn
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_train, y_pred)
# fmt='d' prints the integer counts verbatim; seaborn's default '.2g' would
# abbreviate counts of three or more digits to scientific notation.
ax = sn.heatmap(cm, annot=True, fmt='d')
# confusion_matrix puts true labels on rows and predicted labels on columns.
ax.set(xlabel='Predicted label', ylabel='True label',
       title='Confusion matrix - train data');

## Confusion matrix for test data

In [None]:
cm1 = confusion_matrix(y_test, y_pred_test)
# fmt='d' prints the integer counts verbatim; seaborn's default '.2g' would
# abbreviate counts of three or more digits to scientific notation.
ax = sn.heatmap(cm1, annot=True, fmt='d')
# confusion_matrix puts true labels on rows and predicted labels on columns.
ax.set(xlabel='Predicted label', ylabel='True label',
       title='Confusion matrix - test data');

## Hyperparameter tuning to find the best alpha

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
# Candidate smoothing strengths for MultinomialNB, spanning 1e-8 to 50.
# NOTE: despite the "_KNN" suffix, both objects configure a MultinomialNB
# search; the names are kept because later cells refer to them.
params_KNN = {
    'alpha': [
        1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1,
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
        15, 20, 25, 30, 40, 50,
    ],
}
# Cross-validated search scored by F1; train scores are kept so they can be
# compared with the validation scores afterwards.
gs_KNN = GridSearchCV(
    MultinomialNB(),
    params_KNN,
    scoring='f1',
    verbose=1,  # verbose: the higher, the more messages
    return_train_score=True,
)

In [None]:
# Run the cross-validated search on the training split only, then keep the
# parameter combination with the best mean F1 score.
gs_KNN.fit(X_train, y_train)
best_parameters = gs_KNN.best_params_

In [None]:
# Display the winning hyperparameters found by the grid search.
best_parameters

In [None]:
# Refit with the alpha selected by the grid search rather than a hand-copied
# constant, so this cell stays correct if the data or the grid changes.
clf = MultinomialNB(**best_parameters)
clf.fit(X_train, y_train)
# Predictions on the TRAINING split for the tuned model.
y_pred = clf.predict(X_train)


In [None]:
from sklearn.metrics import accuracy_score
# Tuned model, training split: accuracy first, then F1.
trainacc = accuracy_score(y_true=y_train, y_pred=y_pred)
trainf1 = f1_score(y_true=y_train, y_pred=y_pred)
print(trainacc, trainf1, sep="\n")

In [None]:
# Tuned model, held-out test split: predict, then report accuracy and F1.
y_pred_test = clf.predict(X_test)
testacc = accuracy_score(y_true=y_test, y_pred=y_pred_test)
testf1 = f1_score(y_true=y_test, y_pred=y_pred_test)
print(testacc, testf1, sep="\n")

## Confusion matrix for train data (tuned model)

In [None]:
import seaborn as sn
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_train, y_pred)
# fmt='d' prints the integer counts verbatim; seaborn's default '.2g' would
# abbreviate counts of three or more digits to scientific notation.
ax = sn.heatmap(cm, annot=True, fmt='d')
# confusion_matrix puts true labels on rows and predicted labels on columns.
ax.set(xlabel='Predicted label', ylabel='True label',
       title='Confusion matrix - train data (tuned model)');

## Confusion matrix for test data

In [None]:
cm1 = confusion_matrix(y_test, y_pred_test)
# fmt='d' prints the integer counts verbatim; seaborn's default '.2g' would
# abbreviate counts of three or more digits to scientific notation.
ax = sn.heatmap(cm1, annot=True, fmt='d')
# confusion_matrix puts true labels on rows and predicted labels on columns.
ax.set(xlabel='Predicted label', ylabel='True label',
       title='Confusion matrix - test data (tuned model)');