# Email Spam Filter

## Initialization

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
import os
import glob
import email
import re
import string
from plotly import tools
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go

In [2]:
# Generic Path
path = 'data/'

In [3]:
# Ham Paths
hamPaths = glob.glob(path + 'ham/*')
ham2Paths = glob.glob(path + 'ham2/*')
toughHamPaths = glob.glob(path + 'toughHam/*')

# Spam Paths
spamPaths = glob.glob(path + 'spam/*')
spam2Paths = glob.glob(path + 'spam2/*')
print (len(toughHamPaths))

251


## Split training/testing data

In [4]:
def split(emailPaths):
    sample = np.array([train_test_split(x) for x in emailPaths])
#     print(sample.shape)
    train = np.array([])
    test = np.array([])
    for path in sample:   
        train = np.concatenate((train,path[0]),axis=0)
        test = np.concatenate((test,path[1]),axis=0)
#     print(train.shape, test.shape)
    return train, test

def createTrain_Test(ham, spam):
    # Create Labels
    ham_label = [0]*ham.shape[0]
    spam_label = [1]*spam.shape[0]
    x = np.concatenate((ham, spam))
    y = np.concatenate((ham_label, spam_label))
#     print(len(x), len(y))
    # Selet Random Emails
    shuffle_index = np.random.permutation(np.arange(0,x.shape[0]))
#     print(shuffle_index)
    x = x[shuffle_index]
    y = y[shuffle_index]
#     print(x.shape, y.shape)
    return x, y

In [5]:
# All Paths
allHamPaths = [hamPaths, ham2Paths, toughHamPaths]
allSpamPaths = [spamPaths, spam2Paths]

In [6]:
# Generate Ham and Spam Train/Test

ham_train, ham_test = split(allHamPaths)
spam_train, spam_test = split(allSpamPaths)

In [7]:
# Combine Ham and Spam, Create Labels

x_train_new, y_train_new = createTrain_Test(ham_train, spam_train)
x_test_new, y_test_new = createTrain_Test(ham_test, spam_test)

In [8]:
print(x_train_new.shape, y_train_new.shape)
print(x_test_new.shape, y_test_new.shape)

(4535,) (4535,)
(1516,) (1516,)


## Exploratory Data Analysis

In [9]:
def labelCount(y, val, status):
    
    # Separating Spam and Ham Labeled Emails
    spam_index = [i for i, label in enumerate(y) if label == 1]
    ham_index = [i for i, label in enumerate(y) if label == 0]
    
    spam_size = len(spam_index)
    ham_size = len(ham_index)
    total_size = spam_size + ham_size
    
#     print(spam_size, ham_size, total_size, len(y))
    
    # Label Count using Bar Chart
    trace = go.Bar(x = ["Spam","Ham"], y = [spam_size, ham_size],
        marker = dict(
            color = [spam_size, ham_size],
            colorscale = 'Picnic',
            reversescale = True
        ),
    )
    
    layout = go.Layout(title = val+' Data using Bar Chart '+status, font = dict(size = 18))
    fig = go.Figure(data = [trace], layout = layout)
    
    py.iplot(fig, filename="LabelCountBar")

    # Label Count using Pie Chart
    sizes = (np.array(([spam_size, ham_size]))/total_size*100)
    trace = go.Pie(labels = (np.array(["Spam", "Ham"])), values = sizes)
    layout = go.Layout(title = val+' Data using Pie Chart '+status, font = dict(size = 18), width = 650, height = 600,)
    fig = go.Figure(data = [trace], layout = layout)
    
    py.iplot(fig, filename="LabelCountPie")


In [10]:
labelCount(y_train_new, "Train", "Before Preprocessing")

In [11]:
labelCount(y_test_new, "Test", "Before Preprocessing")

In [12]:
def getBulkEmail(emails, labels):
    datas = [getContent(x) for x in emails]
    x, y = removeNull(datas, labels)
    return x, y

def getContent(emails):
    file = open(emails,encoding='latin1')
    try:
        message = email.message_from_file(file)
        for each_part in message.walk():
            if each_part.get_content_type() == 'text/plain':
                return each_part.get_payload() # prints the raw text
    except Exception as e:
        print(e)
        
def removeNull(datas,labels):
    not_null_idx = [i for i,o in enumerate(datas) if o is not None]
    return np.array(datas)[not_null_idx], np.array(labels)[not_null_idx]

In [13]:
# Get Email Contents from the Paths

x_train1, y_train = getBulkEmail(x_train_new, y_train_new)
x_test1, y_test = getBulkEmail(x_test_new, y_test_new)

In [14]:
print(x_train1.shape, y_train.shape)
print(x_test1.shape, y_test.shape)

(3821,) (3821,)
(1255,) (1255,)


## Data Cleaning

In [15]:
def removeHyperlink(word):
    return re.sub(r"http\S+", "", word)

def replaceNewline(word):
    return word.replace('\n',' ')

def removeSpecial(word):
    return re.sub('[^A-Za-z ]+', '', word)

def removeWhitespace(word):
    return word.strip()

def lowercase(word):
    return word.lower()

def dataCleanPipeline(sentence):
    utilsList = [
        removeHyperlink,
        replaceNewline,
        removeSpecial,
        removeWhitespace,
        lowercase
    ]
    for util in utilsList:
        sentence = util(sentence)
    return sentence

In [16]:
# Data Cleaning Pipeline

x_train2 = [dataCleanPipeline(sentence) for sentence in x_train1]
x_test2 = [dataCleanPipeline(sentence) for sentence in x_test1]

In [17]:
print(len(x_train2), len(y_train))
print(len(x_test2), len(y_test))

3821 3821
1255 1255


## Sentence Processing

In [18]:
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')


The sklearn.feature_extraction.stop_words module is  deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.feature_extraction.text. Anything that cannot be imported from sklearn.feature_extraction.text is now part of the private API.

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sanch\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sanch\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sanch\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stopwords = set(stopwords.words('english'))

In [20]:
def removeStopwords(words):
    result = [i for i in words if i not in stopwords]
    return result

def wordStemmer(words):
    return [stemmer.stem(o) for o in words]

def wordLemmatizer(words):
    return [lemmatizer.lemmatize(o) for o in words]

def sentenceProcessPipeline(sentence):
    utilsList = [
        removeStopwords,
#         wordStemmer,
        wordLemmatizer
    ]
    for util in utilsList:
        sentence = util(sentence)
#         print(util,': ',sentence,'\n')
    return sentence

In [21]:
# Word Tokenization

x_train3 = [word_tokenize(sentence) for sentence in x_train2]
x_test3 = [word_tokenize(sentence) for sentence in x_test2]

In [22]:
print(len(x_train3), len(y_train))
print(len(x_test3), len(y_test))

3821 3821
1255 1255


In [23]:
x_train = [sentenceProcessPipeline(sentence) for sentence in x_train3]
x_test = [sentenceProcessPipeline(sentence) for sentence in x_test3]

In [24]:
# Join the words into a string

x_train_str = [" ".join(words) for words in x_train]
x_test_str = [" ".join(words) for words in x_test]

## Visualization

In [25]:
# Separate Spam and Ham emails

spam_train_index = [i for i,o in enumerate(y_train) if o == 1]
spam_email = np.array(x_train_str)[spam_train_index]

ham_train_index = [i for i,o in enumerate(y_train) if o == 0]
ham_email = np.array(x_train_str)[ham_train_index]

In [26]:
print(len(spam_email), len(ham_email))

791 3030


### Bar Chart

In [27]:
from collections import defaultdict
import pandas as pd
import plotly.graph_objs as go
from plotly import tools
import plotly.offline as py
py.init_notebook_mode(connected=True)

In [28]:
## Generate NGrams of 
def generateNGrams(text, ngram=1):
    token = [token for token in text.lower().split(" ") if token != "" if token not in stopwords]
    ngrams = zip(*[token[i:] for i in range(ngram)])
    return [" ".join(ngram) for ngram in ngrams]

## custom function for horizontal bar chart ##
def horizontalBarChart(df, color):
    trace = go.Bar(
        y = df["word"].values[::-1],
        x = df["wordcount"].values[::-1],
        showlegend = False,
        orientation = 'h',
        marker = dict(
            color = color,
        ),
    )
    return trace

In [29]:
def generateBarChart(word_count=1):
    # Get the bar chart for Ham Emails
    frequency = defaultdict(int)
    for sentence in ham_email:
        for word in generateNGrams(sentence, word_count):
            frequency[word] += 1
    sortedFreq = pd.DataFrame(sorted(frequency.items(), key=lambda x: x[1])[::-1])
    sortedFreq.columns = ["word", "wordcount"]
    HamChart = horizontalBarChart(sortedFreq.head(25), 'blue')

    # Get the bar chart for Spam Emails
    frequency = defaultdict(int)
    for sentence in spam_email:
        for word in generateNGrams(sentence, word_count):
            frequency[word] += 1
    sortedFreq = pd.DataFrame(sorted(frequency.items(), key=lambda x: x[1])[::-1])
    sortedFreq.columns = ["word", "wordcount"]
    SpamChart = horizontalBarChart(sortedFreq.head(25), 'red')

    # Creating subplots for each label
    fig = tools.make_subplots(rows=1, cols=2,subplot_titles=["Frequent words in Ham Emails","Frequent words in Spam Emails"])
    fig.append_trace(HamChart, 1, 1)
    fig.append_trace(SpamChart, 1, 2)
    fig['layout'].update(height=1200, width=1000, paper_bgcolor='lightgray', title="Frequency Count Bar Chart")
    py.iplot(fig, filename='frequency-plots')

In [30]:
generateBarChart(1)


plotly.tools.make_subplots is deprecated, please use plotly.subplots.make_subplots instead



### Target Distribution

## Feature Extraction

### TfidVectorizer

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [32]:
tfidf_vectorizer = TfidfVectorizer()

In [33]:
# Train TF-IDF Vectorizer
tfidf_vectorizer.fit(x_train_str)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [34]:
x_train_features_tfidf = tfidf_vectorizer.transform(x_train_str)
x_test_features_tfidf = tfidf_vectorizer.transform(x_test_str)

In [35]:
print(x_train_features_tfidf.toarray())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


### CountVectorizer

In [36]:
from sklearn.feature_extraction.text import CountVectorizer

In [37]:
cv_vectorizer = CountVectorizer()

In [38]:
# Train Count Vectorizer
cv_vectorizer.fit(x_train_str)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [39]:
x_train_features_cv = cv_vectorizer.transform(x_train_str)
x_test_features_cv = cv_vectorizer.transform(x_test_str)

In [40]:
print(x_test_features_cv[0].toarray())
x_test_features_cv[0].shape

[[0 0 0 ... 0 0 0]]


(1, 49261)

In [41]:
type(cv_vectorizer.vocabulary_.items())
a1 = pd.DataFrame(sorted(cv_vectorizer.vocabulary_.items(), key=lambda x: x[1])[::1])

## Training Classifier

### Gaussian Naive Bayes

In [42]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

In [43]:
gnb = GaussianNB()

In [44]:
# Train using TF-IDF Vectorizer features
gnb.fit(x_train_features_tfidf.toarray(),y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [45]:
y_predict_gnb = gnb.predict(x_test_features_tfidf.toarray())

### Multinomial Naive Bayes

In [46]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

In [47]:
mnb = MultinomialNB()

In [48]:
# Train using Count Vectorizer features
mnb.fit(x_train_features_cv.toarray(),y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [49]:
y_predict_mnb = mnb.predict(x_test_features_cv.toarray())

## Evaluation Metrics

### Accuracy

In [50]:
# GaussianNB Test Accuracy
gnb_accuracy = gnb.score(x_test_features_tfidf.toarray(),y_test)

# GaussianNB Test Accuracy
mnb_accuracy = mnb.score(x_test_features_cv.toarray(),y_test)

trace = go.Bar(x = ["GaussianNB", "MultinomialNB"], y = [gnb_accuracy*100, mnb_accuracy*100],
               text=[round(gnb_accuracy*100, 2), round(mnb_accuracy*100, 2)],textposition='auto',
    marker = dict(
        color = ["red", "blue"],
    ),
)

layout = go.Layout(title = 'Test Accuracy Comparison', font=dict(size=18,),width = 700, height = 600,
                   xaxis_title="Trained Models", yaxis_title="Accuracy %",)
fig = go.Figure(data = [trace], layout = layout)

py.iplot(fig, filename="Accuracy Comparison")

### Confusion Matrix

In [51]:
def confusion_matrix_figure(title, cmf):
    fig = go.Figure(data=[go.Table(
        header=dict(values=['<b> ' + title+ ' </b>', '<b>Predicted Ham</b>', '<b>Predicted Spam</b>','<b>Total</b>'],
                    font = dict(color = 'white', size = [13, 10]),
                    align = ['center'], fill_color = ['lightslategrey'],
                    line_color='darkslategray',
                   ),
                    
        cells=dict(values=[
            ['<b>Label Ham</b>', '<b>Label Spam</b>','<b>Total</b>'],
            [cmf[0][0], cmf[1][0], (cmf[0][0] + cmf[1][0])], 
            [cmf[0][1], cmf[1][1], (cmf[0][1] + cmf[1][1])],
            [(cmf[0][0] + cmf[0][1]), (cmf[1][0] + cmf[1][1]),(cmf[0][0]+cmf[0][1]+cmf[1][0]+cmf[1][1])]],
            font = dict(color = ['white', 'black'], size = 10),
        align = ['center'], fill_color = ['lightslategrey', 'white'],
            line_color='darkslategray',

        ))])
    fig.update_layout(width=650, height=400)
    fig.show()

In [52]:
from sklearn.metrics import confusion_matrix

sample = [[' TP ', ' FN '], [' FP ', ' TN ']]
confusion_matrix_figure('Standard', sample)

# Gaussian NB Confusion Matrix
gnb_cm = confusion_matrix(y_test,y_predict_gnb)
confusion_matrix_figure('GuassianNB', gnb_cm)

# Multinomial NB Confusion Matrix
mnb_cm = confusion_matrix(y_test,y_predict_mnb)
confusion_matrix_figure('MultinomialNB', mnb_cm)

### Precision, Recall and F1-Score

In [53]:
from sklearn.metrics import f1_score, precision_score,recall_score

In [54]:
# Precision, Recall and F1-Score

gnb_precision = precision_score(y_test, y_predict_gnb)
gnb_recall = recall_score(y_test, y_predict_gnb)
gnb_f1 = f1_score(y_test, y_predict_gnb)

print("Precision: {:.2f}%".format(100 * gnb_precision))
print("Recall: {:.2f}%".format(100 * gnb_recall))
print("F1-score: {:.2f}%".format(100 * gnb_f1))

Precision: 91.56%
Recall: 81.10%
F1-score: 86.01%


In [55]:
# Precision, Recall and F1-Score
mnb_precision = precision_score(y_test, y_predict_mnb)
mnb_recall = recall_score(y_test, y_predict_mnb)
mnb_f1 = f1_score(y_test, y_predict_mnb)

print("Precision: {:.2f}%".format(100 * mnb_precision))
print("Recall: {:.2f}%".format(100 * mnb_recall))
print("F1-score: {:.2f}%".format(100 * mnb_f1))

Precision: 98.74%
Recall: 92.52%
F1-score: 95.53%


In [56]:
# Summary Table

fig = go.Figure(data=[go.Table(
    header=dict(values=['<b>Trained Models</b>','<b>Precision</b>', '<b>Recall</b>', '<b>F1-Score</b>'],
                font = dict(color = 'white', size = [10]),
                align = ['center'], fill_color = ['lightslategrey'],
                line_color='darkslategray',
               ),

    cells=dict(values=[
        ['<b>GaussianNB</b>','<b>MultinomialNB</b>'],
        [round(gnb_precision, 2), round(mnb_precision, 2)], 
        [round(gnb_recall, 2), round(mnb_recall, 2)],
        [round(gnb_f1, 2), round(mnb_f1, 2)]],
        font = dict(color = ['white', 'black'], size = 10),
    align = ['center'], fill_color = ['lightslategrey', 'white'],
        line_color='darkslategray',

    ))])
fig.update_layout(width=650, height=400)
fig.show()