# Notebook Imports

In [34]:
import numpy as np
import pandas as pd

# will allow us to generate list of words very quickly
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Constants

In [3]:
DATA_JSON_FILE = '/content/drive/MyDrive/Machine Learning /Naive Bayes Spam Filter/SpamData/01_Processing/email-text-data.json'

# Import Data

In [4]:
data = pd.read_json(DATA_JSON_FILE)

In [5]:
data.head()

Unnamed: 0,MESSAGE,CATEGORY,FILE_NAME
0,Immediate Help Needed. We are a fortune 500 c...,1,00458.62211764fde0dd7128ea4146268b40dd
1,You can save thousands of dollars over the cou...,1,00434.8507c67a652e01636df9b92a0a397193
2,"REGISTER .COM, .BIZ, AND .INFO DOMAINS FOR ONL...",1,00292.dbf78a2aaa230d288eb80ab843804252
3,This is a multi-part message in MIME format.\n...,1,00209.5276f967533f2ce0209c1eff631a86ff
4,This is a multi-part message in MIME format.\n...,1,00216.89c1ede0b81fb09f7334f47a5183410a


In [6]:
data.tail()

Unnamed: 0,MESSAGE,CATEGORY,FILE_NAME
5791,> -----Original Message-----\n\n> From: razor-...,0,00604.b79c959719f5f325067852352496e07a
5792,http://www.siliconvalley.com/mld/siliconvalley...,0,00905.defebe39d659693316e71ad1cd70b127
5793,"On Tue, Aug 06, 2002 at 02:04:11PM +0100, Nial...",0,00273.3d73db3ab6dc7c9cfc71126ae18b5b1b
5794,This comment probably goes into better late th...,0,00749.3500b619df0119e64fc177b3b6eff006
5795,"For the past 2 days, nothing that I'm reportin...",0,00580.7dd943cb2a791ae9600144dee69f27b1


In [7]:
data.sort_index(inplace=True) # sort indices in increasing order

In [8]:
data.tail()

Unnamed: 0,MESSAGE,CATEGORY,FILE_NAME
5791,> -----Original Message-----\n\n> From: razor-...,0,00604.b79c959719f5f325067852352496e07a
5792,http://www.siliconvalley.com/mld/siliconvalley...,0,00905.defebe39d659693316e71ad1cd70b127
5793,"On Tue, Aug 06, 2002 at 02:04:11PM +0100, Nial...",0,00273.3d73db3ab6dc7c9cfc71126ae18b5b1b
5794,This comment probably goes into better late th...,0,00749.3500b619df0119e64fc177b3b6eff006
5795,"For the past 2 days, nothing that I'm reportin...",0,00580.7dd943cb2a791ae9600144dee69f27b1


# Use Scikit learn to create our features sparse matrix and vocabulary

In [9]:
vectorizer = CountVectorizer(stop_words='english')

In [10]:
# recall individual words were features
all_features = vectorizer.fit_transform(data.MESSAGE)

In [11]:
all_features.shape # columns correspond to our individual words

(5796, 102694)

In [12]:
# vectorizer will generate vocabulary for us

vectorizer.vocabulary_

{'immediate': 51251,
 'help': 47200,
 'needed': 64741,
 'fortune': 42500,
 '500': 8927,
 'company': 29579,
 'growing': 45726,
 'tremendous': 88713,
 'rate': 76347,
 '1000': 1497,
 'year': 98499,
 'simply': 82183,
 'looking': 59228,
 'motivated': 63072,
 'individuals': 51723,
 'earn': 37247,
 'substantial': 84864,
 'income': 51540,
 'working': 96050,
 'home': 48006,
 'real': 76643,
 'opportunity': 67871,
 'make': 60515,
 'excellent': 39680,
 'experience': 39912,
 'required': 77713,
 'provide': 73660,
 'training': 88523,
 'need': 64740,
 'energetic': 38549,
 'self': 81125,
 'people': 70441,
 'click': 28719,
 'link': 58601,
 'complete': 29666,
 'online': 67665,
 'information': 51875,
 'request': 77707,
 'form': 42425,
 'employment': 38384,
 'specialist': 83507,
 'contact': 30198,
 'http': 48497,
 'ter': 86870,
 'netblah': 64878,
 'com': 29405,
 '8080': 12087,
 'employed': 38374,
 'career': 26784,
 'vast': 92136,
 'opportunities': 67870,
 'www': 96531,
 'zhoster': 100308,
 'homeopp': 48031

# Create our Training and Test Data

In [13]:
X = all_features # features we will use to predict - independent variables 
y = data.CATEGORY # what we are predicting - dependent variable

In [14]:
# split our data
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y,
                                                    test_size=0.3,
                                                    random_state=88)

In [15]:
X_train.shape

(4057, 102694)

In [16]:
X_test.shape

(1739, 102694)

# Train Our Model

In [17]:
classifier = MultinomialNB()

classifier.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

# Predictions of Our model

In [18]:
y_pred = classifier.predict(X_test)
y_test = np.array(y_test)
compare_results = np.concatenate(
    (y_pred.reshape(len(y_pred), 1), 
     y_test.reshape(len(y_test), 1)),
     1)

In [19]:
compare_results[:100]

array([[1, 1],
       [0, 0],
       [0, 0],
       [0, 0],
       [1, 1],
       [1, 1],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [1, 1],
       [1, 1],
       [1, 1],
       [0, 0],
       [0, 0],
       [1, 1],
       [0, 0],
       [1, 1],
       [0, 0],
       [0, 0],
       [1, 1],
       [1, 1],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [1, 1],
       [0, 0],
       [0, 0],
       [0, 0],
       [1, 1],
       [0, 0],
       [1, 1],
       [1, 1],
       [1, 1],
       [0, 0],
       [0, 0],
       [0, 0],
       [1, 1],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [1, 1],
       [0, 0],
       [1, 1],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0,

# Making our Confusion Matrix

In [20]:
conf_matrix = confusion_matrix(y_pred, y_test)

In [24]:
print(conf_matrix)

[[1179  105]
 [   6  449]]


# Additional Metrics: Accuracy, Recall, Precision, and F-Score

In [27]:
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f_score = f1_score(y_test, y_pred)

In [28]:
print("Accuracy of Bayes Classifier with scikit-learn is {:.2%}".format(accuracy))
print("Recall score of Bayes Classifier with scikit-learn is {:.2%}".format(recall))
print("Precision of Bayes Classifier with scikit-learn is {:.2%}".format(precision))
print("F-Score of Bayes Classifier with scikit-learn is {:.2}".format(f_score))

Accuracy of Bayes Classifier with scikit-learn is 93.62%
Recall score of Bayes Classifier with scikit-learn is 81.05%
Precision of Bayes Classifier with scikit-learn is 98.68%
F-Score of Bayes Classifier with scikit-learn is 0.89
