In [1]:
#Importing pandas for data-preprocessing and reading the file emails.csv
import pandas as pd
dataset = pd.read_csv('emails.csv')

In [2]:
#Viewing the top 10 rows (1 for spam and 0 for not spam)
dataset.head(n=10)

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
5,"Subject: great nnews hello , welcome to medzo...",1
6,Subject: here ' s a hot play in motion homela...,1
7,Subject: save your money buy getting this thin...,1
8,Subject: undeliverable : home based business f...,1
9,Subject: save your money buy getting this thin...,1


In [3]:
#Measuring the length of columns and rows in dataset
dataset.shape

(5728, 2)

In [4]:
#Checking for any null values in dataset
from pandas import DataFrame
print(pd.DataFrame(dataset.isnull().sum()))

      0
text  0
spam  0


In [5]:
#Dropping the duplicate data in columns (If in case in data there are some same emails)
dataset.drop_duplicates(inplace = True)
dataset.shape

(5695, 2)

In [6]:
#Viewing the first 10 rows 
dataset.head(n=10)

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
5,"Subject: great nnews hello , welcome to medzo...",1
6,Subject: here ' s a hot play in motion homela...,1
7,Subject: save your money buy getting this thin...,1
8,Subject: undeliverable : home based business f...,1
9,Subject: save your money buy getting this thin...,1


In [7]:
#Removing the subject: keyword from each email (as seen from pattern in data)
dataset['text']=dataset['text'].map(lambda text: text[8:])

In [8]:
#default 5 if no parameter is passed in head function
dataset.head()

Unnamed: 0,text,spam
0,naturally irresistible your corporate identit...,1
1,the stock trading gunslinger fanny is merril...,1
2,unbelievable new homes made easy im wanting ...,1
3,4 color printing special request additional ...,1
4,"do not have money , get software cds from her...",1


In [9]:
#Removing anything exept a-z,A-Z,0-9 on the text and then converting it into lower letter and finally tokenize the text by split keyword
import re
dataset['text'] = dataset['text'].map(lambda text:re.sub('[^a-zA-Z0-9]+', ' ',text)).apply(lambda x: (x.lower()).split())

In [10]:
#After filtering the text mails
dataset.head(10)

Unnamed: 0,text,spam
0,"[naturally, irresistible, your, corporate, ide...",1
1,"[the, stock, trading, gunslinger, fanny, is, m...",1
2,"[unbelievable, new, homes, made, easy, im, wan...",1
3,"[4, color, printing, special, request, additio...",1
4,"[do, not, have, money, get, software, cds, fro...",1
5,"[great, nnews, hello, welcome, to, medzonline,...",1
6,"[here, s, a, hot, play, in, motion, homeland, ...",1
7,"[save, your, money, buy, getting, this, thing,...",1
8,"[undeliverable, home, based, business, for, gr...",1
9,"[save, your, money, buy, getting, this, thing,...",1


In [11]:
# applying stemmer and removing stopwords from the above text 
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
ps = PorterStemmer()   #approx 15 min to run
corpus=dataset['text'].apply(lambda text_list:' '.join(list(map(lambda word:ps.stem(word),(list(filter(lambda text:text not in set(stopwords.words('english')),text_list)))))))

In [12]:
#filtered statement is stored in the corpus variable
corpus.head(10)

0    natur irresist corpor ident lt realli hard rec...
1    stock trade gunsling fanni merril muzo colza a...
2    unbeliev new home made easi im want show homeo...
3    4 color print special request addit inform cli...
4    money get softwar cd softwar compat great grow...
5    great nnew hello welcom medzonlin sh groundsel...
6    hot play motion homeland secur invest terror a...
7    save money buy get thing tri ciall yet cannot ...
8    undeliver home base busi grownup messag subjec...
9    save money buy get thing tri ciall yet cannot ...
Name: text, dtype: object

In [13]:
#Creating the bag of words model of text in X variable
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(corpus.values).toarray()
y = dataset.iloc[:, 1].values

In [14]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [15]:
# Fitting Naive Bayes classifier to the Training set
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
classifier.fit(X_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [16]:
#Predicting the results and creating the confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

In [17]:
print(cm)

[[849  11]
 [  0 279]]


In [18]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.990342405618964

In [19]:
#Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
print(accuracies)

[0.98245614 0.98903509 0.99122807 0.98684211 0.99122807 0.99122807
 0.98903509 0.99340659 0.9978022  0.98898678]


In [20]:
#for checking variations in accuraries their standard deviation can be checked
accuracies.mean()
accuracies.std()

0.003829081764166351