# **Email Spam Detector Model**

In [1]:
#importing required libraries
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string

In [7]:
#uploading the dataset through Colab's file picker, only when it is missing
import os

if os.path.exists('spamham.csv'):
    #the file is already next to the notebook, so re-running the notebook does
    #not prompt for another upload; `uploaded` is still defined for later use
    uploaded = {}
else:
    #google.colab is imported here because it is only needed for the upload
    from google.colab import files
    uploaded = files.upload()

Saving spamham.csv to spamham.csv


In [8]:
#read the uploaded CSV file into a pandas DataFrame
df = pd.read_csv('spamham.csv')

In [9]:
#previewing the first 5 rows to see what each column contains
df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [10]:
#printing the total number of rows and columns as a (rows, columns) tuple
df.shape

(5728, 2)

In [11]:
#listing the column names available in the DataFrame
df.columns

Index(['text', 'spam'], dtype='object')

In [13]:
#removing duplicated rows; the result is reassigned rather than mutated with
#inplace=True, so the cell is an explicit df -> df step
df = df.drop_duplicates()

In [14]:
#rechecking the shape of the data to verify the removal of duplicate data
#(a smaller row count than before means duplicates were dropped)
df.shape

(5695, 2)

In [15]:
#counting the null (missing) values in each column, if any
df.isnull().sum()

text    0
spam    0
dtype: int64

# **Processing The Data**

In [16]:
#downloading the NLTK stopwords corpus that stopwords.words('english') reads
#(the import itself already happened in the first cell)
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [18]:
#Now we are processing the data
def process_text(text, stop_words=None):
    """Strip punctuation from ``text`` and drop stopwords.

    Parameters
    ----------
    text : str
        Raw message text.
    stop_words : iterable of str, optional
        Lowercase words to discard. When omitted, NLTK's English stopword
        list is used. It is loaded on the first call and cached on the
        function, instead of evaluating ``stopwords.words('english')``
        again for every single word.

    Returns
    -------
    list of str
        The whitespace-separated tokens of ``text`` with punctuation removed,
        in their original order and casing, excluding every token whose
        lowercase form is a stopword.
    """
    if stop_words is None:
        stop_words = getattr(process_text, '_english_stopwords', None)
        if stop_words is None:
            #first call: build the lookup set once and remember it
            stop_words = frozenset(stopwords.words('english'))
            process_text._english_stopwords = stop_words
    else:
        #a set gives constant-time membership tests for any iterable passed in
        stop_words = frozenset(stop_words)

    #1: Remove punctuation (every character listed in string.punctuation)
    nopunch = text.translate(str.maketrans('', '', string.punctuation))

    #2: Remove the stopwords (compared case-insensitively via word.lower())
    clean_words = [word for word in nopunch.split() if word.lower() not in stop_words]

    #3: Return the list with clean words
    return clean_words

In [19]:
#previewing process_text on the first 5 emails: each text becomes a list of
#tokens with punctuation and stopwords removed
df['text'].head().apply(process_text)

0    [Subject, naturally, irresistible, corporate, ...
1    [Subject, stock, trading, gunslinger, fanny, m...
2    [Subject, unbelievable, new, homes, made, easy...
3    [Subject, 4, color, printing, special, request...
4    [Subject, money, get, software, cds, software,...
Name: text, dtype: object

In [20]:
#converting the data into a matrix of token counts (bag of words)
from sklearn.feature_extraction.text import CountVectorizer
#the fitted vectorizer is kept because it holds the learned vocabulary: new
#emails must go through vectorizer.transform(...) before classifier.predict
vectorizer=CountVectorizer(analyzer=process_text)
message_bow=vectorizer.fit_transform(df['text'])
#NOTE(review): the vocabulary is learned from every row, including the rows
#that later form the test set; fit on the training texts only for a stricter
#evaluation

In [21]:
#holding out 20% of the rows for testing; the other 80% are used for training
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    message_bow,
    df['spam'],
    test_size=0.20,
    random_state=0,  #fixed seed so every run produces the same split
)

In [22]:
#displaying the shape of the bag-of-words matrix (not of df):
#one row per email, one column per distinct token
message_bow.shape

(5695, 37229)

In [24]:
#creating and training the Naive Bayes Multinomial classifier on the training split
from sklearn.naive_bayes import MultinomialNB
#fit() is chained onto the constructor and its result is what `classifier` holds;
#the later cells call classifier.predict on it
classifier=MultinomialNB().fit(x_train,y_train)

In [26]:
#printing the predictions for the training data on the first line and the
#actual labels on the second line
print(classifier.predict(x_train), y_train.values, sep='\n')

[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]


# **Testing The Accuracy**

In [28]:
#evaluating the model on the training data: per-class report, confusion
#matrix and overall accuracy, each separated by a blank line
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

pred = classifier.predict(x_train)
print(classification_report(y_train, pred), end='\n\n')
print('confusion Matrix:\n', confusion_matrix(y_train, pred), end='\n\n')
print('Accuracy  :', accuracy_score(y_train, pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3457
           1       0.99      1.00      0.99      1099

    accuracy                           1.00      4556
   macro avg       0.99      1.00      1.00      4556
weighted avg       1.00      1.00      1.00      4556


confusion Matrix:
 [[3445   12]
 [   1 1098]]

Accuracy  : 0.9971466198419666


In [30]:
#the model reaches about 99.7% accuracy on the training data

In [31]:
#printing the predictions for the testing data on the first line and the
#actual labels on the second line
print(classifier.predict(x_test), y_test.values, sep='\n')

[1 0 0 ... 0 0 0]
[1 0 0 ... 0 0 0]


In [32]:
#evaluating the model on the held-out testing data: per-class report,
#confusion matrix and overall accuracy, each separated by a blank line
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

pred = classifier.predict(x_test)
print(classification_report(y_test, pred), end='\n\n')
print('confusion Matrix:\n', confusion_matrix(y_test, pred), end='\n\n')
print('Accuracy  :', accuracy_score(y_test, pred))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99       870
           1       0.97      1.00      0.98       269

    accuracy                           0.99      1139
   macro avg       0.98      0.99      0.99      1139
weighted avg       0.99      0.99      0.99      1139


confusion Matrix:
 [[862   8]
 [  1 268]]

Accuracy  : 0.9920983318700615


In [33]:
#The model reaches about 99.2% accuracy on the testing data