### Description: This notebook classifies emails as spam or not spam using bag-of-words features and a Multinomial Naive Bayes classifier

In [1]:
#Import libraries
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string

In [2]:
# Load the labelled email dataset and preview its first rows
# NOTE(review): this is an absolute local path - adjust it to wherever emails.csv lives
DATA_PATH = 'D:/Data Science Course/dataset/emails.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [3]:
# Report the dataset dimensions followed by its column names
for summary in (df.shape, df.columns):
    print(summary)

(5728, 2)
Index(['text', 'spam'], dtype='object')


In [4]:
# Drop exact duplicate rows, then show the remaining (rows, columns)
df = df.drop_duplicates()
df.shape

(5695, 2)

In [5]:
# Count the missing values in every column
df.isna().sum()

text    0
spam    0
dtype: int64

In [6]:
# Download the NLTK stopwords corpus, which process_text reads via
# stopwords.words('english'); nltk reports "already up-to-date" when it is present
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ZhenXiang\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
def process_text(text, stop_words=None):
    """Tokenise a message into words with punctuation and stopwords removed.

    Parameters
    ----------
    text : str
        Raw message text.
    stop_words : collection of str, optional
        Lower-case words to discard. When omitted, NLTK's English stopword
        list is used.

    Returns
    -------
    list of str
        The remaining whitespace-separated words, in their original order
        and with their original casing.
    """
    if stop_words is None:
        # Build the lookup set once per call. Calling
        # stopwords.words('english') inside the comprehension re-evaluated
        # the whole list for every single word of every email.
        stop_words = set(stopwords.words('english'))

    #1 Remove punctuation
    nopunc = ''.join(char for char in text if char not in string.punctuation)

    #2 Remove stopwords (the comparison is case-insensitive)
    clean_words = [word for word in nopunc.split() if word.lower() not in stop_words]

    #3 Return the list of clean words
    return clean_words
    

In [8]:
# Preview the tokens produced by process_text for the first few emails
sample_emails = df['text'].head()
sample_emails.apply(process_text)

0    [Subject, naturally, irresistible, corporate, ...
1    [Subject, stock, trading, gunslinger, fanny, m...
2    [Subject, unbelievable, new, homes, made, easy...
3    [Subject, 4, color, printing, special, request...
4    [Subject, money, get, software, cds, software,...
Name: text, dtype: object

In [60]:
# Example: bag-of-words counts for two tiny documents
from sklearn.feature_extraction.text import CountVectorizer
message4 = 'hello world hello hello world play'
message5 = 'test test test test test'

# Each document is passed as a plain string. Wrapping each one in its own
# list only worked by accident: process_text then iterated over the list
# instead of over the characters of the message.
bo4 = CountVectorizer(analyzer=process_text).fit_transform([message4, message5])
print(bo4)

  (0, 0)	3
  (0, 3)	2
  (0, 1)	1
  (1, 2)	5


In [74]:
# Learn the vocabulary from every email and build the bag-of-words matrix
# (one row per email, one column per token returned by process_text).
# NOTE(review): the vocabulary is fitted on all emails, including those that
# later land in the test split - consider fitting on the training split only.
cv = CountVectorizer(analyzer = process_text)
cv_fit = cv.fit_transform(df['text'])

In [11]:
# Hold out 20% of the emails for testing; the fixed seed makes the split repeatable
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    cv_fit,
    df['spam'],
    test_size=0.20,
    random_state=0,
)


In [76]:
# Get the shape of the bag-of-words matrix cv_fit: (number of emails, vocabulary size)
cv_fit.shape

(5695, 37229)

In [13]:
# Build a Multinomial Naive Bayes model and fit it on the training split
from sklearn.naive_bayes import MultinomialNB
nb_model = MultinomialNB()
classifier = nb_model.fit(X_train, y_train)

In [14]:
# Show the model's predictions on the training split, then the true labels
train_predictions = classifier.predict(X_train)
print(train_predictions)
print(y_train.values)

[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]


In [66]:
# NOTE(review): this repeats the training-set prediction printout from the cell above
train_predictions = classifier.predict(X_train)
print(train_predictions)

[0 0 0 ... 0 0 0]


In [15]:
# Evaluate the model on the training split (the data it was fitted on)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
pred = classifier.predict(X_train)

train_report = classification_report(y_train, pred)
train_matrix = confusion_matrix(y_train, pred)
train_accuracy = accuracy_score(y_train, pred)

# Each section is followed by one blank line, exactly as before
print(train_report, end='\n\n')
print('Confusion Matrix:', train_matrix, end='\n\n')
print('Accuracy:', train_accuracy)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3457
           1       0.99      1.00      0.99      1099

    accuracy                           1.00      4556
   macro avg       0.99      1.00      1.00      4556
weighted avg       1.00      1.00      1.00      4556


Confusion Matrix: [[3445   12]
 [   1 1098]]

Accuracy: 0.9971466198419666


In [16]:
# Show the model's predictions on the held-out test split, then the true labels
test_predictions = classifier.predict(X_test)
print(test_predictions)
print(y_test.values)

[1 0 0 ... 0 0 0]
[1 0 0 ... 0 0 0]


In [17]:
# Evaluate the model on the held-out test split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
pred = classifier.predict(X_test)

test_report = classification_report(y_test, pred)
test_matrix = confusion_matrix(y_test, pred)
test_accuracy = accuracy_score(y_test, pred)

# Each section is followed by one blank line, exactly as before
print(test_report, end='\n\n')
print('Confusion Matrix:', test_matrix, end='\n\n')
print('Accuracy:', test_accuracy)

              precision    recall  f1-score   support

           0       1.00      0.99      0.99       870
           1       0.97      1.00      0.98       269

    accuracy                           0.99      1139
   macro avg       0.98      0.99      0.99      1139
weighted avg       0.99      0.99      0.99      1139


Confusion Matrix: [[862   8]
 [  1 268]]

Accuracy: 0.9920983318700615


In [80]:
# Classify two hand-written messages with the already fitted vectorizer (cv)
# and model (classifier); each message is wrapped in a one-element list
examples1 = ['Good Job, I think you are very good at your job! Keep at it']
examples2 = ['Viargra!, free! Come come!']

examples1Input = cv.transform(examples1)
predict1 = classifier.predict(examples1Input)

examples2Input = cv.transform(examples2)
predict2 = classifier.predict(examples2Input)

for prediction in (predict1, predict2):
    print(prediction)

[0]
[1]


In [83]:
# Read one message from the user, vectorise it with the fitted vectorizer
# and print the predicted label from the `spam` column (1 appears on the
# spam-looking rows in the data preview)
msg = input("Enter Message: ")
msgInput = cv.transform([msg])
predict_msg = classifier.predict(msgInput)
print(predict_msg)

Enter Message: How are you? Hope you are well!
[0]


In [84]:
# Interactive check with another user-typed message: vectorise it with the
# fitted vectorizer and print the classifier's predicted label
msg = input("Enter Message: ")
msgInput = cv.transform([msg])
predict_msg = classifier.predict(msgInput)
print(predict_msg)


Enter Message: Free Stuff online! Come buy!!!! 
[1]


In [85]:
# Interactive check with a further user-typed message: vectorise it with the
# fitted vectorizer and print the classifier's predicted label
msg = input("Enter Message: ")
msgInput = cv.transform([msg])
predict_msg = classifier.predict(msgInput)
print(predict_msg)

Enter Message: I hate you
[0]
