In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import string

In [2]:
df= pd.read_csv("s_dataset.csv")


In [3]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,nonSpam,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,nonSpam,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,nonSpam,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,nonSpam,Subject: re : indian springs\r\nthis deal is t...,0


In [4]:
del df["Unnamed: 0"]

In [5]:
df.shape

(5171, 3)

In [6]:
df.columns

Index(['label', 'text', 'label_num'], dtype='object')

In [7]:
df.drop_duplicates(inplace=True)

In [8]:
df.shape

(4993, 3)

In [9]:
df.isnull().any()

label        False
text         False
label_num    False
dtype: bool

In [10]:
#Need to download stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SOFTAGE\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
#Tokenization (a list of tokens), will be used as the analyzer
#1.Punctuations are [!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~]
#2.Stop words in natural language processing, are useless words (data).
def process_text(text):
    
    #1 Remove Punctuation
    nopunc = [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    
    #2 Remove Stop Words
    clean_words = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    
    #3 Return a list of clean words
    return clean_words

In [12]:
#Show the Tokenization (a list of tokens )
df['text'].head(10).apply(process_text)

0    [Subject, enron, methanol, meter, 988291, foll...
1    [Subject, hpl, nom, january, 9, 2001, see, att...
2    [Subject, neon, retreat, ho, ho, ho, around, w...
3    [Subject, photoshop, windows, office, cheap, m...
4    [Subject, indian, springs, deal, book, teco, p...
5    [Subject, ehronline, web, address, change, mes...
6    [Subject, spring, savings, certificate, take, ...
7    [Subject, looking, medication, best, source, d...
8    [Subject, noms, actual, flow, 2, 26, agree, fo...
9    [Subject, nominations, oct, 21, 23, 2000, see,...
Name: text, dtype: object

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
messages_bow = CountVectorizer(analyzer=process_text).fit_transform(df['text'])

In [14]:
#Split data into 80% training & 20% testing data sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(messages_bow, df['label'], test_size = 0.20, random_state = 0)

In [15]:
#Get the shape of messages_bow
messages_bow.shape

(4993, 50381)

In [16]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [17]:
#Print the predictions
print(classifier.predict(X_train))
#Print the actual values
print(y_train.values)

['spam' 'nonSpam' 'spam' ... 'nonSpam' 'nonSpam' 'spam']
['spam' 'nonSpam' 'spam' ... 'nonSpam' 'nonSpam' 'spam']


In [18]:
#Evaluate the model on the training data set
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
pred = classifier.predict(X_train)
print(classification_report(y_train ,pred ))
print('Confusion Matrix: \n',confusion_matrix(y_train,pred))
print()
print('Accuracy: ', accuracy_score(y_train,pred))


              precision    recall  f1-score   support

     nonSpam       1.00      0.99      0.99      2809
        spam       0.98      0.99      0.99      1185

    accuracy                           0.99      3994
   macro avg       0.99      0.99      0.99      3994
weighted avg       0.99      0.99      0.99      3994

Confusion Matrix: 
 [[2787   22]
 [  13 1172]]

Accuracy:  0.9912368552829244


In [19]:
#Print the predictions
print('Predicted value: ',classifier.predict(X_test))
#Print Actual Label
print('Actual value: ',y_test.values)

Predicted value:  ['nonSpam' 'nonSpam' 'nonSpam' 'nonSpam' 'nonSpam' 'nonSpam' 'spam'
 'nonSpam' 'nonSpam' 'nonSpam' 'nonSpam' 'nonSpam' 'spam' 'nonSpam'
 'nonSpam' 'nonSpam' 'nonSpam' 'spam' 'nonSpam' 'nonSpam' 'spam' 'nonSpam'
 'nonSpam' 'spam' 'spam' 'nonSpam' 'nonSpam' 'nonSpam' 'nonSpam' 'spam'
 'nonSpam' 'spam' 'spam' 'spam' 'nonSpam' 'spam' 'spam' 'nonSpam'
 'nonSpam' 'nonSpam' 'nonSpam' 'spam' 'nonSpam' 'spam' 'nonSpam' 'spam'
 'nonSpam' 'nonSpam' 'nonSpam' 'spam' 'spam' 'nonSpam' 'nonSpam' 'nonSpam'
 'nonSpam' 'nonSpam' 'spam' 'nonSpam' 'nonSpam' 'spam' 'nonSpam' 'nonSpam'
 'spam' 'nonSpam' 'nonSpam' 'nonSpam' 'spam' 'spam' 'nonSpam' 'nonSpam'
 'spam' 'spam' 'spam' 'nonSpam' 'nonSpam' 'nonSpam' 'nonSpam' 'nonSpam'
 'nonSpam' 'spam' 'nonSpam' 'nonSpam' 'nonSpam' 'nonSpam' 'spam' 'nonSpam'
 'nonSpam' 'nonSpam' 'spam' 'nonSpam' 'nonSpam' 'nonSpam' 'nonSpam'
 'nonSpam' 'spam' 'nonSpam' 'nonSpam' 'spam' 'nonSpam' 'nonSpam' 'nonSpam'
 'spam' 'spam' 'nonSpam' 'nonSpam' 'nonSpam' 'non

In [20]:
#Evaluate the model on the test data set
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
pred = classifier.predict(X_test)
print(classification_report(y_test ,pred ))
print('Confusion Matrix: \n', confusion_matrix(y_test,pred))
print()
print('Accuracy: ', accuracy_score(y_test,pred))

              precision    recall  f1-score   support

     nonSpam       0.98      0.98      0.98       722
        spam       0.95      0.96      0.96       277

    accuracy                           0.98       999
   macro avg       0.97      0.97      0.97       999
weighted avg       0.98      0.98      0.98       999

Confusion Matrix: 
 [[709  13]
 [ 11 266]]

Accuracy:  0.975975975975976
