### Build a naive Bayes model on the data set for classifying the ham and spam
<li>Description: This program detects if an email is spam (1) or not (0)</li>

In [1]:
#Import libraries
import numpy as np 
import pandas as pd 
import nltk
from nltk.corpus import stopwords
import string

In [2]:
#Load the data
df = pd.read_csv('C:\\Users\\Raja\\Downloads\\assignments\\naive\\sms_raw_NB.csv', encoding="ISO-8859-1")
df.head()

Unnamed: 0,type,text
0,ham,Hope you are having a good week. Just checking in
1,ham,K..give back my thanks.
2,ham,Am also doing in cbe only. But have to pay.
3,spam,"complimentary 4 STAR Ibiza Holiday or å£10,000..."
4,spam,okmail: Dear Dave this is your final notice to...


In [3]:
# using dictionary map
type_map = {'spam': 1, 'ham': 0}
df['new_type'] = df['type'].map(type_map)
df.head()

Unnamed: 0,type,text,new_type
0,ham,Hope you are having a good week. Just checking in,0
1,ham,K..give back my thanks.,0
2,ham,Am also doing in cbe only. But have to pay.,0
3,spam,"complimentary 4 STAR Ibiza Holiday or å£10,000...",1
4,spam,okmail: Dear Dave this is your final notice to...,1


In [4]:
#Print the shape (Get the number of rows and cols)
df.shape

(5559, 3)

In [5]:
#Get the column names
df.columns

Index(['type', 'text', 'new_type'], dtype='object')

In [6]:
#Checking for duplicates and removing them
df.drop_duplicates(inplace = True)

In [7]:
df.shape

(5156, 3)

In [8]:
#Show the number of missing (NAN, NaN, na) data for each column
df.isnull().sum()

type        0
text        0
new_type    0
dtype: int64

In [9]:
#Need to download stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Raja\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
#Tokenization (a list of tokens), will be used as the analyzer
#1.Punctuations are [!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~]
#2.Stop words in natural language processing, are useless words (data).
def process_text(text):
    #1
    nopunc = [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    
    #2
    clean_words = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    
    #3
    return clean_words

In [11]:
#Show the Tokenization (a list of tokens )
df['text'].head().apply(process_text)

0                         [Hope, good, week, checking]
1                                [Kgive, back, thanks]
2                                     [also, cbe, pay]
3    [complimentary, 4, STAR, Ibiza, Holiday, å£100...
4    [okmail, Dear, Dave, final, notice, collect, 4...
Name: text, dtype: object

In [12]:
#import of CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
messages= CountVectorizer(analyzer=process_text).fit_transform(df['text'])

In [13]:
#Split data into 80% training & 20% testing data sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(messages, df['new_type'], test_size = 0.20, random_state = 0)

In [14]:
#Get the shape of messages_bow
messages.shape

(5156, 11356)

In [15]:
#Import of naive classifier and fitting on model
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [16]:
#Print the predictions
print(classifier.predict(X_train))
#Print the actual values
print(y_train.values)

[1 0 0 ... 1 0 0]
[1 0 0 ... 1 0 0]


In [17]:
#Evaluate the model on the training data set
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
pred = classifier.predict(X_train)
print(classification_report(y_train ,pred ))
print('Confusion Matrix: \n',confusion_matrix(y_train,pred))
print()
print('Accuracy: ', accuracy_score(y_train,pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3586
           1       0.99      0.98      0.98       538

    accuracy                           1.00      4124
   macro avg       0.99      0.99      0.99      4124
weighted avg       1.00      1.00      1.00      4124

Confusion Matrix: 
 [[3580    6]
 [  11  527]]

Accuracy:  0.9958777885548011


In [18]:
#Print the predictions
print('Predicted value: ',classifier.predict(X_test))
#Print Actual Label
print('Actual value: ',y_test.values)

Predicted value:  [1 0 1 ... 0 0 0]
Actual value:  [0 0 1 ... 0 0 0]


In [19]:
#Evaluate the model on the test data set
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
pred = classifier.predict(X_test)
print(classification_report(y_test ,pred ))
print('Confusion Matrix: \n', confusion_matrix(y_test,pred))
print()
print('Accuracy: ', accuracy_score(y_test,pred))

              precision    recall  f1-score   support

           0       0.99      0.97      0.98       917
           1       0.77      0.92      0.84       115

    accuracy                           0.96      1032
   macro avg       0.88      0.94      0.91      1032
weighted avg       0.97      0.96      0.96      1032

Confusion Matrix: 
 [[885  32]
 [  9 106]]

Accuracy:  0.9602713178294574


The classifier accurately identified the  messages as spam or not spam with 96 % accuracy on the test data !