# Naive Bayes Spam detection

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Folder that holds spam.csv. Set the SPAM_DATA_DIR environment variable to run
# this notebook on another machine; otherwise the original local path is used.
DATA_DIR = os.environ.get('SPAM_DATA_DIR', r'F:\MODULE 3\py-master\py-master\ML\14_naive_bayes')
# Later cells open 'spam.csv' by its relative name, so make DATA_DIR the working directory.
os.chdir(DATA_DIR)

In [3]:
# Show what is available in the current working directory.
os.listdir(os.curdir)

['14_naive_bayes_1_titanic_survival_prediction.ipynb',
 '14_naive_bayes_2_email_spam_filter.ipynb',
 'Exercise',
 'exercise.md',
 'spam.csv',
 'titanic.csv']

In [4]:
# Read the labelled messages from the working directory and display the frame.
df = pd.read_csv('spam.csv')
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [5]:
# "ham" means the message is not spam

In [6]:
# Summary statistics of the remaining columns, computed separately per Category.
df.groupby(by='Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [7]:
# The model only understands numbers, so we need to convert the textual data into numbers
# The Category column can easily be converted, for example with a lambda function
# that checks every value: if it is 'spam' it assigns 1, otherwise 0

In [8]:
# Encode the label as a number: 1 for 'spam', 0 for everything else ('ham').
# The comparison is evaluated for the whole column at once and gives the same
# result as checking each value with `lambda x: 1 if x=='spam' else 0`;
# 'int64' matches the integer dtype that the per-value version produced.
df['spam'] = (df['Category'] == 'spam').astype('int64')
df

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will ü b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


In [9]:
# The label now lives in the numeric 'spam' column, so the text Category column
# is no longer needed. Rebinding df (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: a second run finds no
# Category column and does nothing instead of raising KeyError.
df = df.drop(columns='Category', errors='ignore')

In [10]:
# Quick check of the first five rows after dropping Category.
df.head(n=5)

Unnamed: 0,Message,spam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [11]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the messages for testing. random_state pins the shuffle so
# that a fresh "Restart & Run All" reproduces the same split, and therefore
# the same scores in the cells below.
x_train, x_test, y_train, y_test = train_test_split(
    df.Message, df.spam, test_size=0.25, random_state=42
)

In [12]:
# The Message column also has to be converted into numbers
# For this we will use the count vectorization technique
# It works by finding the unique words across the documents and counting how often each one occurs


In [13]:
# Inspect the training messages (still raw text at this point; not yet vectorized).
x_train

2867                        Smith waste da.i wanna gayle.
5374    Do u konw waht is rael FRIENDSHIP Im gving yuo...
654                            Fine i miss you very much.
1442                           Ya:)going for restaurant..
3085    Ok lor. I ned 2 go toa payoh 4 a while 2 retur...
                              ...                        
2969               Mostly sports type..lyk footbl,crckt..
4411                 You also didnt get na hi hi hi hi hi
822                               On the road so cant txt
3738    Plz note: if anyone calling from a mobile Co. ...
482     Yo carlos, a few friends are already asking me...
Name: Message, Length: 4179, dtype: object

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

# Build the vocabulary from the training messages and, in the same step,
# turn those messages into a matrix of per-word counts.
v = CountVectorizer()
x_train_count = v.fit_transform(x_train.values)

In [15]:
# This converts each message into a row of word counts, with one column per unique word
# The dots (...) in the output stand for the columns that are left out of the display

In [16]:
# Peek at the first three rows of the count matrix. Slicing the sparse matrix
# first means only those three rows are expanded into a dense array, instead
# of densifying the whole training matrix just to display a few rows.
x_train_count[:3].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [17]:
from sklearn.naive_bayes import MultinomialNB

# fit() returns the estimator itself, so create and train in one expression,
# then echo the fitted model as the cell output.
model = MultinomialNB().fit(x_train_count, y_train)
model

MultinomialNB()

In [18]:
# Here the test data also has to be converted into numbers

In [19]:
# Vectorize the test messages with the vectorizer that was fitted on the
# training data (transform only; the vocabulary is not re-learned here).
emails = v.transform(x_test)

In [20]:
# Peek at the first three vectorized test messages. Slice the sparse matrix
# before converting so that only those three rows are made dense, not the
# whole test matrix.
emails[:3].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [21]:
# Here we are predicting the output

In [22]:
# Predicted label for every vectorized test message (1 = spam, 0 = not spam).
y_pred = model.predict(emails)
y_pred

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

In [23]:
# Here we calculate the score by passing the vectorized x_test data and the y_test labels
# Since the model has already learned from the training data, it can now be scored on the test data

In [24]:
# Score the fitted classifier on the held-out, vectorized test messages.
model.score(emails, y_test)

0.9877961234745154

In [25]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the test split.
print(classification_report(y_true=y_test, y_pred=y_pred))


              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1191
           1       0.98      0.94      0.96       202

    accuracy                           0.99      1393
   macro avg       0.98      0.97      0.97      1393
weighted avg       0.99      0.99      0.99      1393



# Pipeline

In [None]:
# We will use sklearn CountVectorizer to convert email text into a matrix of numbers and then use sklearn MultinomialNB classifier to train our model.
# The model score with this approach comes out to be very high (around 98%).
# Sklearn's Pipeline allows us to handle preprocessing transformations easily with its convenient API.

In [None]:
# The pipeline takes the CountVectorizer and the model that is built on top of it

In [30]:
from sklearn.pipeline import Pipeline

# Chain the two stages so raw text goes in and predictions come out:
# first count-vectorize the messages, then classify with multinomial naive Bayes.
clf = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB()),
])

In [31]:
# Fit both pipeline steps directly on the raw training messages.
clf.fit(x_train, y_train)

Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])

In [33]:
# The pipeline vectorizes the raw test messages itself before predicting.
y_pred = clf.predict(x_test)

In [36]:
# Score the whole pipeline on the raw test messages.
clf.score(x_test, y_test)

0.9877961234745154

In [35]:
# Per-class precision, recall and F1 for the pipeline's predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1191
           1       0.98      0.94      0.96       202

    accuracy                           0.99      1393
   macro avg       0.98      0.97      0.97      1393
weighted avg       0.99      0.99      0.99      1393

