## Spam flitering using Naive Bayes classifier in order to predict whether a new mail is based on its content, can be categorized as spam or not.

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
import string
import matplotlib.pyplot as plt

In [14]:
# load the dataset

data = pd.read_csv("spam.tsv",sep='\t',names=['Class','Message'])
data.head()

Unnamed: 0,Class,Message
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!!


In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5567 entries, 0 to 5566
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Class    5567 non-null   object
 1   Message  5567 non-null   object
dtypes: object(2)
memory usage: 87.1+ KB


In [16]:
data['Length'] = data['Message'].apply(len)
data

Unnamed: 0,Class,Message,Length
0,ham,I've been searching for the right words to tha...,196
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
2,ham,"Nah I don't think he goes to usf, he lives aro...",61
3,ham,Even my brother is not like to speak with me. ...,77
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!!,36
...,...,...,...
5562,spam,This is the 2nd time we have tried 2 contact u...,160
5563,ham,Will ü b going to esplanade fr home?,36
5564,ham,"Pity, * was in mood for that. So...any other s...",57
5565,ham,The guy did some bitching but I acted like i'd...,125


In [17]:
data.describe()

Unnamed: 0,Length
count,5567.0
mean,80.450153
std,59.891023
min,2.0
25%,36.0
50%,62.0
75%,122.0
max,910.0


In [18]:
data['Class'].value_counts()

ham     4821
spam     746
Name: Class, dtype: int64

### Text preprocessing

In [19]:
# assign ham as 1
data.loc[data['Class']=='ham','Class'] = 1

In [20]:
# assign spam as 0
data.loc[data['Class']=='spam','Class'] = 0

In [21]:
data.head(6)

Unnamed: 0,Class,Message,Length
0,1,I've been searching for the right words to tha...,196
1,0,Free entry in 2 a wkly comp to win FA Cup fina...,155
2,1,"Nah I don't think he goes to usf, he lives aro...",61
3,1,Even my brother is not like to speak with me. ...,77
4,1,I HAVE A DATE ON SUNDAY WITH WILL!!!,36
5,1,As per your request 'Melle Melle (Oru Minnamin...,160


### Remove punctuation. we can use python's built in string library to get a quick list of all possible punctuations

In [22]:
# why is it important to remove punctuation?
"This message is spam" == "This message is spam."

False

In [23]:
## get the default list of the punctuations in python

import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [24]:
## creating a function to remove the punctiation

def remove_punc(text):
    text = "".join([char for char in text if char not in string.punctuation])
    return text

In [28]:
## example
s = "data// science!!!"
remove_punc(s)

'data science'

In [26]:
text = []
for i in data['Message']:
    t = remove_punc(i)
    text.append(t)

In [27]:
# creating a new column for cleaned text

data['Clean_text'] = text
data

Unnamed: 0,Class,Message,Length,Clean_text
0,1,I've been searching for the right words to tha...,196,Ive been searching for the right words to than...
1,0,Free entry in 2 a wkly comp to win FA Cup fina...,155,Free entry in 2 a wkly comp to win FA Cup fina...
2,1,"Nah I don't think he goes to usf, he lives aro...",61,Nah I dont think he goes to usf he lives aroun...
3,1,Even my brother is not like to speak with me. ...,77,Even my brother is not like to speak with me T...
4,1,I HAVE A DATE ON SUNDAY WITH WILL!!!,36,I HAVE A DATE ON SUNDAY WITH WILL
...,...,...,...,...
5562,0,This is the 2nd time we have tried 2 contact u...,160,This is the 2nd time we have tried 2 contact u...
5563,1,Will ü b going to esplanade fr home?,36,Will ü b going to esplanade fr home
5564,1,"Pity, * was in mood for that. So...any other s...",57,Pity was in mood for that Soany other suggest...
5565,1,The guy did some bitching but I acted like i'd...,125,The guy did some bitching but I acted like id ...


In [29]:
## splitting x and y
x = data['Clean_text'].values
y = data['Class'].values


In [30]:
y

array([1, 0, 1, ..., 1, 1, 1], dtype=object)

In [32]:
## datatype for y is an object
y = y.astype('int')
y

array([1, 0, 1, ..., 1, 1, 1])

#### splitting train and test data

In [34]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=10)

In [35]:
x_train.shape

(4453,)

In [36]:
x_test.shape

(1114,)

### Bag of words

In [37]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

CV = CountVectorizer(stop_words="english")

In [38]:
x_train_CV = CV.fit_transform(x_train)

In [39]:
import warnings
warnings.filterwarnings('ignore')
CV.get_feature_names()

['008704050406',
 '0089my',
 '0121',
 '01223585334',
 '0125698789',
 '02',
 '020603',
 '0207',
 '02070836089',
 '02072069400',
 '02073162414',
 '02085076972',
 '020903',
 '021',
 '050703',
 '0578',
 '060505',
 '061104',
 '07008009200',
 '07046744435',
 '07090298926',
 '07099833605',
 '071104',
 '07123456789',
 '0721072',
 '07732584351',
 '07734396839',
 '07742676969',
 '07753741225',
 '0776xxxxxxx',
 '07786200117',
 '078',
 '07808',
 '07808247860',
 '07808726822',
 '07815296484',
 '07821230901',
 '0784987',
 '0789xxxxxxx',
 '0794674629107880867867',
 '0796xxxxxx',
 '07973788240',
 '07xxxxxxxxx',
 '0800',
 '08000407165',
 '08000776320',
 '08000839402',
 '08000930705',
 '08000938767',
 '08001950382',
 '08002888812',
 '08002986030',
 '08002986906',
 '08002988890',
 '08006344447',
 '0808',
 '08081263000',
 '08081560665',
 '0825',
 '0844',
 '08448350055',
 '08448714184',
 '0845',
 '08452810071',
 '08452810073',
 '0870',
 '08700621170150p',
 '08701213186',
 '08701237397',
 '08701417012',
 '0

In [40]:
### Training

NB = MultinomialNB()

In [41]:
## feed data to the model
NB.fit(x_train_CV,y_train)


MultinomialNB()

In [43]:
x_test_CV = CV.transform(x_test)

In [44]:
y_predict = NB.predict(x_test_CV)
y_predict

array([1, 1, 1, ..., 1, 1, 1])

In [45]:
print(classification_report(y_test,y_predict))

              precision    recall  f1-score   support

           0       0.96      0.92      0.94       168
           1       0.99      0.99      0.99       946

    accuracy                           0.98      1114
   macro avg       0.97      0.96      0.97      1114
weighted avg       0.98      0.98      0.98      1114



In [46]:
pd.crosstab(y_test,y_predict)

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,155,13
1,6,940


In [48]:
## using bernoulli 

bnb = BernoulliNB()
bnb.fit(x_train_CV,y_train)

# Getting prediction
y_pred1=bnb.predict(x_test_CV)

# confusion matrix
pd.crosstab(y_test,y_pred1)

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,130,38
1,0,946


### using TF-IDF Technique

In [49]:
# splitting x and y

x = data['Clean_text'].values
y = data['Class'].values

In [50]:
y = y.astype('int')
y

array([1, 0, 1, ..., 1, 1, 1])

In [52]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=6)

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer()

In [54]:
# to convert training data - fit_transform()
# to convert testing data - transform()
# converts text to numerical

x_train_cv = tf.fit_transform(x_train)
x_test_cv = tf.transform(x_test)

In [55]:
nb = MultinomialNB()
nb.fit(x_train_cv,y_train)

MultinomialNB()

In [56]:
y_hat = nb.predict(x_test_cv)

In [57]:
print(classification_report(y_test,y_hat))

              precision    recall  f1-score   support

           0       1.00      0.68      0.81       186
           1       0.95      1.00      0.98      1206

    accuracy                           0.96      1392
   macro avg       0.98      0.84      0.89      1392
weighted avg       0.96      0.96      0.95      1392



In [58]:
pd.crosstab(y_test,y_hat)

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,127,59
1,0,1206


In [59]:
## model object creation
nb = BernoulliNB()
# fitting the model
nb.fit(x_train_cv,y_train)

# getting prediction
y_hat = nb.predict(x_test_cv)


In [60]:
y_hat

array([1, 1, 1, ..., 1, 1, 1])

In [61]:
## Evaluating the model

from sklearn.metrics import classification_report,confusion_matrix

In [62]:
print(classification_report(y_test,y_hat))

              precision    recall  f1-score   support

           0       1.00      0.82      0.90       186
           1       0.97      1.00      0.99      1206

    accuracy                           0.98      1392
   macro avg       0.99      0.91      0.94      1392
weighted avg       0.98      0.98      0.98      1392



In [63]:
pd.crosstab(y_test,y_hat)

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,153,33
1,0,1206
