# Import Necessary Libraries 

In [119]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import pickle

from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# EDA

In [120]:
# Location of the SMS spam dataset. A raw string keeps the Windows backslashes
# literal: '\m', '\s' and '\d' are not valid escape sequences, so a normal
# string only works by accident and triggers a warning on recent Python versions.
# NOTE(review): this is a machine-specific absolute path — point it at your own
# copy of spam.csv before running.
DATA_PATH = r'D:\mlops\sajat\data\spam.csv'

df = pd.read_csv(DATA_PATH)
df.head(20)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [121]:
# Show column names, non-null counts and dtypes to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [122]:
# Summary statistics; for these object columns this reports count, unique, top and freq.
df.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [123]:
# Count how often each distinct (Category, Message) row occurs, to surface duplicated messages.
df.value_counts()

Category  Message                                                                                                                                                                     
ham       Sorry, I'll call later                                                                                                                                                          30
          I cant pick the phone right now. Pls send a message                                                                                                                             12
          Ok...                                                                                                                                                                           10
          7 wonders in My WORLD 7th You 6th Ur style 5th Ur smile 4th Ur Personality 3rd Ur Nature 2nd Ur SMS and 1st "Ur Lovely Friendship"... good morning dear                          4
          Okie                                               

In [124]:
# Most frequently repeated messages among the rows labelled spam.
df.loc[df['Category'].eq('spam')].value_counts()

Category  Message                                                                                                                                                                     
spam      Please call our customer service representative on FREEPHONE 0808 145 4742 between 9am-11pm as you have WON a guaranteed £1000 cash or £5000 prize!                             4
          I don't know u and u don't know me. Send CHAT to 86688 now and let's find each other! Only 150p/Msg rcvd. HG/Suite342/2Lands/Row/W1J6HL LDN. 18 years or over.                  3
          Camera - You are awarded a SiPix Digital Camera! call 09061221066 fromm landline. Delivery within 28 days.                                                                      3
          Congrats! 1 year special cinema pass for 2 is yours. call 09061209465 now! C Suprman V, Matrix3, StarWars3, etc all 4 FREE! bx420-ip4-5we. 150pm. Dont miss out!                3
          December only! Had your mobile 11mths+? You are entitle

In [125]:
# Most frequently repeated messages among the rows labelled ham.
df.loc[df['Category'].eq('ham')].value_counts()

Category  Message                                                                                                                                                
ham       Sorry, I'll call later                                                                                                                                     30
          I cant pick the phone right now. Pls send a message                                                                                                        12
          Ok...                                                                                                                                                      10
          7 wonders in My WORLD 7th You 6th Ur style 5th Ur smile 4th Ur Personality 3rd Ur Nature 2nd Ur SMS and 1st "Ur Lovely Friendship"... good morning dear     4
          Okie                                                                                                                                                        

**After analysing the data, we need to:**
1. Replace the `Category` column with numeric labels (spam = 1, ham = 0).
2. Use `CountVectorizer` to represent each message as a vector of word counts.

# Data Preprocessing

In [126]:
# Encode the label as a binary target column: 1 for spam, 0 for anything else.
df['spam'] = df['Category'].eq('spam').astype('int64')
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [127]:
# Drop the text label now that the numeric 'spam' column carries the same information.
# Reassigning instead of using inplace=True, and ignoring an already-missing
# column, keeps this cell safe to re-run without a KeyError.
df = df.drop(columns='Category', errors='ignore')

## Train Test Split

In [128]:
# Features are the raw message texts; the target is the binary spam flag.
x, y = df['Message'], df['spam']

In [129]:
# Hold out 20% of the messages for evaluation.
# random_state pins the shuffle so the split, and therefore the reported scores,
# are reproducible across runs; stratify=y keeps the spam/ham ratio the same in
# both splits, which matters because spam is the minority class.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

# Building The Model

In [130]:
# Two-stage pipeline: turn raw text into token counts, then classify the
# count vectors with multinomial Naive Bayes.
clf = Pipeline(
    steps=[
        ('vectorizer', CountVectorizer()),
        ('nb', MultinomialNB()),
    ]
)

In [131]:
# Fit the vectorizer and the classifier on the training split only.
clf.fit(x_train,y_train)

## The Model Accuracy

In [132]:
# Accuracy on the data the model was trained on (an optimistic upper bound).
clf.score(x_train,y_train)

0.9932690150325331

In [133]:
# Accuracy on the held-out split; compare with the training score to gauge overfitting.
clf.score(x_test,y_test)

0.9856502242152466

In [134]:
# Two hand-written messages for a quick sanity check: an ordinary invitation
# followed by a prize-style promotion.
testing_emails = [
    'hi, wanna hangout at 10? i heard there is a good chinese restaurant nearby. see you soon',
    "don't miss this chance to win 100$ dollars",
]

In [135]:
# Classify the sample messages; with the encoding used above, 1 means spam and 0 means ham.
clf.predict(testing_emails)

array([0, 1])