# Problem Statement

# Given the data above, build a machine learning model that classifies whether an email is spam or not

In [2]:
import pandas as pd # for data frame
import numpy as np # for mathemetical operation
import matplotlib.pyplot as plt # for vizualization
%matplotlib inline
import seaborn as sns # for visualization

In [3]:
# Read the labelled messages from the CSV file and preview the first five rows
df = pd.read_csv(filepath_or_buffer='spam.csv')
df.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Per-class summary statistics (count, unique, top, freq) of the other columns
df.groupby(by='Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [7]:
# Encode the Category column as a binary target: 1 for "spam", 0 for anything else.
# A vectorized comparison replaces the row-by-row apply/lambda; the explicit
# int64 cast keeps the same dtype the lambda-based version produced.
df["Spam"] = (df["Category"] == "spam").astype("int64")
df.head()

Unnamed: 0,Category,Message,Spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [9]:
# Split the data into train and test sets (75% / 25%).
# random_state pins the shuffle so the split, and therefore every score computed
# from it, is reproducible after Restart & Run All.
# stratify keeps the spam/ham ratio identical in both subsets, which matters
# because spam is the minority class.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df.Message,
    df.Spam,
    test_size=0.25,
    random_state=42,
    stratify=df.Spam,
)

In [10]:
# Feature extraction: learn the vocabulary from the training messages only and
# turn each message into a sparse vector of token counts.
from sklearn.feature_extraction.text import CountVectorizer
v = CountVectorizer()
X_train_count = v.fit_transform(X_train.values)
# Slice the sparse matrix first so only the three previewed rows are densified,
# instead of materializing the whole document-term matrix as a dense array.
X_train_count[:3].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [16]:
# Multinomial Naive Bayes classifier, constructed with no arguments
# (i.e. using the library's default settings).
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()

In [18]:
# Fit the classifier on the training count matrix and its labels
model.fit(X=X_train_count, y=y_train)

MultinomialNB()

In [19]:
# Classify two hand-written sample emails with the trained model.
# They must be vectorized with the already-fitted vectorizer `v` so the
# columns line up with the features the model was trained on.
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!',
]
emails_count = v.transform(raw_documents=emails)
model.predict(X=emails_count)

array([0, 1], dtype=int64)

In [21]:
# Mean accuracy on the data the model was trained on
model.score(X=X_train_count, y=y_train)

0.9928212491026561

In [20]:
# Mean accuracy on the held-out test data.
# The test messages are only transformed (never fitted) with the vectorizer
# that was fitted on the training messages.
X_test_count = v.transform(raw_documents=X_test)
model.score(X=X_test_count, y=y_test)

0.9863603732950467

In [23]:
# Bundle vectorization and classification into a single estimator, so raw
# text can be passed straight to fit / score / predict.
from sklearn.pipeline import Pipeline
clf = Pipeline(
    steps=[
        ("vectorizer", CountVectorizer()),
        ("nb", MultinomialNB()),
    ]
)

In [25]:
# Fit the whole pipeline on the raw training messages and their labels
clf.fit(X=X_train, y=y_train)

Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])

In [26]:
# Mean accuracy of the pipeline on the raw held-out test messages
clf.score(X=X_test, y=y_test)

0.9863603732950467

In [27]:
# Predict the class of the raw sample emails; the pipeline vectorizes them itself
clf.predict(X=emails)

array([0, 1], dtype=int64)