In [62]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_score, confusion_matrix

In [103]:
data = pd.read_csv('https://raw.githubusercontent.com/codebasics/py/master/ML/14_naive_bayes/spam.csv')

In [97]:
data.head(10)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [98]:
# Information about target feature
data.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [104]:
# Convert categorical data into numericals
data['spam'] = data['Category'].apply(lambda x: 0 if x == 'spam' else 1)
data.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",1
1,ham,Ok lar... Joking wif u oni...,1
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,0
3,ham,U dun say so early hor... U c already then say...,1
4,ham,"Nah I don't think he goes to usf, he lives aro...",1


In [106]:
# Convert our sentences to vectors
# It will build the dictionary containing all the words

v = CountVectorizer()
x = v.fit_transform(data['Message'].values)
y = data['spam']

In [107]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [108]:
# Fitting the training data into model

model = MultinomialNB()
model.fit(x_train, y_train)

MultinomialNB()

In [109]:
predictions = model.predict(x_test)

# Finding how much messages our model correctly classified
conf = confusion_matrix(y_test, predictions)
print('\n Total successfully identified spam messages : ', conf[1][1])
print('\n Messages that are identified as spam, but they are not spam : ', conf[0][1])

# Precision score
prec_score = precision_score(y_test, predictions)
print('\n Precision score for our model: ', prec_score)


 Total successfully identified spam messages :  1193

 Messages that are identified as spam, but they are not spam :  9

 Precision score for our model:  0.9925124792013311


In [110]:
# Prediction on custome example
emails = [
          'would you like to watch movie tomorrow with me?',
          'You have won prize and that is car For free'
]
emails_count = v.transform(emails)
model.predict(emails_count)

# 0 - spam
# 1 - ham

array([1, 0])