In [2]:
#load the necesssary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [4]:
#Load the dataset 
df = pd.read_csv('spam.csv',encoding='latin-1')

In [5]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [6]:
#remove the unnecessary columns
df = df.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'],axis=1)

In [7]:
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [8]:
#rename v1,v2 as Type and Messages respectively for better understanding
df = df.rename(columns={'v1':'Type','v2':'Messages'})

In [9]:
df.head()

Unnamed: 0,Type,Messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
df.groupby('Type').describe()

Unnamed: 0_level_0,Messages,Messages,Messages,Messages
Unnamed: 0_level_1,count,unique,top,freq
Type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [11]:
#turn ham and spam into numerical data, creating a new column called spam
# te1-indicas spam 0-indicates ham
df['spam'] = df['Type'].apply(lambda x:1 if x == 'spam' else 0)

In [12]:
df

Unnamed: 0,Type,Messages,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will Ì_ b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


Here we can see a seperate column names 'spam' wherein '1' indicates the message as spam and '0' indicates the message as 'ham'


In [13]:
#create train/test split
X_train,X_test,y_train,y_test = train_test_split(df.Messages,df.spam,test_size = 0.25)

In [14]:
#find wordcount and store data as a matrix
cv = CountVectorizer()
X_train_count = cv.fit_transform(X_train.values)

In [15]:
X_train_count.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [16]:
#train the model using multinomial naive byes 
model = MultinomialNB()
model.fit(X_train_count, y_train)

In [18]:
#pre-test ham message
sms_ham = ['hi there, how are you?']
sms_ham_count = cv.transform(sms_ham)
model.predict(sms_ham_count)

array([0], dtype=int64)

here 0 is returned indicating that the message is ham


In [21]:
#pre-test spam message
sms_spam = ['congrats! you have won 1000000']
sms_spam_count = cv.transform(sms_spam)
model.predict(sms_spam_count)

array([1], dtype=int64)

here 1 is returned indicating the message as spam

In [20]:
#testing the model for its accuracy
x_test_count = cv.transform(X_test)
model.score(x_test_count, y_test)

0.9877961234745154