In [None]:
# pandas loads the dataset; google.colab.drive gives access to files kept on Google Drive (Colab-only).
import pandas as pd
from google.colab import drive
# Mount Google Drive under /gdrive so the CSV can be read through a normal file path.
drive.mount('/gdrive')
# Make the mount point the working directory.
%cd /gdrive

Mounted at /gdrive
/gdrive


In [None]:
# Load the spam dataset from the mounted Drive.
raw_df = pd.read_csv('/gdrive/MyDrive/spam.csv')
# The CSV was saved together with its row index, which pandas reads back as an
# 'Unnamed: 0' column. It is only a row counter, and while it is present
# groupby(...).describe() summarises row numbers instead of the messages,
# so drop any such leftover columns before going further.
stray_index_cols = [col for col in raw_df.columns if str(col).startswith('Unnamed')]
df = raw_df.drop(columns=stray_index_cols)
df.head(5)

Unnamed: 0.1,Unnamed: 0,Category,Message
0,0,ham,"Go until jurong point, crazy.. Available only ..."
1,1,ham,Ok lar... Joking wif u oni...
2,2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,3,ham,U dun say so early hor... U c already then say...
4,4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Summary statistics per class; the count column shows how many ham vs. spam rows there are.
df.groupby('Category').describe()

Unnamed: 0_level_0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
ham,4825.0,2793.21886,1604.618358,0.0,1409.0,2794.0,4184.0,5571.0
spam,747.0,2735.64257,1634.604866,2.0,1228.0,2719.0,4151.5,5567.0


In [None]:
# Models only work on numbers, so both the label and the message text must be encoded.
# Start with the label: add a binary 'spam' column that is 1 where Category is
# 'spam' and 0 for everything else. The comparison is vectorised, so no
# per-row Python lambda is needed.
df['spam'] = (df['Category'] == 'spam').astype('int64')

In [None]:
# Check that the binary 'spam' column was added next to Category.
df.head(5)

Unnamed: 0.1,Unnamed: 0,Category,Message,spam
0,0,ham,"Go until jurong point, crazy.. Available only ...",0
1,1,ham,Ok lar... Joking wif u oni...,0
2,2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,3,ham,U dun say so early hor... U c already then say...,0
4,4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [None]:
# Hold out 25% of the messages for evaluation.
# random_state pins the shuffle so that every "Restart & Run All" produces the
# same split (and therefore the same scores); stratify keeps the ham/spam ratio
# the same in both parts, which matters because spam is the minority class.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df.Message,
    df.spam,
    test_size=0.25,
    random_state=42,
    stratify=df.spam,
)

**Feature Extraction**

In [None]:
# Turn the raw message text into numbers with a bag-of-words encoding:
# CountVectorizer learns the vocabulary of the training messages and encodes
# each message as a row of per-word counts.
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()
X_train_count = vect.fit_transform(X_train.values)
# Peek at the first five rows only. Slicing the sparse matrix before calling
# toarray() avoids materialising the whole document-term matrix as a dense array.
X_train_count[:5].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# Use a multinomial Naive Bayes classifier: it is meant for discrete features,
# such as the word counts built from the messages.

from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()


In [None]:
# Train the classifier on the vectorised training messages and their 0/1 labels
model.fit(X_train_count,y_train)

MultinomialNB()

**Sample messages to classify**

In [None]:
# Two hand-written messages used to try out the classifier:
# the first is an ordinary personal note, the second a promotional offer.
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

In [None]:
emails_count = vect.transform(emails) # encode the messages with the vectorizer already fitted on the training set
model.predict(emails_count) # one predicted label per message: 1 = spam, 0 = not spam


array([0, 1])

In [None]:
# Measure accuracy on the held-out messages.
# The test text is only transformed with the already-fitted vectorizer; it is not re-fitted here.
X_test_count = vect.transform(X_test)
model.score(X_test_count,y_test)

0.9842067480258435

**Re-running the same workflow with a Pipeline (vectorizer + classifier in one estimator)**



In [None]:
# Bundle the vectorizer and the classifier into one estimator so that raw text
# can be passed straight to fit / score / predict.
from sklearn.pipeline import Pipeline
clsf = Pipeline(
    steps=[
        ('vectorizer', CountVectorizer()),
        ('nb', MultinomialNB()),
    ]
)

In [None]:
# Fit the pipeline on the raw training text and labels; no separate vectorising step is needed.
clsf.fit(X_train,y_train)

Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])

In [None]:
# Score the pipeline on the raw held-out messages.
clsf.score(X_test,y_test)

0.9842067480258435

In [None]:
# Classify the sample messages directly from their raw text (1 = spam, 0 = not spam).
clsf.predict(emails)

array([0, 1])