## Quadri Alli

## Spam detection classification using Multinomial Naive Bayes

In [118]:
#Import Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [119]:
#Reading text data
df = pd.read_fwf('spam_doc.txt', index=False)
df

Unnamed: 0,"Category,Message",Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17
0,"ham,""Go until jurong point, crazy.. Available ...",,,,,,,,,,,,,,,,,
1,"ham,Ok lar... Joking wif u oni...",,,,,,,,,,,,,,,,,
2,"spam,Free entry in 2 a wkly comp to win FA Cup...",,,,,,,,,,,,,,,,,
3,"ham,U dun say so early hor... U c already then...",,,,,,,,,,,,,,,,,
4,"ham,""Nah I don't think he goes to usf, he live...",,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5569,"spam,""This is the 2nd time we have tried 2 con...",,,,,,,,,,,,,,,,,
5570,"ham,Will ü b going to esplanade fr home?",,,,,,,,,,,,,,,,,
5571,"ham,""Pity, * was in mood for that. So...any ot...",,,,,,,,,,,,,,,,,
5572,"ham,The guy did some bitching but I acted like...",,,,,,,,,,,,,,,,,


In [120]:
#Choosing the import columns for analysis
df = df.iloc[:, :1]

In [121]:
#Data Cleaning
df[['Category','Message']] = df['Category,Message'].str.split(',', n=1, expand=True) #creates 2 new columns with split
df = df.drop(df[~df['Category'].isin(['spam', 'ham'])].index) # deletes rows that doesn't contain exactly spam and ham
df = df.drop('Category,Message', axis=1) # drops the Category,Message Column
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[['Category','Message']] = df['Category,Message'].str.split(',', n=1, expand=True) #creates 2 new columns with split
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[['Category','Message']] = df['Category,Message'].str.split(',', n=1, expand=True) #creates 2 new columns with split


Unnamed: 0,Category,Message
0,ham,"""Go until jurong point, crazy.. Available only..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"""Nah I don't think he goes to usf, he lives ar..."
...,...,...
5569,spam,"""This is the 2nd time we have tried 2 contact ..."
5570,ham,Will ü b going to esplanade fr home?
5571,ham,"""Pity, * was in mood for that. So...any other ..."
5572,ham,The guy did some bitching but I acted like i'd...


In [122]:
df.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"""Sorry, I'll call later""",30
spam,747,641,Please call our customer service representativ...,4


In [123]:
# Encoding the categorical variable that we want as our output feature.
df['spam'] = df['Category'].apply(lambda x: 1 if x == 'spam' else 0)
df

Unnamed: 0,Category,Message,spam
0,ham,"""Go until jurong point, crazy.. Available only...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"""Nah I don't think he goes to usf, he lives ar...",0
...,...,...,...
5569,spam,"""This is the 2nd time we have tried 2 contact ...",1
5570,ham,Will ü b going to esplanade fr home?,0
5571,ham,"""Pity, * was in mood for that. So...any other ...",0
5572,ham,The guy did some bitching but I acted like i'd...,0


In [124]:
# Choosing our input and output variables
X = df['Message']
y = df['spam']

In [135]:
#Training the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25)

In [136]:
# Find,count training data words and store as matrix
cv = CountVectorizer()
X_train_count = cv.fit_transform(X_train.values) #fit transfrom and give me values

In [137]:
X_train_count #4179 rows of data & 7373 unique words

<4179x7354 sparse matrix of type '<class 'numpy.int64'>'
	with 54193 stored elements in Compressed Sparse Row format>

In [138]:
X_train_count.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [146]:
#Training model
model = MultinomialNB()
model.fit(X_train_count, y_train)

In [147]:
#Testing an example. Model predicts Messsage is a aspam
email = ['i fucke ujhbvfbsikb free entry']
email_count = cv.transform(email)
model.predict(email_count)

array([1])

In [153]:
# Model Predicts message is not a spam
email_2 = ['Hi,would you like to hang out tmr']
email_count_2 = cv.transform(email_2)
model.predict(email_count_2)

array([0])

In [149]:
#Test model. Our model has a 98% accuracy score, which is pretty good.
X_test_count = cv.transform(X_test) 
model.score(X_test_count, y_test)

0.9877961234745154