In [49]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import random

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB

In [50]:
# Load the spam CSV (decoded as latin-1), discard the three 'Unnamed' columns
# and rename v1 -> class, v2 -> text. Written as a single chain so the cell
# builds `df` from the file in one expression, with no in-place mutation.
df = (
    pd.read_csv('./archive/spam.csv', encoding='latin-1')
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
    .rename(columns={"v1": "class", "v2": "text"})
)

In [51]:
# Preview the first five rows of the cleaned frame.
df.head(n=5)

Unnamed: 0,class,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [52]:
# Class balance: print the number of 'ham' rows, then the number of 'spam' rows.
for label in ("ham", "spam"):
    print((df["class"] == label).sum())

4825
747


In [53]:
# Split into training (70%) and test (30%) sets.
# Features and labels are each kept as a single-column DataFrame.
X = df['text'].to_frame()
Y = df['class'].to_frame()

# Seeds Python's built-in `random` generator. The split below is pinned
# separately through its own `random_state` argument.
random.seed(100)

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    train_size=0.7,
    test_size=0.3,
    random_state=10,
)

In [54]:
# Build the vocabulary from the training messages only.
# min_df=3 drops tokens that occur in fewer than 3 training messages.
vec_count = CountVectorizer(min_df=3).fit(X_train['text'])

# Report the vocabulary size and show its first five token -> column-index pairs.
vocabulary = vec_count.vocabulary_
print('word size: ', len(vocabulary))
print('word content: ', {token: index for token, index in list(vocabulary.items())[:5]})

word size:  2196
word content:  {'hey': 901, 'there': 1885, 'babe': 266, 'how': 936, 'doin': 607}


In [55]:
# Vectorise train and test messages into token-count matrices using the
# vocabulary fitted on the training set.
X_train_vec = vec_count.transform(X_train['text'])
# NOTE: this name is spelled `X_text_vec` (not `X_test_vec`); the scoring cell
# refers to it by this exact spelling, so it is kept as-is.
X_text_vec = vec_count.transform(X_test['text'])

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement when available and fall back on older installations.
if hasattr(vec_count, 'get_feature_names_out'):
    feature_names = vec_count.get_feature_names_out()
else:
    feature_names = vec_count.get_feature_names()

# Preview the first five vectorised training messages. The sparse matrix is
# sliced *before* densifying so only five rows are materialised.
pd.DataFrame(X_train_vec[:5].toarray(), columns=feature_names)

Unnamed: 0,00,000,02,03,04,05,06,07xxxxxxxxx,0800,08000839402,...,yr,yrs,yummy,yun,yup,zed,ì_,ìï,û_,ûò
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [56]:
# Naive Bayes classifier (Bernoulli variant) with default settings, trained on
# the vectorised training messages and their 'class' labels.
# `fit` is left as the final expression so the cell still displays the estimator.
model = BernoulliNB()
model.fit(X=X_train_vec, y=Y_train.loc[:, 'class'])

BernoulliNB()

In [57]:
# Persist the trained classifier to disk with pickle.
# NOTE(review): only `model` is written here. The fitted `vec_count` that maps
# text to feature vectors is not saved by this cell; confirm it is persisted
# elsewhere if the pickled model is meant to be reused on its own.
filename = 'NB_model.sav'
# The context manager guarantees the file is flushed and closed, even if
# pickling raises; the bare open() call previously left the handle dangling.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [58]:
# Mean accuracy on the training set and on the held-out test set.
# Labels are passed as the 1-D 'class' column, matching what was given to
# fit(), rather than as a single-column DataFrame.
train_accuracy = model.score(X_train_vec, Y_train['class'])
test_accuracy = model.score(X_text_vec, Y_test['class'])

print('Train accuracy = %.3f' % train_accuracy)
print(' Test accuracy = %.3f' % test_accuracy)

Train accuracy = 0.989
 Test accuracy = 0.984


In [44]:
# Receive input text: one message per line; an empty line ends the input.
# The first line is always kept, even when it is empty.
first_message = input()
data_list = [first_message]
if first_message != "":
    # iter(input, "") keeps calling input() until it returns an empty string.
    data_list.extend(iter(input, ""))
data = np.array(data_list)

hfgakhsvasb



Mila, age23, blonde, new in UK. I look sex with UK guys. if u like fun with me. Text MTALK to 69866.18 . 30pp/txt 1st 5free. å£1.50 increments. Help08718728876

Claim a 200 shopping spree, just call 08717895698 now! Have you won! MobStoreQuiz10ppm

In [45]:

# For a non-interactive run, `data` can be assigned directly instead, e.g.:
#data = np.array(['I am happy.',
#                 'Are you happy? 00',
#                 'Free service! Please contact me immediately. But it is 300 US dollars next month.'])
df_data = pd.DataFrame(data, columns=['text'])

# Vectorise the input text with the vocabulary fitted on the training set.
input_vec = vec_count.transform(df_data['text'])

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement when available and fall back on older installations.
if hasattr(vec_count, 'get_feature_names_out'):
    input_feature_names = vec_count.get_feature_names_out()
else:
    input_feature_names = vec_count.get_feature_names()

# Show the token counts of the input messages, one row per message.
pd.DataFrame(input_vec.toarray(), columns=input_feature_names)

Unnamed: 0,00,000,02,03,04,05,06,07xxxxxxxxx,0800,08000839402,...,yr,yrs,yummy,yun,yup,zed,ì_,ìï,û_,ûò
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [46]:
# Classify every input message and print its predicted label, numbering from 1.
result = model.predict(input_vec)
for message_no, label in enumerate(result, start=1):
    print("Message " + str(message_no) + " is " + label)

Message 1 is ham
