In [12]:
import pandas as pd
from sklearn import naive_bayes, metrics
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

sms_data = pd.read_csv('datasets/smsspamcollection/SMSSpamCollection', 
                       header=None, 
                       sep='\t', 
                       names=['Label', 'SMS'])

print(sms_data.groupby('Label').count())

for i in range(0, 5):
    print(sms_data['Label'][i])
    print(sms_data['SMS'][i])
    print("\n")

sms_data_clean = sms_data.copy()
sms_data_clean['SMS'] = sms_data_clean['SMS'].str.replace(r'\W+', ' ', regex=True).str.replace(r'\s+', ' ', regex=True).str.strip()
sms_data_clean['SMS'] = sms_data_clean['SMS'].str.lower()
sms_data_clean['SMS'] = sms_data_clean['SMS'].str.split()


le = preprocessing.LabelEncoder()
sms_data_clean['Label'] = le.fit_transform(sms_data_clean['Label'])

for i in range(0, 5):
    print(sms_data_clean['Label'][i])
    print(sms_data_clean['SMS'][i])
    print("\n")


X_train, X_test, y_train, y_test = train_test_split(sms_data_clean['SMS'], sms_data_clean['Label'], test_size=0.2, random_state=42)

        SMS
Label      
ham    4825
spam    747
ham
Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...


ham
Ok lar... Joking wif u oni...


spam
Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's


ham
U dun say so early hor... U c already then say...


ham
Nah I don't think he goes to usf, he lives around here though


0
['go', 'until', 'jurong', 'point', 'crazy', 'available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'there', 'got', 'amore', 'wat']


0
['ok', 'lar', 'joking', 'wif', 'u', 'oni']


1
['free', 'entry', 'in', '2', 'a', 'wkly', 'comp', 'to', 'win', 'fa', 'cup', 'final', 'tkts', '21st', 'may', '2005', 'text', 'fa', 'to', '87121', 'to', 'receive', 'entry', 'question', 'std', 'txt', 'rate', 't', 'c', 's', 'apply', '08452810075over18', 's']


0
['u', 'dun', 'say', 'so', 'early', 

In [6]:
print(sms_data.head())

print(sms_data_clean.head())

  Label                                                SMS
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
   Label                                                SMS
0      0  [go, until, jurong, point, crazy, available, o...
1      0                     [ok, lar, joking, wif, u, oni]
2      1  [free, entry, in, 2, a, wkly, comp, to, win, f...
3      0  [u, dun, say, so, early, hor, u, c, already, t...
4      0  [nah, i, don, t, think, he, goes, to, usf, he,...


In [40]:
sms_data_clean['Label'].value_counts()

0    4825
1     747
Name: Label, dtype: int64

In [41]:
sms_data_clean['Label'].value_counts() / sms_data.shape[0] * 100

0    86.593683
1    13.406317
Name: Label, dtype: float64

In [14]:


vocabulary = list(set(X_train.sum()))
X_train_voc = pd.DataFrame([
    [row.count(word) for word in vocabulary]
    for row in X_train], columns=vocabulary)
X_test_voc = pd.DataFrame([
    [row.count(word) for word in vocabulary]
    for row in X_test], columns=vocabulary)

X_train_voc.head()

Unnamed: 0,weeks,try,thinl,neekunna,thia,funny,blanket,doubles,conclusion,mila,...,performed,qing,jane,timi,fieldof,sorting,everyday,triple,23g,54
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
m = []

for row in X_train:
    l = []
    for word in vocabulary:
        l.append(row.count(word))
    
    m.append(l)

X_train_voc_2 = pd.DataFrame(m, columns=vocabulary)

X_train_voc_2.head()

Unnamed: 0,cell,4eva,bleak,sensible,main,ps,reassuring,chechi,scrumptious,chores,...,overdid,milta,misundrstud,west,natalja,comingdown,dramatic,aunt,applying,prefer
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [48]:
print(X_train_voc.shape)
print(X_test_voc.shape)

(4457, 7741)
(1115, 7741)


In [33]:
cl_gauss = naive_bayes.GaussianNB()
res_gauss = cl_gauss.fit(X_train_voc, y_train).predict(X_test_voc)
print(metrics.accuracy_score(y_test, res_gauss) * 100)


91.03139013452915


In [34]:
cl_multi = sklearn.naive_bayes.MultinomialNB()
res_multi = cl_multi.fit(X_train_voc, y_train).predict(X_test_voc)
metrics.accuracy_score(y_test, res_multi) * 100

99.19282511210761

In [35]:
cl_bern = sklearn.naive_bayes.BernoulliNB()
res_bern = cl_bern.fit(X_train_voc, y_train).predict(X_test_voc)
metrics.accuracy_score(y_test, res_bern) * 100

98.29596412556054