In [14]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [2]:
data = pd.read_csv('/Users/tWo/Documents/Project/SMS Spam Detection/spam.csv', usecols=[0,1], encoding='latin-1')
data.columns = ['label', 'message']
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
print('The data has {:0d} rows and {:0d} columns.'.format(data.shape[0], data.shape[1]))
print('The basic counts for data:', '\n', data.groupby('label').describe())

The data has 5572 rows and 2 columns.
The basic counts for data: 
                                                         message
label                                                          
ham   count                                                4825
      unique                                               4516
      top                                Sorry, I'll call later
      freq                                                   30
spam  count                                                 747
      unique                                                653
      top     Please call our customer service representativ...
      freq                                                    4


Split data for train and test, using first 4000 for train and remaining for test.

In [4]:
test = data.iloc[4000:,:]
train = data.iloc[:4000,:]
print('The train has {:0d} rows and {:0d} columns.'.format(train.shape[0], train.shape[1]))
print('The test has {:0d} rows and {:0d} columns.'.format(test.shape[0], test.shape[1]))

The train has 4000 rows and 2 columns.
The test has 1572 rows and 2 columns.


In [5]:
def clean_text(text):
    text = text.translate(str.maketrans('','', string.punctuation)).lower()
    result = []
    for word in text.split():
        if word not in stopwords.words('english'):
            result.append(word)
    return ' '.join(result)

train['message'] = train['message'].apply(clean_text)
train.head()

Unnamed: 0,label,message
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor u c already say
4,ham,nah dont think goes usf lives around though


In [8]:
x, y = train['message'], train['label']
vect = TfidfVectorizer('english')
x_trans = vect.fit_transform(x)
x_trans

<4000x7764 sparse matrix of type '<class 'numpy.float64'>'
	with 33848 stored elements in Compressed Sparse Row format>

In [58]:
nb_model = MultinomialNB(alpha=1)
nb_model.fit(x_trans, y)
print(nb_model.score(x_trans, y))
metrics.confusion_matrix(y, nb_model.predict(x_trans))

0.9775


array([[3465,    0],
       [  90,  445]])

In [59]:
y_pred, y_true = pd.Series(nb_model.predict(x_trans)), y
pd.crosstab(y_true, y_pred, rownames=['True'], colnames=['Pred'], margins=True)

Pred,ham,spam,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ham,3465,0,3465
spam,90,445,535
All,3555,445,4000


In [60]:
x_test, y_test = test['message'], test['label']
x_test = vect.transform(x_test)
print(nb_model.score(x_test, y_test))
metrics.confusion_matrix(y_test, nb_model.predict(x_test))

0.955470737913


array([[1360,    0],
       [  70,  142]])

In [None]:
# spam_text = []
# ham_text = []
# for i in range(0,len(train)):
#     if train.ix[i,0] == 'spam':
#         for line in train['message']:
#             for word in line.split():
#                 spam_text.append(word)
#     else:
#         for line in train['message']:
#             for word in line.split():
#                 ham_text.append(word)

# spam_text, ham_text = pd.Series(spam_text), pd.Series(ham_text)
# print(spam_text.value_counts().head(20))
# print(ham_text.value_counts().head(20))