In [133]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [134]:
# Load the SMS corpus. The file has no header row, so the two columns
# (label, message text) are named explicitly here.
data = pd.read_table(
    "spam_ham.txt",
    names=['Class', 'sms'],
    header=None,
)

In [135]:
# (rows, columns) of the loaded frame.
data.shape

(5572, 2)

In [136]:
# Class balance: number of messages per label.
data.Class.value_counts()

ham     4825
spam     747
Name: Class, dtype: int64

In [137]:
# Replace the string labels with integer codes. LabelEncoder orders the
# classes by sorted value, so 'ham' becomes 0 and 'spam' becomes 1.
le = LabelEncoder()
data['Class'] = le.fit_transform(data['Class'])

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: Class, Length: 5572, dtype: int64

In [138]:
# Feature column: the raw message text.
x_data = data['sms']

In [139]:
# Target column: the message label.
y_class = data['Class']

In [140]:
# preprocessing

In [8]:
# Peek at the first five messages.
x_data.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: sms, dtype: object

In [9]:
# Display the whole message Series (pandas elides the middle rows).
x_data

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ã¼ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: sms, Length: 5572, dtype: object

In [92]:
# NOTE(review): clean_data is not referenced by any later cell visible here;
# confirm it is still needed before keeping it.
clean_data = []

In [93]:
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

In [141]:
# Clean every SMS: tokenize, drop NLTK English stop words, strip punctuation
# from each remaining token, and Porter-stem it. Each message ends up as one
# space-joined string in word_list_per_sent (same order as x_data).
porter = PorterStemmer()

# Build the stop-word lookup once. Calling stopwords.words('english') per
# token rebuilt the list every time and made each membership test a linear
# scan; a set gives the same answers with constant-time lookups.
english_stopwords = set(stopwords.words('english'))

word_list_per_sent = []
for sent in x_data:
    clean_words = []
    for word in word_tokenize(sent):
        # The stop-word test is case-sensitive and runs on the raw token,
        # before punctuation is stripped.
        if word in english_stopwords:
            continue
        # Remove every character that is neither a word character nor
        # whitespace; tokens that were pure punctuation become empty.
        word = re.sub(r'[^\w\s]', '', word)
        if word != '':
            clean_words.append(porter.stem(word))
    word_list_per_sent.append(" ".join(clean_words))

In [143]:
# Sanity check: exactly one cleaned string per input message. (A bare
# len(...) on a non-final line of a cell is evaluated and then discarded,
# so it never displayed anything.)
assert len(word_list_per_sent) == len(x_data)
print(word_list_per_sent[0])

go jurong point crazi avail bugi n great world la e buffet cine got amor wat


In [144]:
from sklearn.model_selection import train_test_split

In [145]:
# Hold out 30% of the messages for evaluation; the fixed random_state makes
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    word_list_per_sent, y_class, test_size=0.3, random_state=42)

In [146]:
# First training document after the split.
X_train[0]

'quit late lar ard 12 anyway wun b drivin'

In [147]:
# Vectorizing the sentences into token counts. stop_words='english' asks
# CountVectorizer to drop English stop words as well, on top of the NLTK
# stop-word filtering already applied while cleaning the messages.
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(stop_words='english')

In [148]:
# Learn the vocabulary from the training documents only, so the test split
# does not influence the feature space.
vect.fit(X_train)

CountVectorizer(stop_words='english')

In [150]:
# Token -> column-index mapping learned by fit().
# NOTE(review): this displays the entire vocabulary dict; consider showing
# only a small sample to keep the notebook output readable.
vect.vocabulary_

{'quit': 4485,
 'late': 3306,
 'lar': 3302,
 'ard': 947,
 '12': 254,
 'wun': 6104,
 'drivin': 2042,
 'tuesday': 5647,
 'night': 3881,
 'real': 4539,
 'chase': 1492,
 'run': 4718,
 'cross': 1750,
 'street': 5221,
 'say': 4778,
 'answer': 902,
 'text': 5422,
 'confirmdeni': 1660,
 'work': 6067,
 'small': 5015,
 'hous': 2870,
 'think': 5468,
 'stop': 5208,
 'like': 3367,
 'hour': 2868,
 'roommat': 4691,
 'look': 3425,
 'stock': 5200,
 'trip': 5624,
 'lol': 3414,
 'great': 2654,
 'im': 2949,
 'hungri': 2899,
 'good': 2610,
 'll': 3395,
 'phone': 4215,
 'tomo': 5559,
 'lunchtim': 3482,
 'shall': 4877,
 'organis': 4069,
 'someth': 5060,
 'yesterday': 6209,
 'true': 5631,
 'ya': 6176,
 'nice': 3873,
 'readi': 4538,
 'thursday': 5495,
 'hi': 2795,
 'way': 5927,
 '2day': 382,
 'normal': 3926,
 'ur': 5759,
 'uniqu': 5724,
 'hope': 2853,
 'know': 3247,
 'rest': 4637,
 'mylif': 3797,
 'wot': 6078,
 'lost': 3435,
 'today': 5539,
 'accept': 738,
 'day': 1816,
 'brother': 1307,
 'sister': 4969,
 'lov

In [151]:
# Number of distinct tokens in the learned vocabulary.
len(vect.vocabulary_)

6309

In [152]:
# Project both splits onto the vocabulary learned from the training text.
X_train_transformed, X_test_transformed = (
    vect.transform(split) for split in (X_train, X_test)
)

In [155]:
# Non-zero (row, column) count entries of the first training document.
print(X_train_transformed[0])

  (0, 254)	1
  (0, 947)	1
  (0, 2042)	1
  (0, 3302)	1
  (0, 3306)	1
  (0, 4485)	1
  (0, 6104)	1


In [156]:
# Train a multinomial Naive Bayes classifier on the training counts, then
# score the held-out messages.
from sklearn.naive_bayes import MultinomialNB

# fit() returns the estimator itself, so construction and training chain.
mnb = MultinomialNB().fit(X_train_transformed, y_train)

# Class-membership probabilities for each test message.
y_pred_proba = mnb.predict_proba(X_test_transformed)

# Hard class labels for each test message.
y_pred_class = mnb.predict(X_test_transformed)

In [157]:
# Display the fitted estimator. No alpha was passed when it was built, so
# the default smoothing value (alpha=1) is in effect.
mnb

MultinomialNB()

In [159]:
from sklearn import metrics

# Overall accuracy: fraction of test messages whose predicted label matches
# the true label.
metrics.accuracy_score(y_true=y_test, y_pred=y_pred_class)

0.9856459330143541

In [160]:
# Confusion matrix: rows are the true labels, columns the predicted labels.
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_class)

array([[1440,    8],
       [  16,  208]], dtype=int64)