In [1]:
import pandas as pd

In [4]:
# Load the tab-separated SMS corpus. The file has no header row, so the two
# columns are named explicitly.
sms = pd.read_csv(
    'sms.tsv',
    sep='\t',
    header=None,
    names=['label', 'message'],
)
sms.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# (rows, columns) of the loaded frame
sms.shape

(5572, 2)

In [7]:
# Encode the string label as an integer target: ham -> 0, spam -> 1
label_codes = {'ham': 0, 'spam': 1}
sms['label_num'] = sms['label'].map(label_codes)
sms.head()

Unnamed: 0,label,message,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [8]:
# Feature series (raw message text) and integer target (0 = ham, 1 = spam)
X, y = sms['message'], sms['label_num']

In [9]:
from sklearn.model_selection import train_test_split

In [11]:
# Random train/test partition; random_state pins the shuffle so the split
# is reproducible from run to run.
splits = train_test_split(X, y, random_state=1)
X_train, X_test, y_train, y_test = splits

# Vectorizing text into numeric features

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
# Bag-of-words vectorizer with all default settings
vect = CountVectorizer()

In [14]:
# Learn the vocabulary from the training messages only
vect.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [25]:
# Encode the training messages as a document-term matrix of token counts
X_train_dtm = vect.transform(X_train)

In [26]:
# Inspect the sparse matrix: one row per training message, one column per token
X_train_dtm

<4179x7456 sparse matrix of type '<class 'numpy.int64'>'
	with 55209 stored elements in Compressed Sparse Row format>

In [27]:
from sklearn.naive_bayes import MultinomialNB

In [28]:
# Multinomial Naive Bayes classifier with default hyperparameters
nb = MultinomialNB()

In [30]:
# Train on the training document-term matrix; %time reports how long the fit takes
%time nb.fit(X_train_dtm, y_train)

Wall time: 2.91 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [31]:
# Encode the test messages with the already-fitted vectorizer (transform only,
# no refit) so the columns line up with the training matrix.
X_test_dtm = vect.transform(X_test)
# Hard class predictions (0 = ham, 1 = spam) for every test message
y_pred_class = nb.predict(X_test_dtm)

In [32]:
from sklearn import metrics

In [33]:
# Fraction of test messages whose predicted class matches the true class
accuracy = metrics.accuracy_score(y_test, y_pred_class)
print(accuracy)

0.9885139985642498


In [34]:
# Confusion matrix: rows are the true class, columns the predicted class
# (index 0 = ham, index 1 = spam)
confusion = metrics.confusion_matrix(y_test, y_pred_class)

In [35]:
# Display the 2x2 matrix; the off-diagonal cells are the misclassified messages
confusion

array([[1203,    5],
       [  11,  174]], dtype=int64)

In [38]:
# One prediction per test message
y_pred_class.shape

(1393,)

In [40]:
# y_test keeps the original row labels of sms, so its index is in shuffled order
y_test.index

Int64Index([1078, 4028,  958, 4642, 4674, 5461, 4210, 4216, 1603, 1504,
            ...
            2870, 5458, 2890, 3658, 4285, 3207, 4655, 1140, 1793, 1710],
           dtype='int64', length=1393)

In [52]:
# Side-by-side frame of true and predicted labels. The row labels come from
# y_test (a Series); the prediction array is aligned to them by position.
check = pd.DataFrame(
    data={
        'y_test': y_test,
        'y_pred_class': y_pred_class,
    }
)

In [60]:
# False positives: ham (0) messages that the model labelled as spam (1).
# The row labels of the misclassified test rows are used to look the
# original messages up in sms.
is_false_positive = (y_test == 0) & (y_pred_class == 1)
false_positive_rows = check.loc[is_false_positive].index
sms.loc[false_positive_rows].message

574               Waiting for your call.
3375             Also andros ice etc etc
45      No calls..messages..missed calls
3415             No pic. Please re-send.
1988    No calls..messages..missed calls
Name: message, dtype: object

In [62]:
# Simpler route to the same false positives: filter X_test directly with
# the boolean mask instead of going through sms.
ham_flagged_as_spam = (y_test == 0) & (y_pred_class == 1)
X_test[ham_flagged_as_spam]

574               Waiting for your call.
3375             Also andros ice etc etc
45      No calls..messages..missed calls
3415             No pic. Please re-send.
1988    No calls..messages..missed calls
Name: message, dtype: object

In [61]:
# False negatives: spam (1) messages that the model labelled as ham (0).
# The row labels of the misclassified test rows are used to look the
# original messages up in sms.
is_false_negative = (y_test == 1) & (y_pred_class == 0)
false_negative_rows = check.loc[is_false_negative].index
sms.loc[false_negative_rows].message

3132    LookAtMe!: Thanks for your purchase of a video...
5       FreeMsg Hey there darling it's been 3 week's n...
3530    Xmas & New Years Eve tickets are now on sale f...
684     Hi I'm sue. I am 20 years old and work as a la...
1875    Would you like to see my XXX pics they are so ...
1893    CALL 09090900040 & LISTEN TO EXTREME DIRTY LIV...
4298    thesmszone.com lets you send free anonymous an...
4949    Hi this is Amy, we will be sending you a free ...
2821    INTERFLORA - It's not too late to order Inter...
2247    Hi ya babe x u 4goten bout me?' scammers getti...
4514    Money i have won wining number 946 wot do i do...
Name: message, dtype: object

In [65]:
# predict_proba returns one column per class; keep column 1, the predicted
# probability of class 1 (spam), for each test message.
class_probabilities = nb.predict_proba(X_test_dtm)
y_pred_prob = class_probabilities[:, 1]

In [66]:
# Predicted spam probability for each test message
y_pred_prob

array([2.87744864e-03, 1.83488846e-05, 2.07301295e-03, ...,
       1.09026171e-06, 1.00000000e+00, 3.98279868e-09])

In [67]:
# Area under the ROC curve, computed from the predicted spam probabilities
# rather than the hard class predictions
metrics.roc_auc_score(y_test, y_pred_prob)

0.9866431000536962

### Examining the model further

In [68]:
# Vocabulary learned by the vectorizer, as a plain list of token strings in
# column order of the document-term matrix.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() when it exists and fall back
# to the old method on older installations.
if hasattr(vect, 'get_feature_names_out'):
    # get_feature_names_out() returns an ndarray; convert it so later cells
    # still work with (and display) a list, as before.
    X_train_token = vect.get_feature_names_out().tolist()
else:
    X_train_token = vect.get_feature_names()
len(X_train_token)

7456

In [70]:
# Last 50 tokens of the vocabulary
X_train_token[-50:]

['yer',
 'yes',
 'yest',
 'yesterday',
 'yet',
 'yetunde',
 'yijue',
 'ym',
 'ymca',
 'yo',
 'yoga',
 'yogasana',
 'yor',
 'yorge',
 'you',
 'youdoing',
 'youi',
 'youphone',
 'your',
 'youre',
 'yourjob',
 'yours',
 'yourself',
 'youwanna',
 'yowifes',
 'yoyyooo',
 'yr',
 'yrs',
 'ything',
 'yummmm',
 'yummy',
 'yun',
 'yunny',
 'yuo',
 'yuou',
 'yup',
 'zac',
 'zaher',
 'zealand',
 'zebra',
 'zed',
 'zeros',
 'zhong',
 'zindgi',
 'zoe',
 'zoom',
 'zouk',
 'zyada',
 'èn',
 '〨ud']

In [74]:
# Per-class token counts: row 0 is ham, row 1 is spam, one column per token
nb.feature_count_ # 1st token appears 0 times in ham and 5 times in spam

array([[ 0.,  0.,  0., ...,  1.,  1.,  1.],
       [ 5., 23.,  2., ...,  0.,  0.,  0.]])

In [85]:
# Per-token frequency table: one row per vocabulary token (used as the
# index), with the number of occurrences in ham and in spam training messages.
ham_counts = nb.feature_count_[0, :]
spam_counts = nb.feature_count_[1, :]
tokens = pd.DataFrame(
    {'ham': ham_counts, 'spam': spam_counts},
    index=pd.Index(X_train_token, name='token'),
)

In [87]:
# Reproducible random sample of 5 tokens with their per-class counts
tokens.sample(5, random_state=6)

Unnamed: 0_level_0,ham,spam
token,Unnamed: 1_level_1,Unnamed: 2_level_1
very,64.0,2.0
nasty,1.0,1.0
villa,0.0,1.0
beloved,1.0,0.0
textoperator,0.0,2.0


In [88]:
# Number of training messages seen per class: [ham, spam]
nb.class_count_

array([3617.,  562.])

In [89]:
# Add 1 to every count so that no token has a zero count in either class
# (prevents division by zero).
# The smoothed columns are rebuilt from the model's raw counts instead of
# incrementing tokens in place, so re-running this cell cannot add 1 a
# second time.
tokens['ham'] = nb.feature_count_[0, :] + 1
tokens['spam'] = nb.feature_count_[1, :] + 1