In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

## Part 2: Reading SMS data

In [10]:
# location of the SMS dataset: defaults to the original author's local path;
# set the SMS_CSV_PATH environment variable to run the notebook on another machine
import os

SMS_CSV_PATH = os.environ.get(
    'SMS_CSV_PATH',
    '/Users/sb/DSP Material_04APRIL2019/4. ML2 (MBA-CF-TextMining)/14. Text Mining/sms case study/sms.csv',
)
sms = pd.read_csv(SMS_CSV_PATH)

In [12]:
# only the final expression of a cell is displayed, so print the shape explicitly
print(sms.shape)
# preview the first 20 rows
sms.head(20)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [13]:
# class balance: number of messages per label
sms['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [14]:
# convert label to a numeric variable (ham -> 0, spam -> 1)
label_codes = {'ham': 0, 'spam': 1}
# values that are already 0/1 pass through unchanged, so re-running this cell
# does not wipe the column to NaN the way a plain dict .map() would;
# anything else that is not a known label still becomes NaN, as before
sms['label'] = sms.label.map(
    lambda value: value if value in (0, 1) else label_codes.get(value, float('nan'))
)

In [15]:
# define X (raw message text) and y (numeric label)
X, y = sms['message'], sms['label']

In [16]:
# split into training and testing sets (fixed seed so the split is repeatable)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
# one shape per line, in the order: X_train, X_test, y_test, y_train
print(X_train.shape, X_test.shape, y_test.shape, y_train.shape, sep='\n')

(4179,)
(1393,)
(1393,)
(4179,)


## Part 3: Vectorizing SMS data

In [126]:
# instantiate the vectorizer
vect = CountVectorizer(
    strip_accents='unicode',  # normalise accented characters
    stop_words='english',     # drop the built-in English stop-word list
    max_df=0.9,               # skip tokens found in more than 90% of documents
    min_df=0.001,             # skip tokens found in fewer than 0.1% of documents
)

In [127]:
# learn the training vocabulary, then encode the same messages as a
# document-term matrix (fit() returns the vectorizer, so the calls chain)
X_train_dtm = vect.fit(X_train).transform(X_train)
X_train_dtm

<4179x1279 sparse matrix of type '<class 'numpy.int64'>'
	with 23463 stored elements in Compressed Sparse Row format>

In [128]:
# alternative: combine fit and transform into a single step
# (refits vect on X_train and rebinds X_train_dtm; the displayed matrix has the
# same shape and stored-element count as the two-step version above)
X_train_dtm = vect.fit_transform(X_train)
X_train_dtm

<4179x1279 sparse matrix of type '<class 'numpy.int64'>'
	with 23463 stored elements in Compressed Sparse Row format>

In [129]:
# transform testing data (using fitted vocabulary) into a document-term matrix
# only transform() is called here, never fit(), so the test matrix gets the
# same token columns as the training matrix
X_test_dtm = vect.transform(X_test)
X_test_dtm

<1393x1279 sparse matrix of type '<class 'numpy.int64'>'
	with 7734 stored elements in Compressed Sparse Row format>

## Part 4: Examining the tokens and their counts

In [130]:
# store token names
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
# .tolist() turns the returned array into a plain list of Python strings so
# that slicing and printing behave exactly as with the old API.
if hasattr(vect, 'get_feature_names_out'):
    X_train_tokens = vect.get_feature_names_out().tolist()
else:
    X_train_tokens = vect.get_feature_names()

In [131]:
# first 50 tokens (the list is in sorted order, so these are the numeric strings)
print(X_train_tokens[:50])

['00', '000', '03', '04', '0800', '08000839402', '08000930705', '0870', '08707509020', '08712300220', '08712460324', '10', '100', '1000', '10am', '10p', '11', '11mths', '12', '12hrs', '1327', '150', '150p', '150pm', '150ppm', '16', '18', '1st', '20', '200', '2000', '2003', '20p', '21', '25', '250', '25p', '2day', '2lands', '2nd', '2nite', '30', '3030', '350', '36504', '3g', '40gb', '4th', '4u', '50']


In [132]:
# last 50 tokens (the tail end of the sorted vocabulary)
print(X_train_tokens[-50:])

['wins', 'wish', 'wishes', 'wishing', 'wit', 'wiv', 'wk', 'wkly', 'woke', 'won', 'wonder', 'wonderful', 'wondering', 'wont', 'word', 'words', 'work', 'working', 'works', 'world', 'worried', 'worries', 'worry', 'worse', 'worth', 'wot', 'wow', 'write', 'wrong', 'www', 'xmas', 'xx', 'xxx', 'xy', 'ya', 'yar', 'yay', 'yeah', 'year', 'years', 'yep', 'yes', 'yest', 'yesterday', 'ym', 'yo', 'yr', 'yrs', 'yup', 'zed']


In [133]:
# view X_train_dtm as a dense matrix
# (toarray() materialises every zero entry; acceptable for inspection at this
# size, but keep the sparse form for actual computation)
X_train_dtm.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [134]:
# count how many times EACH token appears across ALL messages in X_train_dtm
import numpy as np
# sum the columns on the sparse matrix itself instead of densifying it first:
# toarray() would allocate a full documents-by-tokens array just to add it up.
# the sparse column sum comes back 2-D with a single row, so flatten it to 1-D
X_train_counts = np.asarray(X_train_dtm.sum(axis=0)).ravel()
X_train_counts

array([ 5, 23,  6, ...,  6, 33,  6], dtype=int64)

In [135]:
# one total per token, so this should match the number of columns in X_train_dtm
np.shape(X_train_counts)

(1279,)

In [136]:
# pair every token with its total count in the training messages
pd.DataFrame(dict(token=X_train_tokens, count=X_train_counts))

Unnamed: 0,token,count
0,00,5
1,000,23
2,03,6
3,04,9
4,0800,10
5,08000839402,10
6,08000930705,11
7,0870,7
8,08707509020,6
9,08712300220,5


## Bonus: Calculating the "spamminess" of each token

In [137]:
# split the full data set by class: label 0 is ham, label 1 is spam
sms_ham = sms.loc[sms['label'] == 0]
sms_spam = sms.loc[sms['label'] == 1]

In [138]:
# learn the vocabulary of ALL messages and save it
# NOTE: this refits the same `vect` object that was fitted on X_train, so from
# here on its vocabulary is that of the full corpus, not of the training split
vect.fit(sms.message)
# get_feature_names() was removed in scikit-learn 1.2; fall back to it only on
# installations that predate get_feature_names_out()
if hasattr(vect, 'get_feature_names_out'):
    all_tokens = vect.get_feature_names_out().tolist()
else:
    all_tokens = vect.get_feature_names()

In [139]:
# encode the ham and spam messages separately with the full-corpus vocabulary
ham_dtm, spam_dtm = (
    vect.transform(subset.message) for subset in (sms_ham, sms_spam)
)

In [140]:
# count how many times EACH token appears across ALL ham messages
# (column sum taken on the sparse matrix, which avoids building a dense copy;
# the single-row 2-D result is flattened to a 1-D array)
ham_counts = np.asarray(ham_dtm.sum(axis=0)).ravel()

In [141]:
# count how many times EACH token appears across ALL spam messages
# (column sum taken on the sparse matrix, which avoids building a dense copy;
# the single-row 2-D result is flattened to a 1-D array)
spam_counts = np.asarray(spam_dtm.sum(axis=0)).ravel()

In [142]:
# one row per token, with its count in ham messages and in spam messages
token_counts = pd.DataFrame(dict(token=all_tokens, ham=ham_counts, spam=spam_counts))

In [143]:
# add one to ham and spam counts to avoid dividing by zero (in the step that follows)
# the smoothed columns are derived from the raw count arrays rather than from
# the columns themselves, so re-running this cell cannot add one a second time
token_counts['ham'] = ham_counts + 1
token_counts['spam'] = spam_counts + 1

In [144]:
# spam-to-ham ratio per token; the ascending sort lists the most ham-leaning tokens first
token_counts['spam_ratio'] = token_counts['spam'].div(token_counts['ham'])
token_counts.sort_values(by='spam_ratio')

Unnamed: 0,token,ham,spam,spam_ratio
506,gt,319,1,0.003135
709,lt,317,1,0.003155
695,lor,163,1,0.006135
307,da,151,1,0.006623
648,later,136,1,0.007353
125,ask,90,1,0.011111
1001,said,90,1,0.011111
357,doing,89,1,0.011236
105,amp,89,1,0.011236
780,morning,80,1,0.012500


## Part 5: Building a Naive Bayes model

We will use [Multinomial Naive Bayes](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html):

> The multinomial Naive Bayes classifier is suitable for classification with **discrete features** (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work.

In [145]:
# re-display the training document-term matrix that the model is fitted on
X_train_dtm

<4179x1279 sparse matrix of type '<class 'numpy.int64'>'
	with 23463 stored elements in Compressed Sparse Row format>

In [119]:
# train a Naive Bayes model using X_train_dtm
# NOTE(review): the recorded execution count of this cell is lower than that of
# the vectorizing cells, so the saved outputs may come from an earlier
# document-term matrix; restart the kernel and run all cells to refresh them
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X=X_train_dtm, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [120]:
# make class predictions for X_test_dtm
# (predict() gives hard class labels; probabilities are computed separately below)
y_pred_class = nb.predict(X_test_dtm)

In [121]:
# share of test messages whose predicted class equals the true class
from sklearn import metrics
print(metrics.accuracy_score(y_true=y_test, y_pred=y_pred_class))

0.9863603732950467


In [122]:
# confusion matrix (rows are true classes, columns are predicted classes)
print(metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_class))

[[1199    9]
 [  10  175]]


In [123]:
# predict (poorly calibrated) probabilities
# column 1 of predict_proba is taken as P(spam): presumably nb.classes_ is
# [0, 1] given the ham -> 0 / spam -> 1 encoding -- confirm with nb.classes_
y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]
y_pred_prob

array([5.76898683e-03, 2.39278079e-04, 7.88400275e-02, ...,
       1.69830646e-06, 9.99999999e-01, 3.50265628e-08])

In [124]:
# area under the ROC curve, scored on the predicted spam probabilities
print(metrics.roc_auc_score(y_true=y_test, y_score=y_pred_prob))

0.9929076427420799


In [147]:
# message text for the false positives: truly ham (0) but predicted spam (1)
X_test[(y_test == 0) & (y_pred_class == 1)]

4419                           When you get free, call me
1587    There are no other charges after transfer char...
2903    Bill, as in: Are there any letters for me. i’m...
694     Will purchase d stuff today and mail to you. D...
45                       No calls..messages..missed calls
3589    If you were/are free i can give. Otherwise nal...
2162    Is she replying. Has boye changed his phone nu...
3415                              No pic. Please re-send.
1988                     No calls..messages..missed calls
Name: message, dtype: object

In [171]:
# message text for the false negatives: truly spam (1) but predicted ham (0)
X_test[(y_test == 1) & (y_pred_class == 0)]

3316    FREE MESSAGE Activate your 500 FREE Text Messa...
1745    Someone has conacted our dating service and en...
1573    Ur cash-balance is currently 500 pounds - to m...
5567    This is the 2nd time we have tried 2 contact u...
2473    Final Chance! Claim ur £150 worth of discount ...
3807    URGENT! We are trying to contact you. Last wee...
1064    We have new local dates in your area - Lots of...
1687    todays vodafone numbers ending with 0089(my la...
4460    Welcome to UK-mobile-date this msg is FREE giv...
1777                    Call FREEPHONE 0800 542 0578 now!
4592    Well done ENGLAND! Get the official poly ringt...
2680    New Tones This week include: 1)McFly-All Ab..,...
1217    You have 1 new voicemail. Please call 08719181...
3766    Someone U know has asked our dating service 2 ...
259     We tried to contact you re your reply to our o...
5566    REMINDER FROM O2: To get 2.50 pounds free call...
763     Urgent Ur £500 guaranteed award is still uncla...
3780    Claim 

In [149]:
# what do you notice about the false negatives?
# (3316 is an index label from the original data set, not a position, so look it up with .loc)
X_test.loc[3316]

'FREE MESSAGE Activate your 500 FREE Text Messages by replying to this message with the word FREE For terms & conditions, visit www.07781482378.com'