


Cross Validation & Naive Bayes Lab - SMS Spam Classification
===============
* orignally developed by Ankit Jain
* modified by Justin Breucop
* modified by Dylan Hercher

Data source: https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection

# Section 1)

## Cross Validation from Scratch

Let's build it the function together! The steps to cross validation are:
1. Randomly separate your training set into _k_ groups
2. For each group _k_:
>1. Train your model on the other groups
>2. Score your model using group _k_ as validation
>3. Save your score and move to your next group

3. Add your _k_ scores and divide by _k_ to get your average score

In [12]:
# Importing Packages 
import numpy as np
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import train_test_split, StratifiedKFold, KFold

In [2]:
model = KNeighborsClassifier(10)

df = pd.DataFrame.from_csv('../data/titanic-train.csv',index_col=None)[['Age','Pclass','SibSp','Survived']].dropna()
to_predict = "Survived"
features=['Age','Pclass','SibSp']
data = df[features]
label = df[to_predict]
folds=5

In [27]:
from sklearn.cross_validation import cross_val_score

# Results of using the built-in cross validation
# Note: Default is Kfold, but within sklearn.cross_validation
# there are many types of validation that can be used
cross_val_score(model, data, label, cv=StratifiedKFold(label, 5, shuffle=False)) ## set shuffle to false to randomize the split

array([ 0.64335664,  0.69230769,  0.6993007 ,  0.74125874,  0.67605634])

In [7]:
data.columns

Index([u'Age', u'Pclass', u'SibSp'], dtype='object')

In [8]:
label.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [10]:
data.ix[:,:]

Unnamed: 0,Age,Pclass,SibSp
0,22,3,1
1,38,1,1
2,26,3,0
3,35,1,1
4,35,3,0
6,54,1,0
7,2,3,3
8,27,3,0
9,14,2,1
10,4,3,1


Let's build in class!

In [25]:
#Function here:
def cross_validate_df(data,label,model,k):
    X = data.ix[:,:].values
    y = label
    score = []
    for n in range(k):
        X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size = 0.2)
        model = KNeighborsClassifier(5).fit(X_train, y_train)
        model.predict(X_test)
        model.score = score(X_test, y_test)
        score.append(score)
        
    return score 

# df1 = pd.DataFrame(n_neighbors, columns = ['n'])
# df1.Scores = scores

                     
# fig = plt.figure(figsize=(6,5))
# plt.title('KNN score as a funtion of number of neighbors')
# plt.ylim(0.2,1.1)                                      
# plt.plot(split_num, scores)  


"""Return Average score across k iterations
    Parameters
    -----------
    data : DataFrame with features to use in X
    label : Series with target y
    model : ML Model to use
    k : int number of iterations
    """
    # TODO: Design the cross validation function
    # it may be helpful to try writing the function with k=1 first
    
    

'Return Average score across k iterations\n    Parameters\n    -----------\n    data : DataFrame with features to use in X\n    label : Series with target y\n    model : ML Model to use\n    k : int number of iterations\n    '

In [None]:
def cross_validate_df(data,label,model,k):
    train_slices = []
    test_slices = []
    positions = data.index.values
    np.random.shuffle(positions)
    
    for n in range(k):
        X_train, X_test, y_train, y_test =  test_train_split(data, label, test_size = 1.0/k)
#         model = KNeighborsClassifier(5).fit(X_train, y_train)
#         model.predict(X_test)
#         model.score = score(X_test, y_test)
#         score.append(score)
       
    return None  

In [26]:
# Once Complete test your function here
cross_validate_df(data,label,model,5)

TypeError: 'list' object is not callable

In [27]:
zip([1,2,3],[4,5,6])

[(1, 4), (2, 5), (3, 6)]

In [None]:
## zip the train and the test

### Built-in Cross Validation
There is also a very simple cross validation function provided by sklearn

In [41]:
from sklearn.cross_validation import cross_val_score

# Results of using the built-in cross validation
# Note: Default is Kfold, but within sklearn.cross_validation
# there are many types of validation that can be used
cross_val_score(model, data, label, cv=5)

array([ 0.64335664,  0.69230769,  0.6993007 ,  0.74125874,  0.67605634])

In [40]:
cross_val_score(model, data, label, cv=3)

array([ 0.64016736,  0.70168067,  0.69198312])

In [None]:
np.mean(cross_val_score(model, data, label, cv=3))

# Section 2)
## Naive Bayes and SMS Spam Classification

In [28]:
import numpy as np
import pandas as pd

from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics


In [29]:
## READING IN THE DATA
df = pd.DataFrame.from_csv("../data/SMSSpamCollection.tsv",sep='\t',header=0,index_col=None)

In [30]:
# examine the data
df.head()

Unnamed: 0,label,msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [31]:
df[df.label=='spam'].head()

Unnamed: 0,label,msg
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
5,spam,FreeMsg Hey there darling it's been 3 week's n...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...
11,spam,"SIX chances to win CASH! From 100 to 20,000 po..."


In [32]:
df.label.value_counts()

ham     4825
spam     747
dtype: int64

In [33]:
df.msg.describe()

count                       5572
unique                      5169
top       Sorry, I'll call later
freq                          30
Name: msg, dtype: object

In [None]:
# Convert the label into a binary variable
# Remember the map function we learned before?
df['label'] = df.label.map({'ham': 0 , 'spam':1})

In [34]:
df.head()

Unnamed: 0,label,msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [35]:
# split into training and testing sets by calling sklearn lib
# by default, the data set is split into 0.75 (training) and 0.25 (testing)
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.msg, df.label, random_state=1)

In [36]:
print X_train.shape
print X_train

(4179,)
[ '4mths half price Orange line rental & latest camera phones 4 FREE. Had your phone 11mths+? Call MobilesDirect free on 08000938767 to update now! or2stoptxt T&Cs'
 'Did you stitch his trouser'
 'Hope you enjoyed your new content. text stop to 61610 to unsubscribe. help:08712400602450p Provided by tones2you.co.uk'
 ...,
 'CERI U REBEL! SWEET DREAMZ ME LITTLE BUDDY!! C YA 2MORO! WHO NEEDS BLOKES'
 'Text & meet someone sexy today. U can find a date or even flirt its up to U. Join 4 just 10p. REPLY with NAME & AGE eg Sam 25. 18 -msg recd@thirtyeight pence'
 'K k:) sms chat with me.']


In [37]:
X_test.shape

(1393,)

Now we need to convert the text into feature vectors which can be used for machine learning purposes.
We will use the scikit function of CountVectorizer to 'convert text into a matrix of token counts'

 http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

#### Lets try a simple example

In [38]:
from sklearn.feature_extraction.text import CountVectorizer

In [39]:
# start with a simple example
train_simple = ['call you tonight',
                'Call me a cab',
                'please call me... PLEASE!']

In [45]:
# learn the 'vocabulary' of the training data
vect = CountVectorizer(decode_error='ignore')
vect.fit(train_simple) ##fit removes "a"
vect.get_feature_names()

[u'cab', u'call', u'me', u'please', u'tonight', u'you']

In [41]:
# transform training data into a 'document-term matrix'
train_simple_dtm = vect.transform(train_simple)
train_simple_dtm.toarray()

array([[0, 1, 0, 0, 1, 1],
       [1, 1, 1, 0, 0, 0],
       [0, 1, 1, 2, 0, 0]])

In [42]:
# We can see how we've adjusted our data easily!
# examine the vocabulary and document-term matrix together
print train_simple
    
pd.DataFrame(train_simple_dtm.toarray(), columns=vect.get_feature_names())

['call you tonight', 'Call me a cab', 'please call me... PLEASE!']


Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,0,0,1,1
1,1,1,1,0,0,0
2,0,1,1,2,0,0


In [43]:
# transform testing data into a document-term matrix (using existing vocabulary)
test_simple = ["please don't call me"] ##don't isn;t recognized and taken into account..
test_simple_dtm = vect.transform(test_simple)
test_simple_dtm.toarray()

pd.DataFrame(test_simple_dtm.toarray(), columns=vect.get_feature_names())

Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,1,1,0,0


#### Question:  How does the above test_simple show how things can go wrong?

#### Exercise: Using the dataset below
   * Vectorize the text
   * Store the results in a DataFrame
   * Show word counts (hint: one dataframe describer can do this)
   * Transform the test text

In [47]:
train_exp = ['where is my taco?',
                'did I eat the taco',
                'I can easily eat my way through that whole box of tacos!',
                'I think way too much about tacos, huh',
                'taco, taco, taco!!!'                
               ]
test_exp = [
    'where did he go?', 'how long did the whole thing last', 'lets go eat one taco or multiple tacos'
]

In [48]:
vect = CountVectorizer(decode_error='ignore')
vect.fit(train_exp)
vect.get_feature_names()

[u'about',
 u'box',
 u'can',
 u'did',
 u'easily',
 u'eat',
 u'huh',
 u'is',
 u'much',
 u'my',
 u'of',
 u'taco',
 u'tacos',
 u'that',
 u'the',
 u'think',
 u'through',
 u'too',
 u'way',
 u'where',
 u'whole']

In [49]:
train_simple_dtm = vect.transform(train_exp)
train_simple_dtm.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1],
       [1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [52]:
print train_simple
    
train_simple_df = pd.DataFrame(train_simple_dtm.toarray(), columns=vect.get_feature_names())

['call you tonight', 'Call me a cab', 'please call me... PLEASE!']


In [55]:
train_simple_df

Unnamed: 0,about,box,can,did,easily,eat,huh,is,much,my,...,taco,tacos,that,the,think,through,too,way,where,whole
0,0,0,0,0,0,0,0,1,0,1,...,1,0,0,0,0,0,0,0,1,0
1,0,0,0,1,0,1,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
2,0,1,1,0,1,1,0,0,0,1,...,0,1,1,0,0,1,0,1,0,1
3,1,0,0,0,0,0,1,0,1,0,...,0,1,0,0,1,0,1,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,3,0,0,0,0,0,0,0,0,0


In [56]:
for col in train_simple_df.columns:
    print col+':',
    print train_simple_df[col].sum()
    

about: 1
box: 1
can: 1
did: 1
easily: 1
eat: 2
huh: 1
is: 1
much: 1
my: 2
of: 1
taco: 5
tacos: 2
that: 1
the: 1
think: 1
through: 1
too: 1
way: 2
where: 1
whole: 1


In [51]:
test_simple_dtm = vect.transform(test_exp)
test_simple_dtm.toarray()

pd.DataFrame(test_simple_dtm.toarray(), columns=vect.get_feature_names())

Unnamed: 0,about,box,can,did,easily,eat,huh,is,much,my,...,taco,tacos,that,the,think,through,too,way,where,whole
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
2,0,0,0,0,0,1,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0


## Vectorizing our SMS Dataset

In [86]:
# instantiate the vectorizer ( use variable name as vect)
vect = CountVectorizer(decode_error = 'ignore')
vect.fit(X_train)
vect.get_feature_names()

[u'00',
 u'000',
 u'008704050406',
 u'0121',
 u'01223585236',
 u'01223585334',
 u'0125698789',
 u'02',
 u'0207',
 u'02072069400',
 u'02073162414',
 u'02085076972',
 u'021',
 u'03',
 u'04',
 u'0430',
 u'05',
 u'050703',
 u'0578',
 u'06',
 u'07',
 u'07008009200',
 u'07090201529',
 u'07090298926',
 u'07123456789',
 u'07732584351',
 u'07734396839',
 u'07742676969',
 u'0776xxxxxxx',
 u'07781482378',
 u'07786200117',
 u'078',
 u'07801543489',
 u'07808',
 u'07808247860',
 u'07808726822',
 u'07815296484',
 u'07821230901',
 u'07880867867',
 u'0789xxxxxxx',
 u'07946746291',
 u'0796xxxxxx',
 u'07973788240',
 u'07xxxxxxxxx',
 u'08',
 u'0800',
 u'08000407165',
 u'08000776320',
 u'08000839402',
 u'08000930705',
 u'08000938767',
 u'08001950382',
 u'08002888812',
 u'08002986030',
 u'08002986906',
 u'08002988890',
 u'08006344447',
 u'0808',
 u'08081263000',
 u'08081560665',
 u'0825',
 u'083',
 u'0844',
 u'08448714184',
 u'0845',
 u'08450542832',
 u'08452810071',
 u'08452810073',
 u'08452810075over18',


In [58]:
# transform testing data into a document-term matrix: Use Variable name as test_dtm
train_dtm = vect.transform(X_train)
test_dtm = vect.transform(X_test)
print test_dtm

  (0, 1538)	1
  (0, 5189)	1
  (0, 6542)	1
  (0, 7405)	1
  (1, 1016)	1
  (1, 3050)	1
  (1, 4163)	1
  (1, 4238)	1
  (1, 4370)	1
  (1, 5200)	1
  (1, 6656)	1
  (1, 7407)	1
  (1, 7420)	1
  (2, 986)	1
  (2, 3244)	1
  (2, 7162)	1
  (3, 3237)	1
  (4, 887)	2
  (4, 1060)	1
  (4, 1595)	1
  (4, 2066)	1
  (4, 2833)	1
  (4, 3388)	1
  (4, 3623)	1
  (4, 3921)	1
  :	:
  (1391, 4373)	1
  (1391, 4413)	1
  (1391, 4441)	1
  (1391, 4743)	1
  (1391, 4778)	1
  (1391, 6017)	1
  (1391, 6057)	1
  (1391, 6829)	1
  (1391, 6904)	1
  (1391, 7012)	1
  (1391, 7120)	1
  (1391, 7230)	2
  (1391, 7239)	1
  (1391, 7287)	1
  (1391, 7357)	1
  (1392, 848)	1
  (1392, 2400)	1
  (1392, 2873)	1
  (1392, 3158)	1
  (1392, 4238)	1
  (1392, 4255)	2
  (1392, 4487)	1
  (1392, 4802)	1
  (1392, 5565)	1
  (1392, 7075)	1


In [59]:
# Get the length and names of the feature names
train_features = vect.get_feature_names()
len(train_features)

7456

In [60]:
train_features[:50]

[u'00',
 u'000',
 u'008704050406',
 u'0121',
 u'01223585236',
 u'01223585334',
 u'0125698789',
 u'02',
 u'0207',
 u'02072069400',
 u'02073162414',
 u'02085076972',
 u'021',
 u'03',
 u'04',
 u'0430',
 u'05',
 u'050703',
 u'0578',
 u'06',
 u'07',
 u'07008009200',
 u'07090201529',
 u'07090298926',
 u'07123456789',
 u'07732584351',
 u'07734396839',
 u'07742676969',
 u'0776xxxxxxx',
 u'07781482378',
 u'07786200117',
 u'078',
 u'07801543489',
 u'07808',
 u'07808247860',
 u'07808726822',
 u'07815296484',
 u'07821230901',
 u'07880867867',
 u'0789xxxxxxx',
 u'07946746291',
 u'0796xxxxxx',
 u'07973788240',
 u'07xxxxxxxxx',
 u'08',
 u'0800',
 u'08000407165',
 u'08000776320',
 u'08000839402',
 u'08000930705']

In [61]:
train_features[-50:]

[u'yer',
 u'yes',
 u'yest',
 u'yesterday',
 u'yet',
 u'yetunde',
 u'yijue',
 u'ym',
 u'ymca',
 u'yo',
 u'yoga',
 u'yogasana',
 u'yor',
 u'yorge',
 u'you',
 u'youdoing',
 u'youi',
 u'youphone',
 u'your',
 u'youre',
 u'yourjob',
 u'yours',
 u'yourself',
 u'youwanna',
 u'yowifes',
 u'yoyyooo',
 u'yr',
 u'yrs',
 u'ything',
 u'yummmm',
 u'yummy',
 u'yun',
 u'yunny',
 u'yuo',
 u'yuou',
 u'yup',
 u'zac',
 u'zaher',
 u'zealand',
 u'zebra',
 u'zed',
 u'zeros',
 u'zhong',
 u'zindgi',
 u'zoe',
 u'zoom',
 u'zouk',
 u'zyada',
 u'\xe8n',
 u'\u3028ud']

In [62]:
# convert train_dtm to a regular array
train_arr = train_dtm.toarray()
train_arr

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [63]:

# Revisit Numpy
arr = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])
print arr[0, 0]
print arr[1, 3]
print arr[0, :]
print arr[:, 0]
print np.sum(arr)
print np.sum(arr,axis = 0)
print np.sum(arr,axis = 1)




1
8
[1 2 3 4]
[1 5]
36
[ 6  8 10 12]
[10 26]


In [64]:
# exercise: calculate the number of tokens in the 0th message in train_arr
print np.sum(train_arr[0,:])

24


In [66]:
train_arr[0,:]

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
# exercise: count how many times the 0th token appears across ALL messages in train_arr
print np.sum(train_arr[:,0]) ## token means feature

In [65]:
train_arr[:,0]

array([0, 0, 0, ..., 0, 0, 0])

In [67]:
# exercise: count how many times EACH token appears across ALL messages in train_arr
print np.sum(train_arr, axis=0)

[ 5 23  2 ...,  1  1  1]


In [76]:
train_features_df = pd.DataFrame(train_features)
train_features_df

UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 1217: ordinal not in range(128)

In [73]:
# exercise: create a DataFrame of tokens with their counts.
new_df = pd.DataFrame(np.sum(train_arr, axis=0))
new_df.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7446,7447,7448,7449,7450,7451,7452,7453,7454,7455
0,5,23,2,1,1,2,1,4,3,1,...,6,1,1,2,2,1,1,1,1,1


### Let's build the model with Naive Bayes Now

http://scikit-learn.org/stable/modules/naive_bayes.html

In [77]:
# train a Naive Bayes model using train_dtm
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(train_dtm, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [78]:
# make predictions on test data using test_dtm
preds = nb.predict(test_dtm)
preds

array(['ham', 'ham', 'ham', ..., 'ham', 'spam', 'ham'], 
      dtype='|S4')

In [79]:
# compare predictions to true labels
from sklearn import metrics
print metrics.accuracy_score(y_test, preds)
print metrics.confusion_matrix(y_test, preds)

# confusion matrix: http://en.wikipedia.org/wiki/Confusion_matrix
# predicted not spam and want spam: 1203
# predicted spam and was spam: 174
# 5 and 11 are misses.

0.988513998564
[[1203    5]
 [  11  174]]


In [80]:
# exercise: show the message text for the false positives
X_test[(y_test == 0) & (preds == 1)]

array([], dtype=object)

In [85]:
# exercise: show the message text for the false negatives
X_test[y_test > preds]
# or
X_test[(y_test == 1) & (preds == 0)]

array([], dtype=object)

In [None]:
## USING ALL DATA AND CROSS-VALIDATION, run NB again


In [83]:
# Here is the condensed code needed for reference
df = pd.DataFrame.from_csv("../data/SMSSpamCollection.tsv",sep='\t',header=0,index_col=None)
X_train, X_test, y_train, y_test = train_test_split(df.msg, df.label, random_state=1)

vect = CountVectorizer(decode_error = 'ignore')

vect.fit(X_train)
vect.get_feature_names()

train_dtm = vect.transform(X_train)
test_dtm = vect.transform(X_test)

In [None]:
## EXERCISE: CALCULATE THE 'SPAMMINESS' OF EACH TOKEN

# create separate DataFrames for ham and spam ( df_ham and df_spam)


In [None]:
# learn the vocabulary of ALL messages and save it


In [None]:
# create document-term matrix of ham, then convert to a regular array


In [None]:
# create document-term matrix of spam, then convert to a regular array


In [None]:
# count how many times EACH token appears across ALL messages in ham_arr


In [None]:
# count how many times EACH token appears across ALL messages in spam_arr


In [None]:
# create a DataFrame of tokens with their separate ham and spam counts


In [None]:
# add one to ham counts and spam counts so that ratio calculations (below) make more sensse


In [None]:
# calculate ratio of spam-to-ham for each token


In [None]:
# advanced: implement your own naive bayes classifier
