In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from sklearn import preprocessing
import math
from matplotlib.mlab import PCA as mlabPCA
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [2]:
# Grab and process the raw data.
data_path = ("https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
             "master/sms_spam_collection/SMSSpamCollection"
            )
sms_raw = pd.read_csv(data_path, delimiter= '\t', header=None)
sms_raw.columns = ['spam', 'message']

# Enumerate our spammy keywords.
keywords = ['click', 'offer', 'winner', 'buy', 'free', 'cash', 'urgent']

for key in keywords:
    sms_raw[str(key)] = sms_raw.message.str.contains(
        ' ' + str(key) + ' ',
        case=False
)

sms_raw['allcaps'] = sms_raw.message.str.isupper()
sms_raw['spam'] = (sms_raw['spam'] == 'spam')
data = sms_raw[keywords + ['allcaps']]
target = sms_raw['spam']

from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()
y_pred = bnb.fit(data, target).predict(data)

In [3]:
sms_raw.head()

Unnamed: 0,spam,message,click,offer,winner,buy,free,cash,urgent,allcaps
0,False,"Go until jurong point, crazy.. Available only ...",False,False,False,False,False,False,False,False
1,False,Ok lar... Joking wif u oni...,False,False,False,False,False,False,False,False
2,True,Free entry in 2 a wkly comp to win FA Cup fina...,False,False,False,False,False,False,False,False
3,False,U dun say so early hor... U c already then say...,False,False,False,False,False,False,False,False
4,False,"Nah I don't think he goes to usf, he lives aro...",False,False,False,False,False,False,False,False


In [6]:
data_size = sms_raw.shape[0]
correctly_classified = (target == y_pred).sum()
incorrectly_classified = (target != y_pred).sum()

In [8]:
print('% Correct: ', correctly_classified/data_size)
print('% Incorrect: ', incorrectly_classified/data_size)

% Correct:  0.89160086145
% Incorrect:  0.10839913855


#### Confusion Matrix

In [10]:
confusion_matrix(target, y_pred)

array([[4770,   55],
       [ 549,  198]], dtype=int64)

Here the columns are prediction and the rows are actual.

So what do we learn?

We learn the majority of our error is coming from times where we failed to identify a spam message. 549 of our 604 errors are from failing to identify spam. So we need to get a little bit better at identifying spam messages.

But before we move on or iterate on the model, let's talk about some key terms that you may run into when thinking about this kind of matrix.

Let's assume our goal is to identify spam (rather than identify ham).

Firstly, when we talk about errors in a binary classifier (where there are only two outcomes) we're generally referring to two kinds of errors. A false positive is when we identify something as spam that is not. In this case we had 55 of these. This is sometimes also called a "Type I Error" or a "false alarm".

A false negative is therefore when we mistakenly identify something as not spam when it is. We had 549 of these. This is also called a "Type II Error" or a "miss".

This also brings us to a conversation of sensitivity vs specificity.

Sensitivity is the percentage of positives correctly identified, in our case 198/747 or 27%. This shows how good we are at catching positives, or how sensitive our model is to identifying positives.

Specificity is just the opposite, the percentage of negatives correctly identified, 4770/4825 or 99%.

Again this confirms that we're not great at identifying spam, though we do label ham quite accurately. You should get familiar with these terms as in the practicing world they will often be used with little explanation and you will be expected to understand them.

#### Drill:

It's worth calculating these with code so that you fully understand how these statistics work, so here is your task for the cell below. Manually generate (meaning don't use the SKLearn function) your own confusion matrix and print it along with the sensitivity and specificity.

In [15]:
data_size = sms_raw.shape[0]
correctly_classified = ((target == y_pred)&(target==True)).sum()
incorrectly_classified = ((target != y_pred)&(target==False)).sum()

In [14]:
target.unique()

array([False,  True], dtype=bool)

In [25]:
type_1error = ((target != y_pred)&(target==False)).sum()
type_2error = ((target != y_pred)&(target==True)).sum()
sensitivity = ((target == y_pred)&(target==True)).sum()
specificity = ((target == y_pred)&(target==False)).sum()

In [23]:
type_1error

55

In [24]:
type_2error

549

In [26]:
sensitivity

198

In [27]:
specificity

4770

In [29]:
# how made confusion matrix
print(specificity,type_1error)
print(type_2error,sensitivity)

4770 55
549 198


#### In Sample evaluation and cross validation

In [31]:
# Use train_test_split to create the necessary training and test groups
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=20)
print('With 20% Holdout: ' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(bnb.fit(data, target).score(data, target)))

With 20% Holdout: 0.884304932735
Testing on Sample: 0.89160086145


In [33]:
cross_val_score(bnb, data, target, cv=10)

array([ 0.89784946,  0.89426523,  0.89426523,  0.890681  ,  0.89605735,
        0.89048474,  0.88150808,  0.89028777,  0.88489209,  0.89568345])

#### Challenge: Iterate and evaluate your classifier

It's time to revisit your classifier from the previous assignment. Using the evaluation techniques we've covered here, look at your classifier's performance in more detail. Then go back and iterate by engineering new features, removing poor features, or tuning parameters. Repeat this process until you have five different versions of your classifier. Once you've iterated, answer these questions to compare the performance of each:

Do any of your classifiers seem to overfit?

Which seem to perform the best? Why?

Which features seemed to be most impactful to performance?

Write up your iterations and answers to the above questions in a few pages. Submit a link below and go over it with your mentor to see if they have any other ideas on how you could improve your classifier's performance.

In [62]:
amazon_path = "https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/master/sentiment_labelled_sentences/amazon_cells_labelled.txt"
imdb_path ="https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/master/sentiment_labelled_sentences/imdb_labelled.txt"
yelp_path ="https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/master/sentiment_labelled_sentences/yelp_labelled.txt"

In [63]:
keywords = ['good', 'great', 'best', 'delicious', 'oustanding', 'bad', 'terrible', 'worst', 'never', 'broken', 'boring', 'deplorable']

In [64]:
amazon = pd.read_csv(amazon_path, delimiter= '\t', header=None)
amazon.columns = ["Message","Sentiment"]
for key in keywords:
    amazon[str(key)] = amazon.Message.str.contains(
        ' ' + str(key) + ' ',
        case=False
    )

In [65]:
data_amazon = amazon[keywords]
target_amazon = amazon['Sentiment']

In [66]:
bnb = BernoulliNB()

bnb.fit(data_amazon, target_amazon)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(data_amazon)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(data_amazon.shape[0],(target_amazon != y_pred).sum()))

Number of mislabeled points out of a total 1000 points : 429


In [67]:
# confusions matrix for amazon
confusion_matrix(target_amazon, y_pred)

array([[487,  13],
       [416,  84]], dtype=int64)

In [68]:
X_train_amazon, X_test_amazon, y_train_amazon, y_test_amazon = train_test_split(data_amazon, target_amazon, test_size=0.2, random_state=20)
print('With 20% Holdout: ' + str(bnb.fit(X_train_amazon, y_train_amazon).score(X_test_amazon, y_test_amazon)))
print('Testing on Sample: ' + str(bnb.fit(data_amazon, target_amazon).score(data_amazon, target_amazon)))

With 20% Holdout: 0.575
Testing on Sample: 0.571


Model doesn't seem to be overfitting, just not very good either

In [69]:
cross_val_score(bnb, data_amazon, target_amazon, cv=10)

array([ 0.56,  0.59,  0.59,  0.57,  0.57,  0.58,  0.56,  0.57,  0.59,  0.53])

Supports that the model isn't overfitting

In [70]:
keywords

['good',
 'great',
 'best',
 'delicious',
 'oustanding',
 'bad',
 'terrible',
 'worst',
 'never',
 'broken',
 'boring',
 'deplorable']

Five different keyword combinations

In [93]:
keywords_1 = ['good', 'great', 'best', 'oustanding', 'bad', 'terrible', 'worst', 'never', 'broken', 'boring']
keywords_2 = ['good', 'great', 'best', 'oustanding', 'amazing']
keywords_3 = ['bad', 'terrible', 'worst', 'never', 'broken', 'boring', 'deplorable']
keywords_4 = ['good','bad']
keywords_5 = ['best', 'worst']

In [72]:
keyword_lists = [keywords_1, keywords_2, keywords_3, keywords_4, keywords_5]

In [116]:
def keyword_analyzer(data_path, groups):
    data_input = pd.read_csv(data_path, delimiter= '\t', header=None)
    data_input.columns = ['Message','Sentiment']
    for group in groups:
        for word in group:
            data_input[str(word)] = data_input.Message.str.contains(' ' + str(word) + ' ',case=False)
        data = data_input[group]
        target = data_input['Sentiment']
        bnb = BernoulliNB()
        bnb.fit(data, target)
        y_pred = bnb.predict(data)
        print("Number of mislabeled points out of a total {} points : {}".format(data.shape[0],(target != y_pred).sum()))
        print("Confusion Matrix: ","\n", confusion_matrix(target, y_pred))
        X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=20)
        print('With 20% Holdout: ' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))
        print('Testing on Sample: ' + str(bnb.fit(data, target).score(data, target)))
        print("Cross Value Scores: ", cross_val_score(bnb, data, target, cv=10))
    return

In [117]:
keyword_analyzer(amazon_path, keyword_lists)

Number of mislabeled points out of a total 1000 points : 429
Confusion Matrix:  
 [[487  13]
 [416  84]]
With 20% Holdout: 0.575
Testing on Sample: 0.571
Cross Value Scores:  [ 0.56  0.59  0.59  0.57  0.57  0.58  0.56  0.57  0.59  0.53]
Number of mislabeled points out of a total 1000 points : 429
Confusion Matrix:  
 [[487  13]
 [416  84]]
With 20% Holdout: 0.575
Testing on Sample: 0.571
Cross Value Scores:  [ 0.56  0.59  0.59  0.57  0.57  0.58  0.56  0.57  0.59  0.53]
Number of mislabeled points out of a total 1000 points : 480
Confusion Matrix:  
 [[ 22 478]
 [  2 498]]
With 20% Holdout: 0.53
Testing on Sample: 0.52
Cross Value Scores:  [ 0.48  0.5   0.51  0.56  0.52  0.52  0.55  0.5   0.53  0.52]
Number of mislabeled points out of a total 1000 points : 474
Confusion Matrix:  
 [[488  12]
 [462  38]]
With 20% Holdout: 0.515
Testing on Sample: 0.526
Cross Value Scores:  [ 0.5   0.55  0.54  0.54  0.51  0.56  0.51  0.49  0.55  0.51]
Number of mislabeled points out of a total 1000 points

In [122]:
for i in keyword_lists:
    print(i)

['good', 'great', 'best', 'oustanding', 'bad', 'terrible', 'worst', 'never', 'broken', 'boring']
['good', 'great', 'best', 'oustanding', 'amazing']
['bad', 'terrible', 'worst', 'never', 'broken', 'boring', 'deplorable']
['good', 'bad']
['best', 'worst']


None seem to overfit, they just aren't particularly good. 

The ones with the positive words seemed to do the best.

The positive word features appear to have the most impact as the number of mislabels went up when they were removed.