# From here down to the markdown cell that reads: 'Unit 2.3.4' is an exact copy of Unit 2.2.7

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data_path = r'C:\Users\Toshiba P55w\sentiment labelled sentences\sentiment labelled sentences\yelp_labelled.txt'
yelp = pd.read_csv(data_path, delimiter='\t', header=None)
yelp.columns = ['Review', 'Positive']
yelp.head()

Unnamed: 0,Review,Positive
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


# Goal: build a naive bayes classifier that predicts whether a review is positive or negative

## Feature engineering overview:
I am going to identify words that are used a lot more in one type of review than the other. For each one of these words there will be a feature that shows whether the review contains the word. 

In [3]:
# turn every review into a list of words
yelp['Word_list'] = yelp['Review'].str.lower().str.replace(r'[^\w\s]', '',).str.strip().str.split()

yelp.head()

Unnamed: 0,Review,Positive,Word_list
0,Wow... Loved this place.,1,"[wow, loved, this, place]"
1,Crust is not good.,0,"[crust, is, not, good]"
2,Not tasty and the texture was just nasty.,0,"[not, tasty, and, the, texture, was, just, nasty]"
3,Stopped by during the late May bank holiday of...,1,"[stopped, by, during, the, late, may, bank, ho..."
4,The selection on the menu was great and so wer...,1,"[the, selection, on, the, menu, was, great, an..."


In [4]:
from functools import reduce
positive_words = reduce(lambda x,y: x+y ,yelp[yelp['Positive']==1]['Word_list'] )
negative_words = reduce(lambda x,y: x+y ,yelp[yelp['Positive']==0]['Word_list'] )

In [5]:
from collections import Counter

word_count_positive_dct = Counter(positive_words)

word_count_negative_dct = Counter(negative_words)

In [6]:
all_words = set(list(word_count_positive_dct.keys())+list(word_count_negative_dct.keys()))
word_counts = pd.DataFrame(list(all_words), columns=['Word'])

word_counts['Positive_count'] = word_counts['Word'].apply(lambda x: word_count_positive_dct.get(x, 0))
word_counts['Negative_count'] = word_counts['Word'].apply(lambda x: word_count_negative_dct.get(x, 0))
word_counts['Total_count'] = word_counts['Positive_count'] + word_counts['Negative_count']

# ratio of reviews word is in that are positive
word_counts['P_ratio'] = round(word_counts['Positive_count']/(word_counts['Positive_count'] + word_counts['Negative_count']),4)

# ratio of reviews word is in that are negative
word_counts['N_ratio'] = round(word_counts['Negative_count']/(word_counts['Positive_count'] + word_counts['Negative_count']),4)

word_counts['Sentiment'] = np.where(word_counts['Negative_count'] > word_counts['Positive_count'], 'Negative', 'Positive')

# use words that show up in one type of review at a minimum ratio and that show up in all review a minimum number of times
min_ratio = .8
min_total_count = 5
keywords_df = word_counts[((word_counts['P_ratio'] > min_ratio) | (word_counts['N_ratio'] > min_ratio)) &
                          (word_counts['Total_count'] > min_total_count)
                         ]

keywords_df.head()

Unnamed: 0,Word,Positive_count,Negative_count,Total_count,P_ratio,N_ratio,Sentiment
3,every,8,1,9,0.8889,0.1111,Positive
18,wasnt,0,13,13,0.0,1.0,Negative
49,either,0,6,6,0.0,1.0,Negative
50,loved,10,0,10,1.0,0.0,Positive
134,dont,3,25,28,0.1071,0.8929,Negative


In [7]:
# make feature for each keyword
keywords = list(keywords_df['Word'])

for key in keywords:    
    yelp[str(key)] = yelp['Word_list'].apply(lambda x: key in x)

In [8]:
data = yelp[keywords]
target = yelp['Positive']

# Our data is binary / boolean, so we're importing the Bernoulli classifier.
from sklearn.naive_bayes import BernoulliNB

# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

# Fit our model to the data.
bnb.fit(data, target)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(data)

# Display our results.
print("accuracy : {}".format(
    (target == y_pred).sum()/data.shape[0]
))

accuracy : 0.792


### Test classifier on one of the other datasets to see how well these kinds of classifiers translate from one context to another.

In [9]:
amzn_path = r'C:\Users\Toshiba P55w\sentiment labelled sentences\sentiment labelled sentences\amazon_cells_labelled.txt'

amzn = pd.read_csv(amzn_path, delimiter='\t', header=None)
amzn.columns = ['Review', 'Positive']
amzn.head()

Unnamed: 0,Review,Positive
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [10]:
amzn['Word_list'] = amzn['Review'].str.lower().str.replace(r'[^\w\s]', '',).str.strip().str.split()
amzn.head()

Unnamed: 0,Review,Positive,Word_list
0,So there is no way for me to plug it in here i...,0,"[so, there, is, no, way, for, me, to, plug, it..."
1,"Good case, Excellent value.",1,"[good, case, excellent, value]"
2,Great for the jawbone.,1,"[great, for, the, jawbone]"
3,Tied to charger for conversations lasting more...,0,"[tied, to, charger, for, conversations, lastin..."
4,The mic is great.,1,"[the, mic, is, great]"


In [11]:
for key in keywords:
    amzn[str(key)] = amzn['Word_list'].apply(lambda x: key in x)

In [12]:
amzn_data = amzn[keywords]
amzn_target = amzn['Positive']

amzn_y_pred = bnb.predict(amzn_data)

print("accuracy : {}".format(
    (amzn_target == amzn_y_pred).sum()/data.shape[0]
))


accuracy : 0.686


# Unit 2.3.4 - Iterate and evaluate your classifier

It's time to revisit your classifier from the previous assignment. Using the evaluation techniques we've covered here, look at your classifier's performance in more detail. Then go back and iterate by engineering new features, removing poor features, or tuning parameters. Repeat this process until you have five different versions of your classifier. Once you've iterated, answer these questions to compare the performance of each:

Do any of your classifiers seem to overfit?
Which seem to perform the best? Why?
Which features seemed to be most impactful to performance?
Write up your iterations and answers to the above questions in a few pages. Submit a link below and go over it with your mentor to see if they have any other ideas on how you could improve your classifier's performance.

look at classifiers performance in more detail:

- cross validate
- sensitivity (true positive rate)
- specificity (true negative rate)
- precision  (positive predictive value)
- negative predictive value

In [13]:
from sklearn.model_selection import cross_val_score
cross_val_score(bnb, data, target, cv=10)

array([ 0.81,  0.76,  0.82,  0.75,  0.76,  0.77,  0.75,  0.83,  0.76,  0.83])

In [14]:
# make a binary confusion matrix class with all the metrics we want
from sklearn.metrics import confusion_matrix

class Binary_confusion_matrix():
    
    def __init__(self, target, y_pred):
        self.target = target
        self.y_pred = y_pred
        self.cm = confusion_matrix(target, y_pred, labels=[1, 0])
        self.df = pd.DataFrame(self.cm, columns=['pred_true','pred_false'], index=['actual_true', 'actual_false'])
        self.tp, self.fn, self.fp, self.tn = self.cm.ravel()
        self.sensitivity = self.tp/(self.tp+self.fn)
        self.specificity = self.tn/(self.tn+self.fp)
        self.precision = self.tp/(self.tp+self.fp)
        self.neg_pred_val = self.tn/(self.tn+self.fn)
        self.accuracy = (self.tp+self.tn)/(self.tp+self.tn+self.fp+self.fn)
    
    def display_metrics(self):
        metrics_string = 'Accuracy = {}\nSensitivity = {}\nSpecificity = {}\nPrecision = {}\nnegative predictive value = {}\n'.format(self.accuracy, self.sensitivity, self.specificity, self.precision, self.neg_pred_val)
        print(metrics_string)
        



In [15]:
# run instantiate binary confusion metrics with target and prediction data.
cm = Binary_confusion_matrix(target, y_pred)

# look at matrix and metrics
cm.display_metrics()
cm.df

Accuracy = 0.792
Sensitivity = 0.934
Specificity = 0.65
Precision = 0.7274143302180686
negative predictive value = 0.9078212290502793



Unnamed: 0,pred_true,pred_false
actual_true,467,33
actual_false,175,325


We can see from the confusion matrix that there are many more false positives than false negatives. Let's explore why this is.

In [16]:
# add prediction result to dataframe
yelp['predicted'] = y_pred

# count how many keywords each review has
yelp['kw_count'] = yelp[keywords].sum(axis=1)

yelp.head()

Unnamed: 0,Review,Positive,Word_list,every,wasnt,either,loved,dont,awesome,before,...,twice,bland,impressed,getting,attentive,family,atmosphere,beer,predicted,kw_count
0,Wow... Loved this place.,1,"[wow, loved, this, place]",False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,1,1
1,Crust is not good.,0,"[crust, is, not, good]",False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,1
2,Not tasty and the texture was just nasty.,0,"[not, tasty, and, the, texture, was, just, nasty]",False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0,1
3,Stopped by during the late May bank holiday of...,1,"[stopped, by, during, the, late, may, bank, ho...",False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,1,1
4,The selection on the menu was great and so wer...,1,"[the, selection, on, the, menu, was, great, an...",False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,1,2


In [17]:
false_positives = yelp[(yelp['Positive']==0) & yelp['predicted'] == 1]

false_positives['kw_count'].value_counts()

0    168
1      5
2      2
Name: kw_count, dtype: int64

In [18]:
zero_kw = yelp[yelp.kw_count==0]

zero_kw.predicted.value_counts()

1    376
Name: predicted, dtype: int64

In [19]:
keywords_df.Sentiment.value_counts()

Negative    42
Positive    26
Name: Sentiment, dtype: int64

Almost all of our false positives contain 0 keywords in their review... and all of our reviews with no keywords were predicted to be positive. This is likely because there is an imbalance of negative and positive keywords. Lets add more postive keywords to our features

In [20]:
# lets make an equal number of positive and negative keywords

# get the top 42 positive keywords (with total occurences > 5)
top_42_pos = word_counts[word_counts['Total_count'] > 5].sort_values(by='P_ratio', ascending=False)[:42]

# add them to the keywords list
keywords += list(top_42_pos['Word'])

# get rid of redundancies
keywords = list(set(keywords))

# make each keyword a feature
for key in keywords:
    yelp[str(key)] = yelp['Word_list'].apply(lambda x: key in x)
    

In [21]:
#refit our model

data = yelp[keywords]
target = yelp['Positive']

# Our data is binary / boolean, so we're importing the Bernoulli classifier.
from sklearn.naive_bayes import BernoulliNB

# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

# Fit our model to the data.
bnb.fit(data, target)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(data)

# Display our results.
print("accuracy : {}".format(
    (target == y_pred).sum()/data.shape[0]
))

accuracy : 0.824


In [22]:
cm = Binary_confusion_matrix(target, y_pred)
cm.display_metrics()
cm.df

Accuracy = 0.824
Sensitivity = 0.684
Specificity = 0.964
Precision = 0.95
negative predictive value = 0.753125



Unnamed: 0,pred_true,pred_false
actual_true,342,158
actual_false,18,482


The model is more accurate overall, but now we have the opposite problem as before. There are many more false negatives than false positives. We could just keep adding key words, but then our model would eventually be overfit (if it isn't already). A possible next step could be to look for other features besides keywords that could help classify sentiment (length of review, use of certain punctuation or capitalization...)