In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns 

# Classifier from previous assignment: a Bernoulli Naive Bayes model trained on
# keyword-presence flags plus an "all caps" flag.

# Tab-separated file without a header row; the two columns are named right below.
amazon = pd.read_csv('amazon_cells_labelled.txt', sep='\t', header=None)
amazon.columns = ['review', 'score']

# NOTE(review): 'borring' looks like a typo for 'boring' -- confirm before changing,
# since the keyword doubles as the feature column name.
keywords = ['bad', 'littered', 'awful', 'but', 'poor', 'no', 'borring', 'cheap', 'dislike'
           , 'empty', 'hollow', 'waste', 'worst', "don't", "can't", 'hate', 'flaws', 'pathetic', 'atrocity', 'maybe']

# One boolean column per keyword. The keyword is padded with a space on each side,
# so it is only flagged when it appears surrounded by spaces (case-insensitive).
for key in keywords:
    amazon[str(key)] = amazon.review.str.contains(' ' + str(key) + ' ', case=False)

# True when every cased character of the review is upper case.
amazon['allcaps'] = amazon.review.str.isupper()

# Correlate only the label and the boolean features. The free-text 'review' column
# cannot be converted to float, and DataFrame.corr() raises on it in pandas >= 2.0
# (older versions silently dropped it), so it is removed explicitly.
sns.heatmap(amazon.drop(columns=['review']).corr(), cmap="YlGnBu")

from sklearn.naive_bayes import BernoulliNB

bnb = BernoulliNB()

data_A = amazon[keywords + ['allcaps']]
target_A = amazon['score']
bnb.fit(data_A, target_A)
# Predictions are made on the same rows the model was fitted on (no hold-out set).
y_pred_amazon = bnb.predict(data_A)

print('For Amazon reviews, Number of mislabelled points out of a total {} points : {}'.format(data_A.shape[0], (target_A != y_pred_amazon).sum()))



### Evaluation

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the baseline classifier: actual labels vs. predicted labels.
confusion_matrix(y_true=target_A, y_pred=y_pred_amazon)

In [None]:
# Derive every figure from the live confusion matrix instead of hand-typed numbers,
# so the report cannot drift from the fitted model.
# For a binary problem, ravel() on sklearn's matrix yields tn, fp, fn, tp in that order.
# Assumes 'score' is binary with the larger label meaning "positive" -- TODO confirm.
tn_A, fp_A, fn_A, tp_A = confusion_matrix(target_A, y_pred_amazon).ravel()
total_A = tn_A + fp_A + fn_A + tp_A

print('Accuracy =', (tp_A + tn_A) * 100 / total_A, '%')
print('Total:', data_A.shape[0])
print('True Negative =', tn_A)
print('True Positive =', tp_A)
print('False Negative =', fn_A)
print('False Positive =', fp_A)
# Sensitivity: share of actual positives that were predicted positive.
print('Sensitivity=', tp_A * 100 / (tp_A + fn_A), '%')
# Specificity: share of actual negatives that were predicted negative.
print('Specificity =', tn_A * 100 / (tn_A + fp_A), '%')

### Iterating five different versions of the classifier  

<b>Iteration 1. Tune keywords and use pattern matching</b> 

In [None]:
# Work on an independent copy. Plain assignment only creates a second name for the
# same DataFrame, so columns written for this iteration would overwrite the
# baseline features stored in `amazon`.
amazon_1 = amazon.copy()

In [None]:
# Tuned keyword list for iteration 1.
# NOTE(review): 'borring' looks like a typo for 'boring' -- confirm before changing.
keywords_1 = [
    'bad', 'awful', 'but', 'poor', 'no', 'borring', 'cheap', 'empty', 'hollow',
    'waste', 'worst', "can't", 'hate', 'flaws', 'pathetic', 'atrocity', 'maybe',
]

# Pattern (substring) matching: the keyword is searched for anywhere in the review,
# case-insensitively, without the surrounding-space padding.
for word in keywords_1:
    amazon_1[word] = amazon_1['review'].str.contains(word, case=False)

# True when every cased character of the review is upper case.
amazon_1['allcaps'] = amazon_1['review'].str.isupper()

In [None]:
# Drop the free-text 'review' column before correlating: it cannot be converted to
# float, and DataFrame.corr() raises on it in pandas >= 2.0.
sns.heatmap(amazon_1.drop(columns=['review']).corr(), cmap="YlGnBu")

In [None]:
# Train on the keyword list tuned for this iteration (keywords_1), not the
# baseline `keywords` list, so the tuning is what is actually evaluated.
data_1 = amazon_1[keywords_1 + ['allcaps']]
target_1 = amazon_1['score']
bnb.fit(data_1, target_1)
# Predictions are made on the same rows the model was fitted on (no hold-out set).
y_pred_amazon_1 = bnb.predict(data_1)

print('For Amazon reviews, Number of mislabelled points out of a total {} points : {}'.format(data_1.shape[0], (target_1 != y_pred_amazon_1).sum()))


In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for iteration 1: actual labels vs. predicted labels.
confusion_matrix(y_true=target_1, y_pred=y_pred_amazon_1)


In [None]:
# Derive every figure from the live confusion matrix instead of hand-typed numbers,
# so the report cannot drift from the fitted model.
# For a binary problem, ravel() on sklearn's matrix yields tn, fp, fn, tp in that order.
# Assumes 'score' is binary with the larger label meaning "positive" -- TODO confirm.
tn_1, fp_1, fn_1, tp_1 = confusion_matrix(target_1, y_pred_amazon_1).ravel()
total_1 = tn_1 + fp_1 + fn_1 + tp_1

print('Accuracy =', (tp_1 + tn_1) * 100 / total_1, '%')
print('Total:', data_1.shape[0])
print('True Negative =', tn_1)
print('True Positive =', tp_1)
print('False Negative =', fn_1)
print('False Positive =', fp_1)
# Sensitivity: share of actual positives that were predicted positive.
print('Sensitivity=', tp_1 * 100 / (tp_1 + fn_1), '%')
# Specificity: share of actual negatives that were predicted negative.
print('Specificity =', tn_1 * 100 / (tn_1 + fp_1), '%')

<b>Iteration 2. More keywords and tuning</b>

In [None]:
# Work on an independent copy. Plain assignment only creates a second name for the
# same DataFrame, so columns written for this iteration would overwrite the
# features already stored in `amazon`.
amazon_2 = amazon.copy()

In [None]:
# Extended and re-tuned keyword list for iteration 2.
keywords_2 = [
    'bad', 'waste', 'but', 'poor', 'none', 'short', 'cheap', 'ever', 'mistake',
    'hype', 'worst', 'cannot', 'hate', 'flaws', 'useless', 'lost',
    'only', 'money', 'trash',
]

# The keyword is padded with a space on each side, so it is only flagged when it
# appears surrounded by spaces (case-insensitive).
for word in keywords_2:
    padded = ' ' + word + ' '
    amazon_2[word] = amazon_2['review'].str.contains(padded, case=False)

# True when every cased character of the review is upper case.
amazon_2['allcaps'] = amazon_2['review'].str.isupper()

In [None]:
# Plot this iteration's frame (amazon_2), not iteration 1's. The free-text 'review'
# column is dropped first: it cannot be converted to float, and DataFrame.corr()
# raises on it in pandas >= 2.0.
sns.heatmap(amazon_2.drop(columns=['review']).corr(), cmap="YlGnBu")

In [None]:
# Train on the keyword list built for this iteration (keywords_2), not the
# baseline `keywords` list, so the new keywords are what is actually evaluated.
data_2 = amazon_2[keywords_2 + ['allcaps']]
target_2 = amazon_2['score']
bnb.fit(data_2, target_2)
# Predictions are made on the same rows the model was fitted on (no hold-out set).
y_pred_amazon_2 = bnb.predict(data_2)

print('For Amazon reviews, Number of mislabelled points out of a total {} points : {}'.format(data_2.shape[0], (target_2 != y_pred_amazon_2).sum()))


In [None]:
# Confusion matrix for iteration 2: actual labels vs. predicted labels.
confusion_matrix(y_true=target_2, y_pred=y_pred_amazon_2)

In [None]:
# Derive every figure from the live confusion matrix instead of hand-typed numbers,
# so the report cannot drift from the fitted model.
# For a binary problem, ravel() on sklearn's matrix yields tn, fp, fn, tp in that order.
# Assumes 'score' is binary with the larger label meaning "positive" -- TODO confirm.
tn_2, fp_2, fn_2, tp_2 = confusion_matrix(target_2, y_pred_amazon_2).ravel()
total_2 = tn_2 + fp_2 + fn_2 + tp_2

print('Accuracy =', (tp_2 + tn_2) * 100 / total_2, '%')
print('Total:', data_2.shape[0])
print('True Negative =', tn_2)
print('True Positive =', tp_2)
print('False Negative =', fn_2)
print('False Positive =', fp_2)
# Sensitivity: share of actual positives that were predicted positive.
print('Sensitivity=', tp_2 * 100 / (tp_2 + fn_2), '%')
# Specificity: share of actual negatives that were predicted negative.
print('Specificity =', tn_2 * 100 / (tn_2 + fp_2), '%')

<b>Iteration 3. Remove a feature</b> 

In [None]:
# Iteration 3: leave out the 'allcaps' feature and train on the columns named in the
# original `keywords` list only.
# NOTE(review): `amazon_3` is a second name for `amazon`, not a copy; nothing is
# written to it in this cell.
amazon_3 = amazon
target_3 = amazon_3['score']
data_3 = amazon_3[keywords]

bnb.fit(data_3, target_3)
y_pred_amazon_3 = bnb.predict(data_3)

# Count the rows whose predicted label differs from the actual label.
n_reviews_3 = data_3.shape[0]
n_mislabelled_3 = (target_3 != y_pred_amazon_3).sum()
print('For Amazon reviews, Number of mislabelled points out of a total {} points : {}'.format(n_reviews_3, n_mislabelled_3))


In [None]:
# Confusion matrix for iteration 3: actual labels vs. predicted labels.
confusion_matrix(y_true=target_3, y_pred=y_pred_amazon_3)

In [None]:
# Derive every figure from the live confusion matrix instead of hand-typed numbers,
# so the report cannot drift from the fitted model.
# For a binary problem, ravel() on sklearn's matrix yields tn, fp, fn, tp in that order.
# Assumes 'score' is binary with the larger label meaning "positive" -- TODO confirm.
tn_3, fp_3, fn_3, tp_3 = confusion_matrix(target_3, y_pred_amazon_3).ravel()
total_3 = tn_3 + fp_3 + fn_3 + tp_3

print('Accuracy =', (tp_3 + tn_3) * 100 / total_3, '%')
print('Total:', data_3.shape[0])
print('True Negative =', tn_3)
print('True Positive =', tp_3)
print('False Negative =', fn_3)
print('False Positive =', fp_3)
# Sensitivity: share of actual positives that were predicted positive.
print('Sensitivity=', tp_3 * 100 / (tp_3 + fn_3), '%')
# Specificity: share of actual negatives that were predicted negative.
print('Specificity =', tn_3 * 100 / (tn_3 + fp_3), '%')

The variables are independent, as shown in the heatmap, and hence the classifiers won't overfit. 

The first iteration of the classifier seems to perform best, with relatively high accuracy, sensitivity, and specificity. This is due to using pattern (substring) matching on the keywords instead of whole-word matching. Overall, the keywords feature is the most impactful. 