In [160]:
import pandas as pd
import numpy as np
import sklearn 
import matplotlib.pyplot as plt
import seaborn as sns
import re

%matplotlib inline

In [193]:
#MODEL 1: ORIGINAL LIST (ALREADY ITERATED BASED ON EXAMINATION OF IMDB RATINGS FILE, TRIAL & ERROR)
imdb = pd.read_csv('C:\\Users\\ryan\\Desktop\\Thinkful DS Sample Data - Main Course\\Unit 2\\sentiment labelled sentences\\imdb_labelled.csv', delimiter = '\t', header = None)

imdb.columns = ['review', 'positive']

keywords = ['terrible', 'awful', 'worst', 'bad', 'stupid', 'poor', 'worse', 'attempt', 'crap', 'fail', 'annoying', 'cheap',
           'painful', 'avoid', 'slow', 'pretentious', 'problem', 'embarrassing', 'bored', 'horrible', 'lousy', 'unfortunate', 
           'boring', 'sucks', 'sucked', 'waste', 'unbear', ' mess ', 'wasting', 'mediocre', 'sloppy',
           'disappoint', 'garbage', 'whine', 'whiny', 'plot', 'hate ', 'hated', 'negative', 'nobody', 'flaw',
           'script', 'insult', 'do not', 'torture', ' lack', 'lame', 'ridiculous', 'not', 'unbelievable', 'skip', 'shame', 
           'not even', 'miss', 'excellent', 'amazing', 'love', 'incredible', 'fantastic', 'terrific', 'best', 'great', 'fun',
           'beautiful', 'well done', 'enjoy', 'perfect', 'smart', 'highly', 'impress', 'well']

#removed the required space before/after the keyword to improve model accuracy (many sentences in IMDB dataset began with
#these words, so no space in front)
for key in keywords:
    imdb[str(key)] = imdb.review.str.contains(str(key), case = False)

imdb['positive'] = (imdb['positive'] == 1)
    
data = imdb[keywords]
target = imdb['positive']

# Our data is binary / boolean, so we're importing the Bernoulli classifier.
from sklearn.naive_bayes import BernoulliNB

# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

# Fit our model to the data.
bnb.fit(data, target)

pred = bnb.predict(data)

print('Out of {} predictions, {} were misclassified'.format(data.shape[0], (pred != target).sum()))

Out of 748 predictions, 150 were misclassified


In [194]:
#Test the accuracy, sensitivity, and specificity

from sklearn.metrics import confusion_matrix
c = confusion_matrix(target, pred)

#Accuracy
print('The accuracy of the model is: ', 1-((pred != target).sum()/data.shape[0]))

#Sensitivity
print('The sensitivity of the model is: {}'.format((c[1][1])/(c[1][1] + c[1][0])))

#Specificity
print('The specificity of the model is: {}'.format((c[0][0])/(c[0][0] + c[0][1])))

#print(c)

The accuracy of the model is:  0.799465240642
The sensitivity of the model is: 0.927461139896373
The specificity of the model is: 0.6629834254143646


In [195]:
#Now let's see how well the model accuracy stands up to cross-validation. 748 predictions, let's use 3 folds of 249, 249, 250 to cross-validate

fold1 = data.loc[:249, :]
keep1 = data.drop(data.index[:249])
fold2 = data.loc[250:498, :]
keep2 = data.drop(data.index[250:498])
fold3 = data.loc[499:, :]
keep3 = data.drop(data.index[499:])

targ_fold1 = target.loc[:249]
targ_keep1 = target.drop(target.index[:249])
targ_fold2 = target.loc[250:498]
targ_keep2 = target.drop(target.index[250:498])
targ_fold3 = target.loc[499:]
targ_keep3 = target.drop(target.index[499:])

bnb.fit(keep1, targ_keep1)
pred = bnb.predict(fold1)

#Accuracy
print('The accuracy of the model (fold 1) is: ', 1-((pred != targ_fold1).sum()/len(pred)))

bnb.fit(keep2, targ_keep2)
pred = bnb.predict(fold2)

#Accuracy
print('The accuracy of the model (fold 2) is: ', 1-((pred != targ_fold2).sum()/len(pred)))

bnb.fit(keep3, targ_keep3)
pred = bnb.predict(fold3)

#Accuracy
print('The accuracy of the model (fold 3) is: ', 1-((pred != targ_fold3).sum()/len(pred)))

The accuracy of the model (fold 1) is:  0.72
The accuracy of the model (fold 2) is:  0.706827309237
The accuracy of the model (fold 3) is:  0.799196787149


In [196]:
#Given the results of the cross-validation, there does appear to be some over-fitting for my initial model. 
#Accuracies ranged from 70.7% to 79.9% using 3 folds

In [197]:
#MODEL 2: Try to minimize false positives (minimize the number of reviews tagged as negative that are actually positive)
#In this instance, we don't care as much about accuracy as we do about categorizing a negative review incorrectly...

keywords = ['awful', 'worst', 'trash', 'painful', 'sloppy', 'pretentious', 'embarrassing', 'hate', 'torture', 'skip']

for key in keywords:
    imdb[str(key)] = imdb.review.str.contains(str(key), case = False)

imdb['positive'] = (imdb['positive'] == 1)
    
data = imdb[keywords]
target = imdb['positive']

# Our data is binary / boolean, so we're importing the Bernoulli classifier.
from sklearn.naive_bayes import BernoulliNB

# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

# Fit our model to the data.
bnb.fit(data, target)

pred = bnb.predict(data)
            
#Test the accuracy, sensitivity, and specificity

from sklearn.metrics import confusion_matrix
c = confusion_matrix(target, pred)

#Accuracy
print('The accuracy of the model is: ', 1-((pred != target).sum()/data.shape[0]))

#Sensitivity
print('The sensitivity of the model is: {}'.format((c[1][1])/(c[1][1] + c[1][0])))

#Specificity
print('The specificity of the model is: {}'.format((c[0][0])/(c[0][0] + c[0][1])))

#print(c)


The accuracy of the model is:  0.578877005348
The sensitivity of the model is: 1.0
The specificity of the model is: 0.1298342541436464


In [198]:
#MODEL 3: Try to maximize accuracy using positive sentiment wordlist from internet (words from http://ptrckprry.com/course/ssd/data/positive-words.txt):
df = pd.read_csv('C:\\Users\\ryan\\Desktop\\Thinkful DS Sample Data - Main Course\\Unit 2\\sentiment labelled sentences\\positive_word_list_from_internet.csv')

In [199]:
pos_list = df['positive_sentiment_list'].tolist()

In [200]:
imdb = pd.read_csv('C:\\Users\\ryan\\Desktop\\Thinkful DS Sample Data - Main Course\\Unit 2\\sentiment labelled sentences\\imdb_labelled.csv', delimiter = '\t', header = None)

imdb.columns = ['review', 'positive_review']

keywords = pos_list

for key in keywords:
    imdb[str(key)] = imdb.review.str.contains(str(key), case = False)

imdb['positive_review'] = (imdb['positive_review'] == 1)
    
data = imdb[keywords]
target = imdb['positive_review']

# Our data is binary / boolean, so we're importing the Bernoulli classifier.
from sklearn.naive_bayes import BernoulliNB

# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

# Fit our model to the data.
bnb.fit(data, target)

pred = bnb.predict(data)

from sklearn.metrics import confusion_matrix
c = confusion_matrix(target, pred)

#Accuracy
print('The accuracy of the model is: ', 1-((pred != target).sum()/data.shape[0]))

#Sensitivity
print('The sensitivity of the model is: {}'.format((c[1][1])/(c[1][1] + c[1][0])))

#Specificity
print('The specificity of the model is: {}'.format((c[0][0])/(c[0][0] + c[0][1])))

The accuracy of the model is:  0.780748663102
The sensitivity of the model is: 0.6321243523316062
The specificity of the model is: 0.9392265193370166


In [201]:
#Positive keyword list was slightly less accurate than my model. Sensitivity and specificity were lower and higher, respectively.
#What about it's cross-validation performance?

In [202]:
#Now let's see how well the model accuracy stands up to cross-validation. 748 predictions, let's use 3 folds of 249, 249, 250 to cross-validate

fold1 = data.loc[:249, :]
keep1 = data.drop(data.index[:249])
fold2 = data.loc[250:498, :]
keep2 = data.drop(data.index[250:498])
fold3 = data.loc[499:, :]
keep3 = data.drop(data.index[499:])

targ_fold1 = target.loc[:249]
targ_keep1 = target.drop(target.index[:249])
targ_fold2 = target.loc[250:498]
targ_keep2 = target.drop(target.index[250:498])
targ_fold3 = target.loc[499:]
targ_keep3 = target.drop(target.index[499:])

bnb.fit(keep1, targ_keep1)
pred = bnb.predict(fold1)

#Accuracy
print('The accuracy of the model (fold 1) is: ', 1-((pred != targ_fold1).sum()/len(pred)))

bnb.fit(keep2, targ_keep2)
pred = bnb.predict(fold2)

#Accuracy
print('The accuracy of the model (fold 2) is: ', 1-((pred != targ_fold2).sum()/len(pred)))

bnb.fit(keep3, targ_keep3)
pred = bnb.predict(fold3)

#Accuracy
print('The accuracy of the model (fold 3) is: ', 1-((pred != targ_fold3).sum()/len(pred)))

The accuracy of the model (fold 1) is:  0.524
The accuracy of the model (fold 2) is:  0.489959839357
The accuracy of the model (fold 3) is:  0.4859437751


In [203]:
#Model 3, with only positive keywords, suffered tremendously when performing cross-validation. The data is over-fitting greatly.

In [204]:
#Model 4: Try to maximize accuracy by using negative sentiment wordlist from internet (words from http://ptrckprry.com/course/ssd/data/negative-words.txt):
df2 = pd.read_csv('C:\\Users\\ryan\\Desktop\\Thinkful DS Sample Data - Main Course\\Unit 2\\sentiment labelled sentences\\negative_word_list_from_internet.csv')

In [205]:
neg_list = df2['negative_keywords_from_internet'].tolist()

In [207]:
imdb = pd.read_csv('C:\\Users\\ryan\\Desktop\\Thinkful DS Sample Data - Main Course\\Unit 2\\sentiment labelled sentences\\imdb_labelled.csv', delimiter = '\t', header = None)

imdb.columns = ['review', 'positive']

keywords = neg_list

## HAD TO REMOVE SPECIAL CHARACTERS (*, -, etc)
for key in keywords:
    imdb[str(key)] = imdb.review.str.contains(str(key), case = False)
    
imdb['positive'] = (imdb['positive'] == 1)

data = imdb[keywords]
target = imdb['positive']

# Our data is binary / boolean, so we're importing the Bernoulli classifier.
from sklearn.naive_bayes import BernoulliNB

# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

# Fit our model to the data.
bnb.fit(data, target)

pred = bnb.predict(data)

from sklearn.metrics import confusion_matrix
c = confusion_matrix(target, pred)

#Accuracy
print('The accuracy of the model is: ', 1-((pred != target).sum()/data.shape[0]))

#Sensitivity
print('The sensitivity of the model is: {}'.format((c[1][1])/(c[1][1] + c[1][0])))

#Specificity
print('The specificity of the model is: {}'.format((c[0][0])/(c[0][0] + c[0][1])))

The accuracy of the model is:  0.741978609626
The sensitivity of the model is: 0.9870466321243523
The specificity of the model is: 0.48066298342541436


In [208]:
#Now let's see how well the model accuracy stands up to cross-validation. 748 predictions, let's use 3 folds of 249, 249, 250 to cross-validate

fold1 = data.loc[:249, :]
keep1 = data.drop(data.index[:249])
fold2 = data.loc[250:498, :]
keep2 = data.drop(data.index[250:498])
fold3 = data.loc[499:, :]
keep3 = data.drop(data.index[499:])

targ_fold1 = target.loc[:249]
targ_keep1 = target.drop(target.index[:249])
targ_fold2 = target.loc[250:498]
targ_keep2 = target.drop(target.index[250:498])
targ_fold3 = target.loc[499:]
targ_keep3 = target.drop(target.index[499:])

bnb.fit(keep1, targ_keep1)
pred = bnb.predict(fold1)

#Accuracy
print('The accuracy of the model (fold 1) is: ', 1-((pred != targ_fold1).sum()/len(pred)))

bnb.fit(keep2, targ_keep2)
pred = bnb.predict(fold2)

#Accuracy
print('The accuracy of the model (fold 2) is: ', 1-((pred != targ_fold2).sum()/len(pred)))

bnb.fit(keep3, targ_keep3)
pred = bnb.predict(fold3)

#Accuracy
print('The accuracy of the model (fold 3) is: ', 1-((pred != targ_fold3).sum()/len(pred)))

The accuracy of the model (fold 1) is:  0.476
The accuracy of the model (fold 2) is:  0.465863453815
The accuracy of the model (fold 3) is:  0.369477911647


In [215]:
#Why do the positive and negative keyword lists suffer so much when performing cross-validation??

In [216]:
#MODEL 5: Positive & Negative sentiment lists combined (from internet, not my list)

posneg_list = pos_list + neg_list

In [220]:
imdb = pd.read_csv('C:\\Users\\ryan\\Desktop\\Thinkful DS Sample Data - Main Course\\Unit 2\\sentiment labelled sentences\\imdb_labelled.csv', delimiter = '\t', header = None)

imdb.columns = ['review', 'positive_review']

keywords = posneg_list

for key in keywords:
    imdb[str(key)] = imdb.review.str.contains(str(key), case = False)

imdb['positive_review'] = (imdb['positive_review'] == 1)
    
data = imdb[keywords]
target = imdb['positive_review']

# Our data is binary / boolean, so we're importing the Bernoulli classifier.
from sklearn.naive_bayes import BernoulliNB

# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

# Fit our model to the data.
bnb.fit(data, target)

pred = bnb.predict(data)

from sklearn.metrics import confusion_matrix
c = confusion_matrix(target, pred)

#Accuracy
print('The accuracy of the model is: ', 1-((pred != target).sum()/data.shape[0]))

#Sensitivity
print('The sensitivity of the model is: {}'.format((c[1][1])/(c[1][1] + c[1][0])))

#Specificity
print('The specificity of the model is: {}'.format((c[0][0])/(c[0][0] + c[0][1])))

The accuracy of the model is:  0.816844919786
The sensitivity of the model is: 0.9844559585492227
The specificity of the model is: 0.638121546961326


In [221]:
#The model performs best with both positive and negative keywords, and this model outperformed my original model in terms of 
#accuracy. Let's see how it does on cross-validation..

In [222]:
#Now let's see how well the model accuracy stands up to cross-validation. 748 predictions, let's use 3 folds of 249, 249, 250 to cross-validate

fold1 = data.loc[:249, :]
keep1 = data.drop(data.index[:249])
fold2 = data.loc[250:498, :]
keep2 = data.drop(data.index[250:498])
fold3 = data.loc[499:, :]
keep3 = data.drop(data.index[499:])

targ_fold1 = target.loc[:249]
targ_keep1 = target.drop(target.index[:249])
targ_fold2 = target.loc[250:498]
targ_keep2 = target.drop(target.index[250:498])
targ_fold3 = target.loc[499:]
targ_keep3 = target.drop(target.index[499:])

bnb.fit(keep1, targ_keep1)
pred = bnb.predict(fold1)

#Accuracy
print('The accuracy of the model (fold 1) is: ', 1-((pred != targ_fold1).sum()/len(pred)))

bnb.fit(keep2, targ_keep2)
pred = bnb.predict(fold2)

#Accuracy
print('The accuracy of the model (fold 2) is: ', 1-((pred != targ_fold2).sum()/len(pred)))

bnb.fit(keep3, targ_keep3)
pred = bnb.predict(fold3)

#Accuracy
print('The accuracy of the model (fold 3) is: ', 1-((pred != targ_fold3).sum()/len(pred)))

The accuracy of the model (fold 1) is:  0.46
The accuracy of the model (fold 2) is:  0.465863453815
The accuracy of the model (fold 3) is:  0.40562248996
