Use one of the following datasets to perform sentiment analysis on the given Amazon reviews. Pick one of the "small" datasets that is a reasonable size for your computer. The goal is to create a model to algorithmically predict if a review is positive or negative just based on its text. Try to see how these reviews compare across categories. Does a review classification model for one category work for another?

http://jmcauley.ucsd.edu/data/amazon/

In [19]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import seaborn as sns
import string


# data is binary so I'll use the Bernoulli classifier.
from sklearn.naive_bayes import BernoulliNB


In [40]:
# import json file
df = pd.read_json('Toys_and_Games_5.json',lines=True)
#df.head()

In [43]:

# drop unrelated fields (although summary might be useful - instructions asked for review text)
df_cleaned = df.drop(['asin','helpful','reviewTime','reviewerID','reviewerName','unixReviewTime','summary'],axis=1)

# clean up message
df_cleaned['review_cleaned'] = df_cleaned['reviewText'].apply(lambda x:''.join([i for i in x if i not in string.punctuation]))

#create a binary value for sentiment (4+ good (1), 3- negative (0))
print(df_cleaned.isnull().sum())
df_cleaned['sentiment'] =  np.where((df_cleaned['overall'] > 3), 1, 0)
#df_cleaned.head()




overall           0
reviewText        0
review_cleaned    0
dtype: int64


In [24]:
# load positive and negative word lexicon from http://mpqa.cs.pitt.edu/
df_positive_words = pd.read_csv('positive-words.txt', header = None)
df_positive_words.columns=['pos_words']
df_negative_words = pd.read_csv('negative-words2.txt', header = None, encoding = "ISO-8859-1")
df_negative_words.columns=['neg_words']

In [34]:

#DO NOT RERUN THIS CELL - LOAD THE FEATURES GENERATED AND SAVED AS A CSV

#create a series for negative words and for positive words using the text files 

keywords_positive = df_positive_words['pos_words']
keywords_negative = df_negative_words['neg_words']


#create a binary feature for the presence of positive words
data = pd.DataFrame()
for key in keywords_positive:
    # spaces around the key to get the word,not just pattern matching.
    data[str(key)] = df_cleaned.review_cleaned.str.contains(' ' + str(key) + ' ', case=False).astype(int)

for key in keywords_negative:
    # spaces around the key to get the word,not just pattern matching.
    data[str(key)] = df_cleaned.review_cleaned.str.contains(' ' + str(key) + ' ', case=False).astype(int)

data.head()


Unnamed: 0,a+,abound,abounds,abundance,abundant,accessable,accessible,acclaim,acclaimed,acclamation,...,wrongly,wrought,yawn,zap,zapped,zaps,zealot,zealous,zealously,zombie
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
# return most common sentiments used in training data

data = data.loc[:, (data.sum(axis=0) > 10)]
#data = data.drop(['a+'],axis=1)
sums = data.sum(axis=0)
sums.sort(ascending=False)
print(sums)

a+             126768
great           43430
like            41536
fun             37155
loves           33978
well            30560
love            30541
good            26847
easy            19504
nice            15434
recommend       14960
pretty          13953
cute            13778
enough          13530
loved           12017
better          11899
right           11549
work            10951
hard            10814
perfect         10408
durable         10093
best             8806
worth            8344
sturdy           8261
enjoy            8189
likes            8126
favorite         7687
happy            7128
top              6906
cool             6711
                ...  
fondly             11
hassles            11
scaly              11
radical            11
fudge              11
sceptical          11
scoff              11
sunken             11
tediously          11
quaint             11
witty              11
hopeless           11
fluent             11
ardent             11
mindlessly



In [36]:
#DO NOT RERUN THIS CELL - LOAD THE FEATURES GENERATED AND SAVED AS A CSV

# save the features dataset to a csv to use directly

data.to_csv("amazon_features.csv")

In [65]:
# read features csv
df_features = pd.read_csv("amazon_features.csv")
df_features.head()


Unnamed: 0.1,Unnamed: 0,a+,abundance,abundant,accessible,accomplish,accomplished,accomplishment,accomplishments,accurate,...,wreak,wreck,wrestle,wrinkle,wrinkled,wrinkles,wrong,wrongly,yawn,zombie
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [80]:
#drop a+ from feature set
#df_features = df_features.drop(['a+','Unnamed: 0'],axis=1)


# create a new field for good, bad reviews (4+ good (1), 3- negative (0))
target = df_cleaned['sentiment']

In [81]:

# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

# Fit our model to the data.
bnb.fit(df_features, target)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(df_features)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    df_features.shape[0],
    (target != y_pred).sum()
))

Number of mislabeled points out of a total 167597 points : 26134


In [82]:
table = pd.crosstab(target,y_pred)
print('Prediction Confusion Matrix:')

print(table)

print('\n Percentage accuracy')
print((table.iloc[0,0] + table.iloc[1,1]) / (table.sum().sum()))

Prediction Confusion Matrix:
col_0         0       1
sentiment              
0          8608   18754
1          7380  132855

 Percentage accuracy
0.844066421237


In [47]:
# Try model on a different dataset from amazon
# import json file
df_test = pd.read_json('Tools_and_Home_Improvement_5.json',lines=True)


In [51]:
#  drop unrelated fields (although summary might be useful - instructions asked for review text)
df_test_cleaned = df_test.drop(['asin','helpful','reviewTime','reviewerID','reviewerName','unixReviewTime','summary'],axis=1)

# clean up message
df_test_cleaned['review_cleaned'] = df_test_cleaned['reviewText'].apply(lambda x:''.join([i for i in x if i not in string.punctuation]))

#create a binary value for sentiment (4+ good (1), 3- negative (0))
print(df_test_cleaned.isnull().sum())
df_test_cleaned['sentiment'] =  np.where((df_test_cleaned['overall'] > 3), 1, 0)
df_test_cleaned = df_test_cleaned.drop('reviewText',1)
df_test_cleaned.head()

overall           0
reviewText        0
review_cleaned    0
dtype: int64


Unnamed: 0,overall,review_cleaned,sentiment
0,5,I hate it when my shirt collars not otherwise ...,1
1,5,These little magnets are really powerful for t...,1
2,5,I wanted something this small to mount on the ...,1
3,5,I use these to magnetize my Warhammer 40K mini...,1
4,5,They are soo freaking annoying Why You spend ...,1


In [68]:
#DO NOT RERUN THIS CELL - LOAD THE FEATURES GENERATED AND SAVED AS A CSV
#create a binary feature for the presence of positive words and negative words in test set
data_test = pd.DataFrame()
for key in keywords_positive:
    # spaces around the key to get the word,not just pattern matching.
    data_test[str(key)] = df_test_cleaned.review_cleaned.str.contains(' ' + str(key) + ' ', case=False).astype(int)

for key in keywords_negative:
    # spaces around the key to get the word,not just pattern matching.
    data_test[str(key)] = df_test_cleaned.review_cleaned.str.contains(' ' + str(key) + ' ', case=False).astype(int)


In [69]:
column_names = df_features.columns.tolist()
data_test_final_features = data_test.filter(items=column_names)
data_test_final_features.head()

Unnamed: 0,abundance,abundant,accessible,accomplish,accomplished,accomplishment,accomplishments,accurate,accurately,achievement,...,wreak,wreck,wrestle,wrinkle,wrinkled,wrinkles,wrong,wrongly,yawn,zombie
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [70]:
#DO NOT RERUN THIS CELL - LOAD THE FEATURES GENERATED AND SAVED AS A CSV

data_test_final_features.to_csv("amazon_test_features.csv")



In [71]:
# read features csv
data_test_final_features = pd.read_csv("amazon_test_features.csv")
data_test_final_features.head()



Unnamed: 0.1,Unnamed: 0,abundance,abundant,accessible,accomplish,accomplished,accomplishment,accomplishments,accurate,accurately,...,wreak,wreck,wrestle,wrinkle,wrinkled,wrinkles,wrong,wrongly,yawn,zombie
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [83]:
#drop a+ from feature set
#data_test_final_features = data_test_final_features.drop(['Unnamed: 0'],axis=1)


# create a new field for good, bad reviews (4+ good (1), 3- negative (0))
target2 = df_test_cleaned['sentiment']

In [84]:
# Classify, storing the result in a new variable.
y_pred_test = bnb.predict(data_test_final_features)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data_test_final_features.shape[0],
    (target2 != y_pred_test).sum()
))

Number of mislabeled points out of a total 134476 points : 28250


In [85]:
table2 = pd.crosstab(target2,y_pred_test)
print('Confusion Matrix:')

print(table2)

print('\n Percentage accuracy')
print((table2.iloc[0,0] + table2.iloc[1,1]) / (table2.sum().sum()))

Confusion Matrix:
col_0          0      1
sentiment              
0           7685  13189
1          15061  98541

 Percentage accuracy
0.789925339838


### Conclusion:
Prediction accuracy decreased by about 6% between the training and test datasets