In [1]:
import json
import csv
import pandas as pd
from stop_words import get_stop_words
import re
import string
from sklearn.model_selection import train_test_split

import warnings
warnings.simplefilter("ignore")

In [2]:
review_data_path = 'amazon_reviews_us_Books_v1_02.tsv'

reviews = pd.read_table(review_data_path, error_bad_lines=False, encoding='utf-8')

reviews.head()

b'Skipping line 1680001: expected 15 fields, saw 22\n'


Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,12076615,RQ58W7SMO911M,385730586,122662979,Sisterhood of the Traveling Pants (Book 1),Books,4.0,2.0,3.0,N,N,this book was a great learning novel!,this boook was a great one that you could lear...,2005-10-14
1,US,12703090,RF6IUKMGL8SF,811828964,56191234,The Bad Girl's Guide to Getting What You Want,Books,3.0,5.0,5.0,N,N,Fun Fluff,If you are looking for something to stimulate ...,2005-10-14
2,US,12257412,R1DOSHH6AI622S,1844161560,253182049,"Eisenhorn (A Warhammer 40,000 Omnibus)",Books,4.0,1.0,22.0,N,N,this isn't a review,never read it-a young relative idicated he lik...,2005-10-14
3,US,50732546,RATOTLA3OF70O,373836635,348672532,Colby Conspiracy (Colby Agency),Books,5.0,2.0,2.0,N,N,fine author on her A-game,Though she is honored to be Chicago Woman of t...,2005-10-14
4,US,51964897,R1TNWRKIVHVYOV,262181533,598678717,The Psychology of Proof: Deductive Reasoning i...,Books,4.0,0.0,2.0,N,N,Execellent cursor examination,Review based on a cursory examination by Unive...,2005-10-14


In [3]:
df = reviews.copy()[['star_rating', 'review_body']]

In [4]:
def partition(x):
    if x<3:
        return 'Negative'
    elif x>3:
        return 'Positive'
    return 'Neutral'

actualScore = df['star_rating']
positiveNegative = actualScore.map(partition)
df['ratings'] = positiveNegative

df.groupby('ratings').size()

ratings
Negative     404588
Neutral      249917
Positive    2450865
dtype: int64

In [5]:
"""
# Set up training and test sets by choosing random samples from classes
X_train, X_test, y_train, y_test = train_test_split(df['review_body'], df['ratings'], test_size=0.25, random_state=0)

df_test = pd.concat([X_test, y_test], axis=1)
df_test = df_test.iloc[:100000,:]
df_test.to_csv('test.csv', header=True, index=False, encoding='utf-8')
"""

"\n# Set up training and test sets by choosing random samples from classes\nX_train, X_test, y_train, y_test = train_test_split(df['review_body'], df['ratings'], test_size=0.25, random_state=0)\n\ndf_test = pd.concat([X_test, y_test], axis=1)\ndf_test = df_test.iloc[:100000,:]\ndf_test.to_csv('test.csv', header=True, index=False, encoding='utf-8')\n"

In [6]:
df = df.sort_values('ratings')

In [7]:
df = df.sort_values('ratings')

df.groupby('ratings').size()

ratings
Negative     404588
Neutral      249917
Positive    2450865
dtype: int64

In [8]:
# Define number of classes and number of tweets per class
n_class = 3

# Divide into number of classes
if n_class == 2:
    df_pos = df.copy()[df.ratings == 'Positive'][123977:128977]
    df_neg = df.copy()[df.ratings == 'Negative'][:5000]
    df_neu = pd.DataFrame()
    df = pd.concat([df_pos, df_neg], ignore_index=True).reset_index(drop=True)
elif n_class == 3:
    df_neg = df.copy()[df.ratings == 'Negative'][:5000]
    df_neu = df.copy()[df.ratings == 'Neutral'][404588:409588]
    df_pos = df.copy()[df.ratings == 'Positive'][654505:659505]
    df = pd.concat([df_pos, df_neg, df_neu], ignore_index=True).reset_index(drop=True)

# Define functions to process Tweet text and remove stop words
def ProReviews(review):
    review = ''.join(c for c in review if c not in string.punctuation)
    review = re.sub('((www\S+)|(http\S+))', 'urlsite', review)
    review = re.sub(r'\d+', 'contnum', review)
    review = re.sub(' +',' ', review)
    review = review.lower().strip()
    return review

def rmStopWords(review, stop_words):
    text = review.split()
    text = ' '.join(word for word in text if word not in stop_words)
    return text

In [9]:
# Get list of stop words
stop_words = get_stop_words('english')
stop_words = [''.join(c for c in s if c not in string.punctuation) for s in stop_words]
stop_words = [t.encode('utf-8') for t in stop_words]

# Preprocess all tweet data
pro_reviews = []
for review in df['review_body']:
    processed = ProReviews(review)
    pro_stopw = rmStopWords(processed, stop_words)
    pro_reviews.append(pro_stopw)

df['reviews'] = pro_reviews

In [10]:
# Set up training and test sets by choosing random samples from classes
X_train, X_test, y_train, y_test = train_test_split(df['reviews'], df['ratings'], test_size=0.33, random_state=1)

df_train = pd.DataFrame()
df_test = pd.DataFrame()

df_train['reviews'] = X_train
df_train['ratings'] = y_train
df_train = df_train.reset_index(drop=True)

df_test['reviews'] = X_test
df_test['ratings'] = y_test
df_test = df_test.reset_index(drop=True)


df_test = pd.read_csv('test.csv')
df_test = df_test.reset_index(drop=True)

"\ndf_test = pd.read_csv('test.csv')\ndf_test = df_test.reset_index(drop=True)\ndf_test = df_test.iloc[2000:,:]\n"

In [11]:
from collections import Counter
import numpy as np


# Start training (input training set df_train)
class AmazonNBClassifier(object):

    def __init__(self, df_train):
        self.df_train = df_train
        self.df_pos = df_train.copy()[df_train.ratings == 'Positive']
        self.df_neg = df_train.copy()[df_train.ratings == 'Negative']
        self.df_neu = df_train.copy()[df_train.ratings == 'Neutral']

    def fit(self):
        Pr_pos = df_pos.shape[0]/self.df_train.shape[0]
        Pr_neg = df_neg.shape[0]/self.df_train.shape[0]
        Pr_neu = df_neu.shape[0]/self.df_train.shape[0]
        self.Prior  = (Pr_pos, Pr_neg, Pr_neu)

        self.pos_words = ' '.join(self.df_pos['reviews'].tolist()).split()
        self.neg_words = ' '.join(self.df_neg['reviews'].tolist()).split()
        self.neu_words = ' '.join(self.df_neu['reviews'].tolist()).split()

        all_words = ' '.join(self.df_train['reviews'].tolist()).split()

        self.vocab = len(Counter(all_words))

        wc_pos = len(' '.join(self.df_pos['reviews'].tolist()).split())
        wc_neg = len(' '.join(self.df_neg['reviews'].tolist()).split())
        wc_neu = len(' '.join(self.df_neu['reviews'].tolist()).split())
        self.word_count = (wc_pos, wc_neg, wc_neu)
        return self


    def predict(self, df_test):
        class_choice = ['Positive', 'Negative', 'Neutral']

        classification = []
        for review in df_test['reviews']:
            text = review.split()

            val_pos = np.array([])
            val_neg = np.array([])
            val_neu = np.array([])
            for word in text:
                tmp_pos = np.log((self.pos_words.count(word)+1)/(self.word_count[0]+self.vocab))
                tmp_neg = np.log((self.neg_words.count(word)+1)/(self.word_count[1]+self.vocab))
                tmp_neu = np.log((self.neu_words.count(word)+1)/(self.word_count[2]+self.vocab))
                val_pos = np.append(val_pos, tmp_pos)
                val_neg = np.append(val_neg, tmp_neg)
                val_neu = np.append(val_neu, tmp_neu)

            val_pos = np.log(self.Prior[0]) + np.sum(val_pos)
            val_neg = np.log(self.Prior[1]) + np.sum(val_neg)
            val_neu = np.log(self.Prior[2]) + np.sum(val_neu)

            probability = (val_pos, val_neg, val_neu)
            classification.append(class_choice[np.argmax(probability)])
        return classification


    def score(self, feature, target):

        compare = []
        for i in range(0,len(feature)):
            if feature[i] == target[i]:
                tmp ='correct'
                compare.append(tmp)
            else:
                tmp ='incorrect'
                compare.append(tmp)
        r = Counter(compare)
        accuracy = r['correct']/(r['correct']+r['incorrect'])
        return accuracy

In [12]:
rnb = AmazonNBClassifier(df_train)
rnb = rnb.fit()
print ('training complete')
predict = rnb.predict(df_test)
score = rnb.score(predict,df_test.ratings.tolist())
print(score)

training complete
0.9090909090909091
