In [1]:
import matplotlib as plt
import numpy as np
import pandas as pd
import nltk

pd.set_option('display.max_colwidth', -1)

# Initial file setup
training_df = pd.read_csv("amazon_review_full_csv.tar.gz", nrows=10000, compression='gzip', sep=',', quotechar='"')
training_df.columns = ["Rating", "Title", "Review"]
training_df.to_csv("training.csv")

testing_df = pd.read_csv("amazon_review_full_csv.tar.gz", skiprows=10000, nrows=1000, compression='gzip', sep=',', quotechar='"')
testing_df.columns = ["Rating", "Title", "Review"]
testing_df.to_csv("testing.csv")

In [2]:
# Read training data
training_df = pd.read_csv("training.csv")
del training_df["Unnamed: 0"]
#training_df.head(10)

In [3]:
# Read testing data
testing_df = pd.read_csv("testing.csv")
del testing_df["Unnamed: 0"]
#testing_df.head(10)

In [4]:
training_df["Review"] = training_df["Review"].apply(nltk.word_tokenize)

training_df.head(5)

Unnamed: 0,Rating,Title,Review
0,5,Inspiring,"[I, hope, a, lot, of, people, hear, this, cd, ., We, need, more, strong, and, positive, vibes, like, this, ., Great, vocals, ,, fresh, tunes, ,, cross-cultural, happiness, ., Her, blues, is, from, the, gut, ., The, pop, sounds, are, catchy, and, mature, .]"
1,5,The best soundtrack ever to anything.,"[I, 'm, reading, a, lot, of, reviews, saying, that, this, is, the, best, 'game, soundtrack, ', and, I, figured, that, I, 'd, write, a, review, to, disagree, a, bit, ., This, in, my, opinino, is, Yasunori, Mitsuda, 's, ultimate, masterpiece, ., The, music, is, timeless, and, I, 'm, been, listening, to, it, for, years, now, and, its, beauty, simply, refuses, to, fade.The, price, tag, on, this, is, pretty, staggering, I, must, say, ,, but, if, you, are, going, to, buy, any, cd, for, this, much, money, ,, this, is, the, only, one, that, I, feel, would, be, worth, every, penny, ...]"
2,4,Chrono Cross OST,"[The, music, of, Yasunori, Misuda, is, without, question, my, close, second, below, the, great, Nobuo, Uematsu.Chrono, Cross, OST, is, a, wonderful, creation, filled, with, rich, orchestra, and, synthesized, sounds, ., While, ambiance, is, one, of, the, music, 's, major, factors, ,, yet, at, times, it, 's, very, uplifting, and, vigorous, ., Some, of, my, favourite, tracks, include, ;, ``, Scars, Left, by, Time, ,, The, Girl, who, Stole, the, Stars, ,, and, Another, World, '', .]"
3,5,Too good to be true,"[Probably, the, greatest, soundtrack, in, history, !, Usually, it, 's, better, to, have, played, the, game, first, but, this, is, so, enjoyable, anyway, !, I, worked, so, hard, getting, this, soundtrack, and, after, spending, [, money, ], to, get, it, it, was, really, worth, every, penny, !, !, Get, this, OST, !, it, 's, amazing, !, The, first, few, tracks, will, have, you, dancing, around, with, delight, (, especially, Scars, Left, by, Time, ), !, !, BUY, IT, NOW, !, !]"
4,5,There's a reason for the price,"[There, 's, a, reason, this, CD, is, so, expensive, ,, even, the, version, that, 's, not, an, import.Some, of, the, best, music, ever, ., I, could, listen, to, every, track, every, minute, of, every, day, ., That, 's, about, all, i, can, say, .]"


In [5]:
MAX_WORDS = 2000
word_list = []
for tokenized_index in training_df["Review"]:
    for word in tokenized_index:
        word_list.append(word.lower())
    
word_distribution = nltk.FreqDist(word_list)
word_features = list(word_distribution.keys()) [:MAX_WORDS]

In [6]:
# Determine whether the top MAX_WORDS words are contained in a review 
def find_features(review):
    features = {}
    for word in word_features:
        features[word] = (word in review)
        
    return features

In [10]:
training_feature_sets = []
testing_feature_sets = []
i = 0 # Annoying iterator but unsure of better method
j = 0
for review in training_df["Review"]:
    training_feature_sets.append((find_features(review), training_df["Rating"].iloc[i]))
    i += 1
for review in testing_df["Review"]:
    testing_feature_sets.append((find_features(review), testing_df["Rating"].iloc[j]))
    j += 1

In [11]:
classifier = nltk.NaiveBayesClassifier.train(training_feature_sets)
print("TEST: {}".format(nltk.classify.accuracy(classifier, testing_feature_sets)))
classifier.show_most_informative_features(10)

TEST: 0.295
Most Informative Features
                   waste = True                1 : 4      =     41.0 : 1.0
                   worst = True                1 : 4      =     30.3 : 1.0
                  refund = True                1 : 4      =     26.3 : 1.0
                  poorly = True                1 : 5      =     24.5 : 1.0
                 garbage = True                1 : 3      =     23.1 : 1.0
                    junk = True                1 : 4      =     21.7 : 1.0
               perfectly = True                5 : 2      =     21.3 : 1.0
             predictable = True                2 : 5      =     20.6 : 1.0
                horrible = True                1 : 4      =     17.9 : 1.0
                terrible = True                1 : 4      =     17.7 : 1.0


In [53]:
user_review = input("Please enter a review:\n")
user_feature_set = []
user_review = user_review.lower()
user_feature_set.append((find_features(user_review), 3))

guess = classifier.classify(user_feature_set[0][0])
print("We guess that you rate the product {} out of 5 stars!".format(guess))

Please enter a review:
You will need a lot of time on your hands to read this book.  Exceptionally long, but worth it in the end.  Good work.
We guess that you rate the product 4 out of 5 stars!
