# Classifying Amazon Electronics Reviews

## Setup

### Load and Split Dataset

In [1]:
# from __future__ import division
import pandas as pd
import random
import keras
from keras.preprocessing.text import text_to_word_sequence
import json
import gzip
import csv
import numpy
from sklearn.model_selection import train_test_split

# Location of the gzipped reviews dump, read below as line-delimited JSON.
# NOTE(review): this is an absolute path on one machine; adjust it before running elsewhere.
REVIEWS_PATH = '/Users/sarah/Documents/Calvin/Senior/Spring/CS_344/Final_Project/reviews_Electronics_5.json.gz'
json_file = pd.read_json(REVIEWS_PATH, lines=True)

# Helper used to filter on helpfulness: turns the dataset's two-number list into a ratio
def perc_help(helpful):
    """Return the helpfulness ratio for one review.

    Args:
        helpful: two-element sequence [a, b], read as "a out of b"
            (e.g. [2, 3] means 2/3).

    Returns:
        helpful[0] / helpful[1], or 0 when the denominator is zero.
    """
    numerator, denominator = helpful[0], helpful[1]
    try:
        return numerator / denominator
    except ZeroDivisionError:
        # [0, 0] carries no votes at all; treat it as 0 instead of failing on 0/0
        return 0

# Create a new calculated column called perc_help to be used for filtering
json_file['perc_help'] = json_file.helpful.apply(perc_help)

# Keep only the reviews whose helpfulness ratio is above 0.5
# NOTE(review): condensed_dataset is only displayed here; the reviews tokenized and
# split below still come from the unfiltered json_file -- confirm this is intended.
condensed_dataset = json_file[json_file.perc_help > 0.5]

print(condensed_dataset.head())

# Extract the review texts only
elec_rev = []
rev_list = json_file['reviewText']

# Cap the vocabulary at 10,000 so every word index produced below fits the
# 10,000-wide vectors that the rest of the notebook builds from train_data/test_data.
tokenizer = keras.preprocessing.text.Tokenizer(num_words=10000)
tokenizer.fit_on_texts(rev_list)

# train_test_split already shuffles. The previous random.shuffle(rev_list) permuted the
# review column in place, which detached every review from the rating stored in its row.
# A fixed random_state makes the split reproducible across runs.
train_text, test_text = train_test_split(rev_list, test_size=0.20, random_state=42)

# Encode each review as a list of word indices. The original row index is kept so the
# matching ratings can be looked up by index.
train_data = pd.Series(tokenizer.texts_to_sequences(train_text), index=train_text.index)
test_data = pd.Series(tokenizer.texts_to_sequences(test_text), index=test_text.index)

# Positional access: after the split, row label 0 is not guaranteed to be in the training part
print(train_text.iloc[0])

Using TensorFlow backend.


         asin   helpful  overall  \
1  0528881469  [12, 15]        1   
2  0528881469  [43, 45]        3   
3  0528881469   [9, 10]        2   
5  0594451647    [3, 3]        5   
9  0594451647    [3, 3]        5   

                                          reviewText   reviewTime  \
1  I'm a professional OTR truck driver, and I bou...  11 25, 2010   
2  Well, what can I say.  I've had this unit in m...   09 9, 2010   
3  Not going to write a long review, even thought...  11 24, 2010   
5  I am using this with a Nook HD+. It works as d...   01 3, 2014   
9  This product really works great but I found th...  01 20, 2014   

       reviewerID                   reviewerName  \
1   AMO214LNFCEI4                Amazon Customer   
2  A3N7T0DY83Y4IG                  C. A. Freeman   
3  A1H8PY3QHMQQA0       Dave M. Shaw "mack dave"   
5  A2JXAZZI9PHK9Z  Billy G. Noland "Bill Noland"   
9  A3BY5KCNQZXV5U                        Matenai   

                                    summary  unixReview

### Load Labels

In [2]:
# Binary label per review: 1 when the 'overall' rating is above 2.0, otherwise 0
rate_list = json_file['overall']
labels = (rate_list > 2.0).astype(int)
label_list = labels.tolist()

# Select the labels through the row index of train_data/test_data. A second, independent
# train_test_split call draws a different random permutation, so its output would not
# line up with the reviews.
# NOTE(review): assumes train_data and test_data still carry json_file's row index and
# that the reviews have not been reordered relative to it -- confirm.
train_labels = labels.loc[train_data.index].tolist()
test_labels = labels.loc[test_data.index].tolist()

train_labels[0]

1

### Check that the largest word index stays below 10,000

In [None]:
# Largest index that occurs anywhere in the training data. Empty entries are skipped
# because max() raises ValueError on an empty sequence.
max(max(sequence) for sequence in train_data if len(sequence) > 0)

## Preparing Data

### Vectorize Data

In [None]:
import numpy as np

def vectorize_sequences(sequences, dimension=10000):
    """Multi-hot encode integer sequences into a dense 2-D matrix.

    Args:
        sequences: sized iterable of integer index sequences, one per sample.
        dimension: number of columns in the output (one column per index).

    Returns:
        A float numpy array of shape (len(sequences), dimension). Row i holds 1.0
        in every column whose index occurs in the i-th sequence and 0.0 elsewhere.

    Indices outside [0, dimension) are ignored: an index >= dimension would
    otherwise raise IndexError, and a negative one would silently wrap around
    to a column at the end of the row.
    """
    # Create an all-zero matrix of shape (len(sequences), dimension)
    results = np.zeros((len(sequences), dimension))
    for row, sequence in enumerate(sequences):
        indices = np.asarray(sequence, dtype=int)
        in_range = indices[(indices >= 0) & (indices < dimension)]
        results[row, in_range] = 1.  # set the surviving indices of this row to 1
    return results

# Encode the training split as a multi-hot matrix
x_train = vectorize_sequences(sequences=train_data)
# Encode the test split the same way
x_test = vectorize_sequences(sequences=test_data)

# Show the first encoded training sample
x_train[0]

### Vectorize Labels

In [None]:
# Labels as float32 numpy arrays
y_train = np.asarray(train_labels, dtype='float32')
y_test = np.asarray(test_labels, dtype='float32')

## Building our network

### Implementing the Network

In [None]:
from keras import models
from keras import layers

# Two hidden ReLU layers of 16 units on a 10,000-wide input, then a single sigmoid output unit
model = models.Sequential([
    layers.Dense(16, activation='relu', input_shape=(10000,)),
    layers.Dense(16, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

### Configure with optimizer and loss function

In [None]:
# Binary cross-entropy loss, RMSprop optimizer, accuracy reported as the metric
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

## Validating the approach

### Create Validation set

In [None]:
# The first 10,000 training samples form the validation set; the remainder is used for fitting
x_val, partial_x_train = x_train[:10000], x_train[10000:]
y_val, partial_y_train = y_train[:10000], y_train[10000:]

### Train model and monitor loss and accuracy

In [None]:
# Fit on the reduced training set for 20 epochs in batches of 512,
# evaluating on the held-out validation samples after every epoch
history = model.fit(
    x=partial_x_train,
    y=partial_y_train,
    batch_size=512,
    epochs=20,
    validation_data=(x_val, y_val),
)

### Look at history

In [None]:
# history.history is the dict read by the plotting cells below (loss and accuracy series)
history_dict = history.history
# Display which series were recorded during training
history_dict.keys()
### Plot training and validation loss

In [None]:
import matplotlib.pyplot as plt

# Keras releases before 2.3 log metrics=['accuracy'] under 'acc'; later releases log it
# under 'accuracy'. Read whichever key this History object actually contains.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
acc = history.history[acc_key]
val_acc = history.history['val_' + acc_key]
loss = history.history['loss']
val_loss = history.history['val_loss']

# One x-axis tick per epoch, starting at 1
epochs = range(1, len(acc) + 1)

# "bo" is for "blue dot"
plt.plot(epochs, loss, 'bo', label='Training loss')
# b is for "solid blue line"
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

### Plot training and validation accuracy

In [None]:
plt.clf()   # clear figure

# Keras releases before 2.3 log metrics=['accuracy'] under 'acc'; later releases log it
# under 'accuracy'. Read whichever key the history dict actually contains.
acc_key = 'acc' if 'acc' in history_dict else 'accuracy'
acc_values = history_dict[acc_key]
val_acc_values = history_dict['val_' + acc_key]

# One x-axis tick per epoch, starting at 1
acc_epochs = range(1, len(acc_values) + 1)

# Plot the series read in this cell rather than names left over from another cell
plt.plot(acc_epochs, acc_values, 'bo', label='Training acc')
plt.plot(acc_epochs, val_acc_values, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')  # this figure shows accuracy, not loss
plt.legend()

plt.show()

### Train and test a new network from scratch

In [None]:
# Fresh network with the same architecture as before (rebinding `model` discards the earlier one)
model = models.Sequential([
    layers.Dense(16, activation='relu', input_shape=(10000,)),
    layers.Dense(16, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

# Train on the full training set for 4 epochs, then score the model on the test set
model.fit(x=x_train, y=y_train, batch_size=512, epochs=4)
results = model.evaluate(x_test, y_test)

results

## Using a trained network to generate predictions on new data

In [None]:
# Run the current `model` on the vectorized test reviews and display its outputs.
# The final layer is Dense(1, activation='sigmoid'), so each output is a single value between 0 and 1.
model.predict(x_test)