In [1]:
# imports
from collections import Counter
import enchant
import numpy as np
import pandas as pd
import sys
import time

In [2]:
# Neural Netork
class HelpfulnessNetwork:
    # initialize the network with parameters
    def __init__(self, reviews, labels, min_count, polarity_cutoff, hidden_nodes, learning_rate):
        # This is to get reproducable results each time we run the network
        np.random.seed(1)

        # pre process the data and run it
        self.pre_process_data(reviews, labels, polarity_cutoff, min_count)
        self.init_network(len(self.review_vocab),hidden_nodes, 1, learning_rate)

    def pre_process_data(self, reviews, labels, polarity_cutoff, min_count):
        # PART 1: get the count of words and the count of positive and negative words
        positive_counts = Counter()
        negative_counts = Counter()
        total_counts = Counter()
        
        d = enchant.Dict("en_US")
        
        for i in range(len(reviews)):
            if(labels[i] == 1):
                for word in reviews[i].split(" "):
                    # This is used to check if the words are English dictionary words. TEST
                    if word != '' and d.check(word):
                        positive_counts[word.lower()] += 1
                        total_counts[word.lower()] += 1
            else:
                for word in reviews[i].split(" "):
                    # This is used to check if the words are English dictionary words. TEST
                    if word != '' and d.check(word):
                        negative_counts[word.lower()] += 1
                        total_counts[word.lower()] += 1
        
        # We use this to get a better understanding of what words are correlated to helpful reviews and which
        #ones are correlated to unhelpful reviews.
        pos_neg_ratios = Counter()

        for term,count in list(total_counts.most_common()):
            if count > 25:
                pos_neg_ratio = positive_counts[term] / float(negative_counts[term]+1)
                pos_neg_ratios[term] = pos_neg_ratio

        # We do this so that we can get a scale, where 0 is a neutral word and the greater the number the more helpful
        # and the less the number the more unhelpful.
        for word,ratio in pos_neg_ratios.most_common():
            if(ratio > 1):
                pos_neg_ratios[word] = np.log(ratio)
            else:
                pos_neg_ratios[word] = -np.log((1 / (ratio + 0.01)))
               
        # This is used to testing and visualizing what is happening
        topPos = pos_neg_ratios.most_common()[:30]
        topNeg = list(reversed(pos_neg_ratios.most_common()))[:30]
        print(topPos)
        print(topNeg)
        
        for word in topPos:
            print(total_counts[word[0]])
        for word in topNeg:
            print(total_counts[word[0]])
        
        # populating the vocabulary that is going to be used
        review_vocab = set()
        for review in reviews:
            for word in review.split(" "):
                # check the min_count requirement
                if(total_counts[word.lower()] > min_count):
                    if(word.lower() in pos_neg_ratios.keys()):
                        # ensure the polarity cutoff
                        if((pos_neg_ratios[word.lower()] >= polarity_cutoff) or (pos_neg_ratios[word.lower()] <= -polarity_cutoff)):
                            review_vocab.add(word.lower())
                    else:
                        review_vocab.add(word.lower())

        self.review_vocab = list(review_vocab)
        
        # populate the labels
        label_vocab = set()
        for label in labels:
            label_vocab.add(label)
        
        # Convert the label vocabulary set to a list so we can access labels via indices
        self.label_vocab = list(label_vocab)
        
        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)
        
        # Creating a dictionary where the words are mapped to the indices
        self.word2index = {}
        for i, word in enumerate(self.review_vocab):
            self.word2index[word] = i
        
        # Creating a dictionary where the labels are mapped to the indices
        self.label2index = {}
        for i, label in enumerate(self.label_vocab):
            self.label2index[label] = i

    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes
        self.learning_rate = learning_rate

        # weights input layer -> hidden layer.
        self.weights_0_1 = np.zeros((self.input_nodes, self.hidden_nodes))

        # weights hidden layer -> output layer.
        self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5, (self.hidden_nodes, self.output_nodes))
        
        # The input layer
        self.layer_1 = np.zeros((1,hidden_nodes))
        
    def sigmoid(self,x):
        return 1 / (1 + np.exp(-x))
    
    def sigmoid_output_2_derivative(self,output):
        return output * (1 - output)
    
    def train(self, training_reviews_raw, training_labels):
        training_reviews = list()
        for review in training_reviews_raw:
            indices = set()
            for word in review.split(" "):
                if(word.lower() in self.word2index.keys()):
                    indices.add(self.word2index[word.lower()])
            training_reviews.append(list(indices))
        
        correct_so_far = 0
        start = time.time()
        
        # run a forward and backward pass and reviews and update weights
        for i in range(len(training_reviews)):
            
            # Get the next review and its label
            review = training_reviews[i]
            label = training_labels[i]

            # Hidden layer
            self.layer_1 *= 0
            for index in review:
                self.layer_1 += self.weights_0_1[index]

            # Output layer
            layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))            
            
            
            ### Backward pass ###
            # Output error
            layer_2_error = layer_2 - label 
            layer_2_delta = layer_2_error * self.sigmoid_output_2_derivative(layer_2)

            # Backpropagated error
            layer_1_error = layer_2_delta.dot(self.weights_1_2.T) # to the hidden layer
            layer_1_delta = layer_1_error

            # Update the weights
            self.weights_1_2 -= self.layer_1.T.dot(layer_2_delta) * self.learning_rate # update weights with gradient descent step
            
            for index in review:
                self.weights_0_1[index] -= layer_1_delta[0] * self.learning_rate # update weights with gradient descent step

            if(layer_2 >= 0.5 and label == 1):
                correct_so_far += 1
            elif(layer_2 < 0.5 and label == 0):
                correct_so_far += 1
            
            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(training_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) \
                             + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
            if(i % 2500 == 0):
                print("")
    
    def test(self, testing_reviews, testing_labels):
        correct = 0
        start = time.time()

        # Predict for the review 
        for i in range(len(testing_reviews)):
            pred = self.run(testing_reviews[i])
            if(pred == testing_labels[i]):
                correct += 1

            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(testing_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct) + " #Tested:" + str(i+1) \
                             + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")
    
    def run(self, review):
        # forward pass
        # Hidden layer 
        self.layer_1 *= 0

        unique_indices = set()
        for word in review.split(" "):
            if word.lower() in self.word2index.keys():
                unique_indices.add(self.word2index[word.lower()])
        
        for index in unique_indices:
            self.layer_1 += self.weights_0_1[index]
        
        # Output layer
        layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))
         
        if(layer_2[0] >= 0.5):
            return 1
        else:
            return 0

In [3]:
file1 = 'reviews_Cell_Phones_and_Accessories_5.json.gz'
#file2 = 'reviews_Kindle_Store_5.json.gz'
#file3 = 'reviews_Home_and_Kitchen_5.json.gz'
all_data1 = pd.read_json('../data/'+file1, lines=True)
#all_data2 = pd.read_json('../data/'+file2, lines=True)
#all_data3 = pd.read_json('../data/'+file,lines=True)
#all_data = pd.concat([all_data1, all_data2])
data = all_data1[['reviewText','helpful']]

# We are calculating the helpful value here and dropping reviews with no ratings
data['helpful'] = data['helpful'].map(lambda x: x[0]/x[1] if x[1] != 0 else np.nan)#np.log(x[0]+1)/x[1] if x[1] != 0 else np.nan)
data.dropna(inplace=True)

# Used for testing purposes
print(len(data['helpful']))
print(np.mean(data['helpful'].values))
print(np.std(data['helpful'].values))

# from looking at the values above use to the mean for the comparison value. it is different for each data set.
data['helpful'] = data['helpful'].map(lambda x: 1 if x > .7 else 0)

reviews = data['reviewText'].values
labels = data['helpful'].values

# baseline for randomly guessing what is helpful and what is not helpful
print(sum(labels)/len(reviews))

# initializing. training and running the neural network
mlp = HelpfulnessNetwork(reviews[:int(-1*(.8*len(reviews)))],labels[:int(-1*(.8*len(reviews)))],min_count=25,polarity_cutoff=.5,hidden_nodes=10,learning_rate=.01)
mlp.train(reviews[:int(-1*(.8*len(reviews)))],labels[:int(-1*(.8*len(reviews)))])
mlp.test(reviews[int(-1*(.2*len(reviews))):],labels[int(-1*(.2*len(reviews))):])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


53129
0.7065320112943302
0.38052283401772596
0.6267763368405203


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


[('mhz', 3.6109179126442243), ('keys.', 3.5263605246161616), ('drone', 3.332204510175204), ('2.0', 2.772588722239781), ('cruiser', 2.740840023925201), ('frame', 2.70805020110221), ('legs', 2.70805020110221), ('accident', 2.6741486494265287), ('drive.', 2.6390573296152584), ('recessed', 2.6026896854443837), ('gloves', 2.6026896854443837), ('epic.', 2.6026896854443837), ('pleasant', 2.5902671654458267), ('controls.', 2.5649493574615367), ('announce', 2.5257286443082556), ('cheek', 2.5257286443082556), ('parrot', 2.5257286443082556), ('lightning', 2.4277482359480516), ('marked', 2.367123614131617), ('hp', 2.3353749158170367), ('chrome', 2.3353749158170367), ('remaining', 2.302585092994046), ('joint', 2.302585092994046), ('zoom', 2.277267285009756), ('switches', 2.277267285009756), ('roadster', 2.2735975561207935), ('spoken', 2.268683541318364), ('compromise', 2.268683541318364), ('digital', 2.2300144001592104), ('defender', 2.217843864538955)]
[('t', -0.9844722442902273), ('clipper', -0.9