In [1]:

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from IPython.core.interactiveshell import InteractiveShell

In [2]:
%matplotlib inline

# Set dataframe options to keep long data in columns from being truncated with ellipsis (...)
pd.set_option('max_colwidth', None)

# Set dataframe options to force display max columns
pd.set_option('display.max_columns', 90)
pd.set_option('display.max_rows', 90)

InteractiveShell.ast_node_interactivity = "all"


In [4]:
def df_info(df):
    '''
    Print a compact per-column summary of a dataframe.

    Combines what `df.columns`, `df.dtypes`, `df.isnull()`, `df.info()` and
    `df.nunique()` report: the shape and number of duplicated rows first, then one
    line per column with its data type, whether it has nulls, its non-null count
    and its number of unique values.

    The data type shown is the Python type name of the column's first value; for an
    empty column the pandas dtype name is shown instead. For columns holding dicts,
    the "# UNIQUE" figure is the number of distinct keys across all dict values.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataframe to summarise. Nothing is returned; output goes to stdout.
    '''
    print('\nShape : {}'.format(df.shape))
    print('Number of duplicates : {}\n'.format(df.duplicated().sum()))
    print('{:^35} {:^12} {:^12} {:^8} {:>10}\n'.format('COLUMNS', 'DATA TYPE', 'HAS NULL', 'COUNTS', '# UNIQUE'))
    for i, v in enumerate(df.columns):
        col = df[v]

        # An empty column has no first value to inspect, so fall back to the pandas dtype
        dtype = type(col.iloc[0]).__name__ if len(col) else col.dtype.name

        if dtype == 'dict':
            # Only real dicts contribute keys; None / NaN placeholders are skipped
            keys = {key for val in col.values if isinstance(val, dict) for key in val}
            col_nunique = len(keys)
        else:
            col_nunique = col.nunique()

        print(' {:>2}.  {:<30} {:<12} {:^10} {:>8} {:>10}'.format(
            i+1, v, dtype, str(col.isnull().any()), col.count(), col_nunique))

In [9]:
# Load the under-sampled reviews dataset from CSV and summarise its columns
reviews_dataset = pd.read_csv('../yelp_dataset/reviews_dataset_under-sampled.csv')
df_info(reviews_dataset)


Shape : (1699302, 2)
Number of duplicates : 0

              COLUMNS                DATA TYPE     HAS NULL    COUNTS    # UNIQUE

  1.  text                           str            False     1699302    1699302
  2.  sentiment                      int64          False     1699302          3


In [26]:
reviews_dataset.sample(5, random_state=27)

Unnamed: 0,text,sentiment
1597603,"Lindo's is a great neighborhood breakfast and lunch spot. Their Special Greek omelet is a wonder to behold. Stuffed with gyro meat, spinach, feta, and tomatoes, it's gigantic but somehow doesn't make me feel like death after eating it. I also love, love, love that you can get grits as your side instead of home fries. Very nice service, and quick on the coffee refills. I'm eager to try their weekend breakfast buffet, as well as their lunch items.",2
825216,"Okay so where do I start, this place is a long time family owned business in Lakewood. It's very well known for their pizza. I've had on numerous occasions and I have mixed feelings about it. Angelo's is definitely an overrated by quite a bit. Some of their pizzas can be quite greasy and their sauce may need some sugar to tone down the acidity of it. However I think they do really well with their different kinds of unique pizzas that you can't find anywhere else, from BBQ, to chicken Alfredo, to the hawaiian pizza. Their soups, salads, and other sides are really really good suprisingly. I feel it like their food is generally really pricey for some of the cons you get out of them. My last concern is that their staff can be quite rude from time to time by refusing to take a pick up order because it was busy, a little bit of lacking in the social skills.\n\n+Unique selections of pizza\n+Some Pizzas are really really good when are made right\n+Can eat in, not many places do that.\n+Really good soups/Salads/Sides\n\n-Staff can be rude at times, maybe i'm just unlucky\n-Some of the pizza seems like they have a lot of grease on them\n-Lacking consistency? Really acidic sauce sometimes.\n-Slightly pricy.",1
1135653,"Good spot in Scottsdale. The ambience is great. Menu has a ton of great options for breakfast and lunch. There's also free coffee and other drinks for those waiting for a table and a huge bar area. \n\nI ordered the chorizo omelette which came with a side of potatoes, fruit and your choice of toast. The omelette was accompanied with guacamole and a sour cream type of cream. All of the food tasted fresh and of course delicious. Wife ordered strawberry chocolate pancakes and she loved them. I had a bite and they were good considering I don't like tart fruit and chocolate together. \n\nService was top notch as the waiter was fast and friendly. We'll definitely come back!",2
359427,"For beginners the light-skinned African-American young lady that wore a bun on Saturday, August 7 who work the drive-through has a very bad attitude I repeated myself more than one time and I asked her if I could have supreme fiery shells on my tacos and she kept repeating regular or supreme and I made it very clear that I wanted supreme I get to the window and she still has the same attitude I really hope that her management can speak to her about her customer service skills and that they improve over time so that is why I'm giving two stars for customer service",0
1611947,"Based on the reviews I saw here, I went with the traditional hot wings and the wedge salad. The wedge salad was $9, but it was very big. Apparently it was supposed to come with a warm pretzel, but I got pita bread instead for some reason. The wings were pretty good and on the larger side. For $12 your get about 10-12 wings. Comes with celery and carrots as well. The meal overall was very filling and pretty good. The wings weren't award winning in my opinion, but good... especially for a hotel restaurant. If you are considering eating here, do yourself a favor and go to restaurants.com and print off a coupon. Also, I have been here twice for lunch now and another nice thing is that it is very quiet - I was the only one here for lunch around 1pm. Bartender was very nice too and was attentive. Also have a pretty good selection of draft from Great Lakes Brewery - if you're from out of town, try one. They are delicious. Dortmunder Gold is their flagship, but if its the holiday season treat yourself to a Christmas Ale.",2


Test how individual words are predictive of the review's sentiment.  
Do neutral reviews share signals with both positive and negative reviews that may create ambiguity?  
If they create ambiguity for the model, what are ways to prevent the model from making a mistake?  
How can we make it easy for the neural network to make this prediction?  

In [21]:
from collections import Counter

In [30]:
reviews = reviews_dataset.text.values
labels = reviews_dataset.sentiment.values

pos_ctr, neu_ctr, neg_ctr, tot_ctr = Counter(), Counter(), Counter(), Counter()

# Tally whitespace-separated tokens per sentiment class and overall.
# Label 0 feeds the negative counter, label 1 the neutral counter and
# every other label the positive counter.
for review, label in zip(reviews, labels):
    if label == 0:
        class_ctr = neg_ctr
    elif label == 1:
        class_ctr = neu_ctr
    else:
        class_ctr = pos_ctr

    tokens = review.split()
    class_ctr.update(tokens)
    tot_ctr.update(tokens)

In [39]:
pos_ctr.most_common(10)

[('the', 2177773),
 ('and', 1903659),
 ('a', 1314767),
 ('I', 1165485),
 ('to', 1051208),
 ('was', 906061),
 ('of', 770399),
 ('is', 755438),
 ('for', 569964),
 ('The', 555706)]

In [36]:
neg_ctr.most_common(10)

[('the', 2934770),
 ('and', 2144415),
 ('to', 1875007),
 ('I', 1802984),
 ('a', 1520209),
 ('was', 1412821),
 ('of', 931662),
 ('for', 801261),
 ('it', 687814),
 ('in', 675249)]

In [37]:
neu_ctr.most_common(10)

[('the', 3100763),
 ('and', 2122305),
 ('a', 1830665),
 ('I', 1731009),
 ('to', 1541183),
 ('was', 1520948),
 ('of', 1058070),
 ('for', 838079),
 ('is', 822846),
 ('The', 759460)]

In [77]:
pos_neg_ratios, neu_neg_ratios = Counter(), Counter()
threshold = 10000

# For every word seen more than `threshold` times overall, store a log-scaled
# positive-to-negative usage ratio: values above 0 lean positive, values below 0
# lean negative. The +1 in the denominator avoids division by zero, and the
# +0.01 keeps the logarithm finite when a word never appears in positive reviews.
for word, count in tot_ctr.most_common():
    if count <= threshold:
        continue

    ratio = pos_ctr[word] / float(neg_ctr[word] + 1)
    if ratio > 1:
        pos_neg_ratios[word] = np.log(ratio)
    else:
        pos_neg_ratios[word] = -np.log(1 / (ratio + 0.01))

In [78]:
pos_neg_ratios.most_common(10)

[('Excellent', 3.2243248693487416),
 ('delicious!', 3.1680577995258186),
 ('amazing!', 2.8654714959461898),
 ('Love', 2.488407009879809),
 ('Highly', 2.4656266889924567),
 ('Great', 2.3480427417997496),
 ('perfect.', 2.309371596356452),
 ('Best', 2.3073623958834757),
 ('die', 2.2553763720977713),
 (':)', 2.2184379283517845)]

In [79]:
list(reversed(pos_neg_ratios.most_common()))[0:10]

[('Worst', -4.190090439741446),
 ('worst', -3.4128193100101734),
 ('horrible.', -3.3728462167259146),
 ('awful.', -3.2597680587154843),
 ('terrible.', -3.17301575412417),
 ('rude.', -3.091738272885658),
 ('rude', -3.0300397820298457),
 ('horrible', -2.970016599628408),
 ('terrible', -2.7647884444686297),
 ('worse', -2.6755962149499495)]

Transform the dataset into numeric vectors so that the theory can be tested.

In [82]:
vocab = tot_ctr.keys()
vocab_size = len(vocab)
vocab_size

1611234

In [91]:
# Input layer: a single row holding one count slot per vocabulary word
layer_0 = np.zeros(shape=(1, vocab_size))

# Map every vocabulary word to its column position in the input layer
word2index = {token: position for position, token in enumerate(vocab)}


In [99]:

def update_input_layer(review):
    """
    Overwrite the module-level `layer_0` with the word counts of `review`.

    The layer is zeroed in place, then each whitespace-separated token of `review`
    increments the slot given by the module-level `word2index`. Tokens that are not
    in `word2index` are ignored, so reviews containing unseen words no longer raise
    a KeyError (the same policy as `SentimentNetwork.update_input_layer`).

    Parameters
    ----------
    review : str
        The review text to encode.
    """
    global layer_0
    # Reset in place so other references to the same array see the cleared state
    layer_0 *= 0

    for word in review.split():
        index = word2index.get(word)
        if index is not None:
            layer_0[0][index] += 1


In [107]:
update_input_layer(reviews[0])

layer_0[0,:30]

array([1., 1., 1., 1., 3., 2., 2., 2., 1., 1., 1., 3., 1., 1., 1., 1., 2.,
       1., 1., 1., 1., 1., 4., 1., 1., 8., 1., 1., 1., 1.])

In [108]:
labels[0]

0

In [202]:
import time
import sys

# Encapsulate our neural network in a class
class SentimentNetwork:
    """
    A small bag-of-words neural network for review sentiment (work in progress).

    The network has one input node per vocabulary word, a linear hidden layer and
    a softmax output layer. `train` currently performs the forward pass only; no
    weights are updated yet.
    """

    def __init__(self, reviews, labels, hidden_nodes = 10, learning_rate = 0.1):
        """
        Build the vocabularies from the given data and initialise the weights.

        Args:
            reviews: iterable of review strings.
            labels: iterable of hashable labels, one per review.
            hidden_nodes: number of nodes in the hidden layer.
            learning_rate: step size stored on the instance for weight updates.
        """
        # Seed NumPy's global generator so the random weight initialisation is repeatable
        np.random.seed(1)

        # process the reviews and their associated labels so that everything is ready for training
        self.pre_process_data(reviews, labels)

        # Build the network with as many input nodes as there are vocabulary words,
        # the requested number of hidden nodes and a single output node.
        # NOTE(review): softmax over a single output node always yields 1.0, so this
        # probably needs one output node per label (self.label_vocab_size) — confirm
        # the intended design before changing it.
        self.init_network(len(self.review_vocab), hidden_nodes, 1, learning_rate)

    def pre_process_data(self, reviews, labels):
        """
        Build the word and label vocabularies and their index lookups.

        Words are obtained by splitting each review on a single space and are kept
        case-sensitive. Sets `review_vocab`, `label_vocab`, their sizes, `word2index`
        and `label2index` on the instance.
        """
        # populate review_vocab with all of the words in the given reviews
        review_vocab = set()
        for review in reviews:
            review_vocab.update(review.split(" "))

        # Convert the vocabulary set to a list so we can access words via indices
        self.review_vocab = list(review_vocab)

        # Distinct labels, as a list so we can access labels via indices
        self.label_vocab = list(set(labels))

        # Store the sizes of the review and label vocabularies.
        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)

        # Dictionaries mapping words / labels to their index positions
        self.word2index = {word: i for i, word in enumerate(self.review_vocab)}
        self.label2index = {label: i for i, label in enumerate(self.label_vocab)}

    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        """
        Store the layer sizes and learning rate and allocate weights and input layer.

        Input-to-hidden weights start at zero; hidden-to-output weights are drawn
        from a normal distribution with standard deviation output_nodes**-0.5.
        """
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes

        self.learning_rate = learning_rate

        # Weights between the input layer and the hidden layer.
        self.weights_0_1 = np.zeros((self.input_nodes, self.hidden_nodes))

        # Weights between the hidden layer and the output layer.
        self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5,
                                            (self.hidden_nodes, self.output_nodes))

        # The input layer, a two-dimensional matrix with shape 1 x input_nodes
        self.layer_0 = np.zeros((1, input_nodes))

    def update_input_layer(self, review):
        """
        Encode `review` into `self.layer_0` as per-word counts.

        The layer is cleared in place, then every space-separated word found in
        `word2index` increments its slot. Unknown words are ignored so that new
        reviews can contain words not seen when the vocabulary was built.
        """
        # clear out previous state, reset the layer to be all 0s
        self.layer_0 *= 0

        for word in review.split(" "):
            if word in self.word2index:
                self.layer_0[0][self.word2index[word]] += 1

    def softmax(self, L):
        """
        Return the softmax of `L`, normalised over all of its elements.

        The maximum is subtracted before exponentiating: this leaves the result
        unchanged mathematically but prevents np.exp from overflowing to inf
        (and the result becoming NaN) for large inputs.
        """
        L = np.asarray(L)
        if L.size == 0:
            # Nothing to normalise; np.max would raise on an empty array
            return np.exp(L)
        exps = np.exp(L - np.max(L))
        return exps / np.sum(exps)

    def softmax_output_3_derivative(self, output):
        """Return output * (1 - output), element-wise."""
        # NOTE(review): this is only the diagonal of the softmax Jacobian — confirm it
        # is the gradient term intended for the backward pass.
        return output * (1 - output)

    def get_target_for_label(self, label):
        """Return 1 when `label` equals the string 'POSITIVE', otherwise 0."""
        # NOTE(review): looks like a leftover from a string-labelled binary setup;
        # verify against the label values actually passed in.
        if(label == 'POSITIVE'):
            return 1
        else:
            return 0

    def sigmoid(self, x):
        """Return the logistic sigmoid of `x`."""
        return 1 / (1 + np.exp(-x))

    def sigmoid_output_2_derivative(self, output):
        """Return the sigmoid derivative expressed via the sigmoid's own output."""
        return output * (1 - output)

    def run(self, review):
        """
        Run a forward pass for `review`, print the output layer and return it.

        The review is encoded exactly as during training (no lower-casing): the
        vocabulary is case-sensitive, so altering the case here would make
        capitalised vocabulary words impossible to match.
        """
        # Input Layer
        self.update_input_layer(review)

        # Hidden layer
        layer_1 = self.layer_0.dot(self.weights_0_1)

        # Output layer
        layer_2 = self.softmax(layer_1.dot(self.weights_1_2))

        # TODO: map the output layer to a predicted label once the output design is settled.
        print(layer_2)
        return layer_2

    def train(self, training_reviews, training_labels):
        """
        Run the forward pass over every training review.

        The backward pass is not implemented yet, so calling this does not change
        any weights; it only leaves `self.layer_0` holding the last review's counts.
        """
        # Keep track of correct predictions to display accuracy during training
        # (not used until the backward pass / progress report is added)
        correct_so_far = 0

        start = time.time()

        # loop over every item and run the forward pass
        for i in range(len(training_reviews)):

            # Get the next review and its correct label
            review = training_reviews[i]
            label = training_labels[i]

            ### Forward pass ###

            # Input Layer
            self.update_input_layer(review)

            # Hidden layer
            layer_1 = self.layer_0.dot(self.weights_0_1)

            # Output layer
            layer_2 = self.softmax(layer_1.dot(self.weights_1_2))

            ### Backward pass ###

            # TODO: compute the output error against the target for `label`,
            # back-propagate it and update weights_1_2 / weights_0_1.

In [242]:
# Build the network from the first 1000 reviews and their matching sentiment labels
sn = SentimentNetwork(reviews[:1000], labels[:1000], learning_rate=0.1)

In [243]:
sn.update_input_layer(reviews[0:1][0])
layer_1 = sn.layer_0.dot(sn.weights_0_1)
layer_1.dot(sn.weights_1_2)

array([[0., 0.]])

In [244]:
layer_2 = 1 / (1 + np.exp(-layer_1.dot(sn.weights_1_2)))
layer_2_error = layer_2 -1
layer_2

array([[0.5, 0.5]])

In [245]:
layer_2_error

array([[-0.5, -0.5]])

In [246]:
layer_2_delta = layer_2_error * (layer_2 * (1 - layer_2))    # error * derivative
layer_2_delta

array([[-0.125, -0.125]])

In [247]:
layer_1_error = layer_2_delta.dot(sn.weights_1_2.T)  # errors propagated to the hidden layer
layer_1_delta = layer_1_error  # hidden layer gradients - no nonlinearity so it's the same as the error
layer_1_delta

array([[-0.08950106,  0.14152215,  0.12693725, -0.08693921, -0.0061579 ,
         0.05285913,  0.06244385, -0.00299444,  0.0928331 , -0.05524528]])

In [248]:
# Update the weights
sn.weights_1_2 -= layer_1.T.dot(layer_2_delta) * sn.learning_rate # update hidden-to-output weights with gradient descent step
sn.weights_0_1 -= sn.layer_0.T.dot(layer_1_delta) * sn.learning_rate # update input-to-hidden weights with gradient descent step

In [249]:
sn.weights_1_2

array([[ 1.14858562, -0.43257711],
       [-0.37347383, -0.75870339],
       [ 0.6119356 , -1.62743362],
       [ 1.23376823, -0.53825456],
       [ 0.22559471, -0.17633148],
       [ 1.03386644, -1.45673947],
       [-0.22798339, -0.27156744],
       [ 0.80169606, -0.77774057],
       [-0.12192515, -0.62073964],
       [ 0.02984963,  0.41211259]])

In [250]:
sn.weights_0_1

array([[ 0.02685032, -0.04245665, -0.03808118, ...,  0.00089833,
        -0.02784993,  0.01657358],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [251]:
# Element-wise check of which outputs reach the 0.5 decision threshold
layer_2 >= 0.5

array([[ True,  True]])

In [None]:
# NOTE(review): this cell uses `self`, `label` and `correct_so_far`, none of which are
# assigned at notebook top level in the cells shown — it reads like a fragment of
# SentimentNetwork.train and presumably cannot run on its own; confirm before executing.

# Backpropagated error
layer_1_error = layer_2_delta.dot(self.weights_1_2.T)  # errors propagated to the hidden layer
layer_1_delta = layer_1_error  # hidden layer gradients - no nonlinearity so it's the same as the error

# Update the weights
self.weights_1_2 -= layer_1.T.dot(layer_2_delta) * self.learning_rate # update hidden-to-output weights with gradient descent step
self.weights_0_1 -= self.layer_0.T.dot(layer_1_delta) * self.learning_rate # update input-to-hidden weights with gradient descent step

# Keep track of correct predictions.
# NOTE(review): labels are compared against the strings 'POSITIVE' / 'NEGATIVE'; the
# dataset summary earlier in this notebook shows an int64 `sentiment` column, so these
# branches would presumably never match — verify.
if(layer_2 >= 0.5 and label == 'POSITIVE'):
    correct_so_far += 1
elif(layer_2 < 0.5 and label == 'NEGATIVE'):
    correct_so_far += 1

In [None]:
    
    def train(self, training_reviews, training_labels):
        """
        Train on each review in turn, one weight update per review.

        For every (review, label) pair this runs a forward pass, derives an output
        delta, propagates it to the hidden layer, applies a gradient-descent step to
        both weight matrices and writes a progress line to stdout.

        NOTE(review): this definition sits in its own cell at method indentation with
        no enclosing `class` line — presumably a draft meant to replace
        SentimentNetwork.train; confirm where it belongs.

        Args:
            training_reviews: indexable sequence of review strings.
            training_labels: indexable sequence of labels, same length as the reviews.

        Raises:
            AssertionError: if the two sequences differ in length.
        """
        
        # make sure we have a matching number of reviews and labels
        assert(len(training_reviews) == len(training_labels))
        
        # Keep track of correct predictions to display accuracy during training 
        correct_so_far = 0

        # Remember when we started for printing time statistics
        start = time.time()
        
        # loop through all the given reviews and run a forward and backward pass,
        # updating weights for every item
        for i in range(len(training_reviews)):
            
            # Get the next review and its correct label
            review = training_reviews[i]
            label = training_labels[i]
            
            ### Forward pass ###

            # Input Layer
            self.update_input_layer(review)

            # Hidden layer
            layer_1 = self.layer_0.dot(self.weights_0_1)

            # Output layer
            layer_2 = self.softmax(layer_1.dot(self.weights_1_2))
            
            ### Backward pass ###

            # Output error
            # NOTE(review): the raw label value is subtracted from every softmax output
            # rather than a per-class target vector — confirm this is the intended error.
            layer_2_error = layer_2 - label #self.get_target_for_label(label) # Output layer error is the difference between desired target and actual output.
            layer_2_delta = layer_2_error * self.softmax_output_3_derivative(layer_2) #self.sigmoid_output_2_derivative(layer_2)

            # Backpropagated error
            layer_1_error = layer_2_delta.dot(self.weights_1_2.T)  # errors propagated to the hidden layer
            layer_1_delta = layer_1_error  # hidden layer gradients - no nonlinearity so it's the same as the error

            # Update the weights
            self.weights_1_2 -= layer_1.T.dot(layer_2_delta) * self.learning_rate # update hidden-to-output weights with gradient descent step
            self.weights_0_1 -= self.layer_0.T.dot(layer_1_delta) * self.learning_rate # update input-to-hidden weights with gradient descent step

            # Keep track of correct predictions.
            # NOTE(review): labels are compared against the strings 'POSITIVE' / 'NEGATIVE';
            # the dataset summary earlier in this notebook shows an int64 `sentiment`
            # column, so the counter would presumably never increase — verify.
            if(layer_2 >= 0.5 and label == 'POSITIVE'):
                correct_so_far += 1
            elif(layer_2 < 0.5 and label == 'NEGATIVE'):
                correct_so_far += 1
            
            # For debug purposes, print out our prediction accuracy and speed 
            # throughout the training process. 
            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
            
            # "\r" rewrites the same console line on every iteration
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(training_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) \
                             + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
            # Start a fresh progress line every 2500 reviews
            if(i % 2500 == 0):
                print("")
    
    def test(self, testing_reviews, testing_labels):
        """
        Attempts to predict the labels for the given testing_reviews,
        and uses the testing_labels to calculate the accuracy of those predictions.

        Each review is passed to `self.run`; a prediction counts as correct when the
        value returned equals the corresponding entry of `testing_labels`. Progress,
        speed and running accuracy are written to stdout after every review.

        NOTE(review): assumes `self.run` returns a predicted label comparable with
        the entries of `testing_labels` — confirm against run()'s actual return value.
        """
        
        # keep track of how many correct predictions we make
        correct = 0

        # we'll time how many predictions per second we make
        start = time.time()

        # Loop through each of the given reviews and call run to predict
        # its label. 
        for i in range(len(testing_reviews)):
            pred = self.run(testing_reviews[i])
            if(pred == testing_labels[i]):
                correct += 1
            
            # For debug purposes, print out our prediction accuracy and speed 
            # throughout the prediction process. 

            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
            
            # "\r" rewrites the same console line on every iteration
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(testing_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct) + " #Tested:" + str(i+1) \
                             + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")
