### This is a project in DLND which was originally designed by  Andrew Trask

## Curate a dataset

In [1]:
def pretty_print_review_and_label(i):
    """Print the label of sample `i` followed by the first 80 characters of its review.

    Reads the module-level `labels` and `reviews` lists.
    """
    label = labels[i]
    preview = reviews[i][:80]
    print(label + "\t:\t" + preview + "...")

# What we know: one review per line. The context manager closes the file even
# if reading fails, and rstrip('\n') removes only the line terminator, so a
# final line without a trailing newline keeps its last character.
with open('reviews.txt','r') as reviews_file:
    reviews = [line.rstrip('\n') for line in reviews_file]

# What we WANT to know: one label per line, upper-cased so later comparisons
# against 'POSITIVE' match.
with open('labels.txt','r') as labels_file:
    labels = [line.rstrip('\n').upper() for line in labels_file]

In [4]:
# Print a header row, then two sample label/review pairs as a quick look at the loaded data.
print("labels.txt \t : \t reviews.txt\n")
pretty_print_review_and_label(10)
pretty_print_review_and_label(100)

labels.txt 	 : 	 reviews.txt

POSITIVE	:	this isn  t the comedic robin williams  nor is it the quirky  insane robin willi...
POSITIVE	:	i find it so amazing that even after all these years  we are still talking about...


## Step 1. Dig the gold

Here, we will figure out which words influence the sentiment of a sentence.

In [5]:
from collections import Counter
import numpy as np

In [6]:
# Word tallies: occurrences in positive reviews, in negative reviews, and overall
positive_counts, negative_counts, total_counts = Counter(), Counter(), Counter()

In [7]:
# Tally every space-separated word of every review, both in the counter that
# matches the review's label and in the overall counter.
for i, review in enumerate(reviews):
    sentiment_counts = positive_counts if labels[i] == 'POSITIVE' else negative_counts
    words = review.split(" ")
    sentiment_counts.update(words)
    total_counts.update(words)

As we can see from above, although we've created a dictionary that maps each word to its occurrence count, problems still exist: some words in positive_counts do not have a positive meaning, and some words in negative_counts do not have a negative meaning.

So we will use the positive-to-negative ratio and rescale it so that words with a positive meaning stand out.

In [8]:
# Positive-to-negative usage ratio of the common words.
# A word counts as "common" when it has been used more than 100 times; the +1
# in the denominator avoids dividing by zero for words never seen in a
# negative review.
pos_neg_ratios = Counter()
for term, cnt in total_counts.most_common():
    if cnt <= 100:
        continue
    pos_neg_ratios[term] = positive_counts[term] / float(negative_counts[term] + 1)

In [9]:
# Spot-check the ratio stored for three words: one very frequent word and two sentiment-bearing words.
print("Pos-to-neg ratio for 'the' = {}".format(pos_neg_ratios["the"]))
print("Pos-to-neg ratio for 'amazing' = {}".format(pos_neg_ratios["amazing"]))
print("Pos-to-neg ratio for 'terrible' = {}".format(pos_neg_ratios["terrible"]))

Pos-to-neg ratio for 'the' = 1.0607993145235326
Pos-to-neg ratio for 'amazing' = 4.022813688212928
Pos-to-neg ratio for 'terrible' = 0.17744252873563218


In [10]:
# Replace every stored ratio by its natural logarithm, in place.
# Re-running this cell takes the log of the already-logged values, so it
# should be executed exactly once after the ratios are computed.
for word in list(pos_neg_ratios):
    pos_neg_ratios[word] = np.log(pos_neg_ratios[word])

In [11]:
# Look up the same three words again in pos_neg_ratios; the printed label text is unchanged.
print("Pos-to-neg ratio for 'the' = {}".format(pos_neg_ratios["the"]))
print("Pos-to-neg ratio for 'amazing' = {}".format(pos_neg_ratios["amazing"]))
print("Pos-to-neg ratio for 'terrible' = {}".format(pos_neg_ratios["terrible"]))

Pos-to-neg ratio for 'the' = 0.05902269426102881
Pos-to-neg ratio for 'amazing' = 1.3919815802404802
Pos-to-neg ratio for 'terrible' = -1.7291085042663878


Now we have a much better signal: words with a positive meaning have a value larger than 0, words with a negative meaning have a value less than 0, and words that contain no sentiment have a value around 0.

Next, with everything set, we are ready to dive deeper into creating our network.

- First, we'll create a dictionary that maps each word to a number so that we can use matrix multiplication.

In [12]:
# Every distinct word seen in any review
vocab = set(total_counts)

In [13]:
# Number of distinct words; used below as the width of the input layer.
vocab_size = len(vocab)
print(vocab_size)

74074


In [14]:
# Input layer: a single row of zeros with one column per vocabulary word.
layer_0 = np.zeros((1,vocab_size))
layer_0.shape

(1, 74074)

In [17]:
# Map each vocabulary word to its column position in layer_0
word2index = {word: i for i, word in enumerate(vocab)}

# display the map of words to indices
word2index

{'': 0,
 'narration': 1,
 'belzer': 2,
 'conon': 3,
 'zuthe': 4,
 'denigrating': 5,
 'gelb': 6,
 'rotated': 7,
 'chancy': 8,
 'fitzgerald': 9,
 'hermamdad': 10,
 'maladroit': 11,
 'dealt': 12,
 'sandburg': 13,
 'milch': 14,
 'crony': 15,
 'goines': 16,
 'investments': 17,
 'preservatives': 18,
 'nymphomaniacs': 19,
 'conversant': 20,
 'mcdonell': 21,
 'unfilmable': 22,
 'discourses': 23,
 'grandeur': 24,
 'forbidding': 25,
 'pbs': 26,
 'dissenter': 27,
 'vigilantes': 28,
 'seducer': 29,
 'mensonges': 30,
 'theese': 31,
 'culturalism': 32,
 'rammed': 33,
 'jaya': 34,
 'pickard': 35,
 'grandmaster': 36,
 'churned': 37,
 'benetakos': 38,
 'qua': 39,
 'behaviors': 40,
 'disciple': 41,
 'grade': 42,
 'legendary': 43,
 'kumba': 44,
 'excellency': 45,
 'basicaly': 46,
 'ashmit': 47,
 'albeniz': 48,
 'caffari': 49,
 'hortensia': 50,
 'monthly': 51,
 'finnerty': 52,
 'jailed': 53,
 'lowlevel': 54,
 'subscribers': 55,
 'bulova': 56,
 'placates': 57,
 'regatta': 58,
 'japanse': 59,
 'retrieve': 6

As we can see from the output above, there are 74074 words in our dictionary, so the mapped indices range from 0 to 74073.

Each review contains only a small fraction of the total vocabulary. We have already initialized our input layer, layer_0, as a vector of shape (1, 74074),

and we will update it to hold the vector form of each review. In addition, I am going to convert the labels into numbers.

In [18]:
#change labels into numbers
def get_target_for_label(label):
    """Map a sentiment label to its numeric training target.

    Args:
        label(string) - Expected to be "POSITIVE" or "NEGATIVE".
    Returns:
        `1` when the label is exactly "POSITIVE", `0` for anything else.
    """
    return 1 if label == 'POSITIVE' else 0

In [19]:
#change the position in the vector layer_0 to 1 if the word is in that position
def update_input_layer(self,review):
    """Encode `review` into `self.layer_0` as a presence vector.

    The layer is zeroed in place, then the slot of every space-separated word
    that appears in `self.word2index` is set to 1. Unknown words are ignored.
    """
    # in-place reset keeps the same array object
    self.layer_0 *= 0

    lookup = self.word2index
    row = self.layer_0[0]
    for token in review.split(" "):
        if token not in lookup:
            continue
        row[lookup[token]] = 1


In fact, update_input_layer is not necessary, and it significantly reduces the training speed; hence it is abandoned in the model. Let's build our model.

In [20]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()

In [21]:
# Histogram of the values stored in pos_neg_ratios, as a density over 100 bins.
# Only `density=True` is passed: it is the supported way to normalise the
# histogram, whereas the legacy `normed` keyword is rejected by current NumPy.
ratio_values = [ratio for _, ratio in pos_neg_ratios.most_common()]
hist, edges = np.histogram(ratio_values, density=True, bins=100)

p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="Word Positive/Negative Affinity Distribution")
# One rectangle per bin: left/right edges from `edges`, height from `hist`.
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color="#555555")
show(p)

In [22]:
# For each occurrence count, how many distinct words have exactly that count
frequency_frequency = Counter(cnt for _, cnt in total_counts.most_common())

In [23]:
# Histogram of the values stored in frequency_frequency, as a density over 100 bins.
# Only `density=True` is passed: it is the supported way to normalise the
# histogram, whereas the legacy `normed` keyword is rejected by current NumPy.
frequency_values = [n_words for _, n_words in frequency_frequency.most_common()]
hist, edges = np.histogram(frequency_values, density=True, bins=100)

p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="The frequency distribution of the words in our corpus")
# One rectangle per bin: left/right edges from `edges`, height from `hist`.
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color="#555555")
show(p)

In [24]:
import time
import sys
import numpy as np

# Encapsulate our neural network in a class
class SentimentNetwork:
    """Bag-of-words sentiment classifier: linear hidden layer, sigmoid output.

    The vocabulary is filtered by word frequency (`min_count`) and by how
    strongly a word leans positive or negative (`polarity_cutoff`). Reviews are
    lower-cased and split on single spaces in every stage (vocabulary building,
    training and prediction) so that all stages agree on what a token is.
    """

    # A word needs at least this many total occurrences before a
    # positive-to-negative score is computed for it; rarer words have no score
    # and therefore bypass the polarity filter.
    POLARITY_MIN_COUNT = 50

    ## New for Project 6: added min_count and polarity_cutoff parameters
    def __init__(self, reviews,labels,min_count = 10,polarity_cutoff = 0.1,hidden_nodes = 10, learning_rate = 0.1):
        """Create a SentimentNetwork with the given settings
        Args:
            reviews(list) - List of reviews used to build the vocabulary
            labels(list) - List of POSITIVE/NEGATIVE labels associated with the given reviews
            min_count(int) - Words are only added to the vocabulary
                             if they occur more than this many times
            polarity_cutoff(float) - The absolute value of a word's log-scaled
                                     positive-to-negative score must be at least
                                     this big for the word to be kept.
            hidden_nodes(int) - Number of nodes to create in the hidden layer
            learning_rate(float) - Learning rate to use while training

        """
        # Seed NumPy's global random number generator so the random weight
        # initialisation is reproducible during development
        np.random.seed(1)

        # Build the vocabulary and the word/label index maps
        self.pre_process_data(reviews, labels, polarity_cutoff, min_count)

        # One input node per vocabulary word, the requested number of hidden
        # nodes, and a single output node.
        self.init_network(len(self.review_vocab),hidden_nodes, 1, learning_rate)

    @staticmethod
    def _tokenize(review):
        """Return the lower-cased, single-space-delimited tokens of `review`."""
        return review.lower().split(" ")

    def _review_to_indices(self, review):
        """Return the set of vocabulary indices of the known words in `review`."""
        indices = set()
        for word in self._tokenize(review):
            if word in self.word2index:
                indices.add(self.word2index[word])
        return indices

    ## New for Project 6: added min_count and polarity_cutoff parameters
    def pre_process_data(self, reviews, labels, polarity_cutoff, min_count):
        """Build the filtered vocabulary and the word/label lookup tables.

        Sets `review_vocab`, `label_vocab`, their sizes, `word2index` and
        `label2index` on the instance.
        """
        # Count word occurrences per sentiment and overall
        positive_counts = Counter()
        negative_counts = Counter()
        total_counts = Counter()

        for i in range(len(reviews)):
            sentiment_counts = positive_counts if labels[i] == 'POSITIVE' else negative_counts
            for word in self._tokenize(reviews[i]):
                sentiment_counts[word] += 1
                total_counts[word] += 1

        # Positive-to-negative ratio for sufficiently frequent words; the +1
        # avoids division by zero for words never seen in a negative review.
        pos_neg_ratios = Counter()
        for term, cnt in total_counts.most_common():
            if cnt >= self.POLARITY_MIN_COUNT:
                pos_neg_ratios[term] = positive_counts[term] / float(negative_counts[term] + 1)

        # Log-scale the ratios so positive-leaning words are > 0 and
        # negative-leaning words are < 0; the 0.01 keeps the log finite when
        # the ratio is 0.
        for word, ratio in pos_neg_ratios.most_common():
            if ratio > 1:
                pos_neg_ratios[word] = np.log(ratio)
            else:
                pos_neg_ratios[word] = -np.log((1 / (ratio + 0.01)))

        # Keep words that occur more than min_count times. Words that have a
        # polarity score must additionally meet the polarity_cutoff; words
        # without a score are kept as they are. total_counts holds each
        # distinct word once, in first-occurrence order, so iterating it
        # visits the same words as re-scanning every review.
        review_vocab = set()
        for word, count in total_counts.items():
            if count > min_count:
                if word in pos_neg_ratios:
                    if (pos_neg_ratios[word] >= polarity_cutoff) or (pos_neg_ratios[word] <= -polarity_cutoff):
                        review_vocab.add(word)
                else:
                    review_vocab.add(word)

        # Convert the vocabulary set to a list so we can access words via indices
        self.review_vocab = list(review_vocab)

        # Distinct labels, as a list so we can access labels via indices
        self.label_vocab = list(set(labels))

        # Store the sizes of the review and label vocabularies.
        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)

        # Word -> index position in review_vocab
        self.word2index = {word: i for i, word in enumerate(self.review_vocab)}

        # Label -> index position in label_vocab
        self.label2index = {label: i for i, label in enumerate(self.label_vocab)}

    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        """Store the layer sizes and learning rate and allocate the weights."""
        # Set number of nodes in input, hidden and output layers.
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes

        # Store the learning rate
        self.learning_rate = learning_rate

        # Weights between the input layer and the hidden layer start at zero.
        self.weights_0_1 = np.zeros((self.input_nodes,self.hidden_nodes))

        # Weights between the hidden layer and the output layer are drawn from
        # a normal distribution with standard deviation output_nodes**-0.5.
        self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5,
                                                (self.hidden_nodes, self.output_nodes))

        # The hidden layer, a two-dimensional matrix with shape 1 x hidden_nodes
        self.layer_1 = np.zeros((1,hidden_nodes))

    def get_target_for_label(self,label):
        """Return 1 for the label 'POSITIVE' and 0 for anything else."""
        if(label == 'POSITIVE'):
            return 1
        else:
            return 0

    def sigmoid(self,x):
        """Logistic function 1 / (1 + exp(-x))."""
        return 1 / (1 + np.exp(-x))

    def sigmoid_output_2_derivative(self,output):
        """Derivative of the sigmoid expressed in terms of its output value."""
        return output * (1 - output)

    def train(self, training_reviews_raw, training_labels):
        """Run one pass of per-sample gradient descent over the given reviews.

        Args:
            training_reviews_raw - Iterable of review strings.
            training_labels - Labels aligned with the reviews; 'POSITIVE' maps
                              to target 1, anything else to 0.
        Raises:
            AssertionError - if the number of reviews and labels differ.

        Progress and running training accuracy are written to stdout.
        """
        # Pre-process the reviews so we can deal directly with the indices of
        # the non-zero inputs
        training_reviews = [list(self._review_to_indices(review))
                            for review in training_reviews_raw]

        # make sure we have a matching number of reviews and labels
        assert(len(training_reviews) == len(training_labels))

        # Keep track of correct predictions to display accuracy during training
        correct_so_far = 0

        # Remember when we started for printing time statistics
        start = time.time()

        # Forward and backward pass for every review, updating the weights
        # after each one
        for i in range(len(training_reviews)):

            review = training_reviews[i]
            label = training_labels[i]

            ### Forward pass ###

            # Hidden layer: the sum of the input weights of the words present
            self.layer_1 *= 0
            for index in review:
                self.layer_1 += self.weights_0_1[index]

            # Output layer
            layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))

            ### Backward pass ###

            # Output error: difference between actual output and desired target
            layer_2_error = layer_2 - self.get_target_for_label(label)
            layer_2_delta = layer_2_error * self.sigmoid_output_2_derivative(layer_2)

            # Error propagated to the hidden layer. It is computed before
            # weights_1_2 is updated so it uses the weights of the forward pass.
            layer_1_error = layer_2_delta.dot(self.weights_1_2.T)
            # no nonlinearity in the hidden layer, so the gradient equals the error
            layer_1_delta = layer_1_error

            # Gradient descent step on the hidden-to-output weights
            self.weights_1_2 -= self.layer_1.T.dot(layer_2_delta) * self.learning_rate

            # Only the input weights that took part in the forward pass are updated
            for index in review:
                self.weights_0_1[index] -= layer_1_delta[0] * self.learning_rate

            # Keep track of correct predictions.
            if(layer_2 >= 0.5 and label == 'POSITIVE'):
                correct_so_far += 1
            elif(layer_2 < 0.5 and label == 'NEGATIVE'):
                correct_so_far += 1

            # For debug purposes, print out our prediction accuracy and speed
            # throughout the training process.
            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0

            sys.stdout.write("\rProgress:" + str(100 * i/float(len(training_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) \
                             + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
            if(i % 2500 == 0):
                print("")

    def test(self, testing_reviews, testing_labels):
        """
        Attempts to predict the labels for the given testing_reviews,
        and uses the testing_labels to calculate the accuracy of those predictions.
        Progress and running accuracy are written to stdout; nothing is returned.
        """

        # keep track of how many correct predictions we make
        correct = 0

        # we'll time how many predictions per second we make
        start = time.time()

        # Predict each review with run() and compare against its label
        for i in range(len(testing_reviews)):
            pred = self.run(testing_reviews[i])
            if(pred == testing_labels[i]):
                correct += 1

            # For debug purposes, print out our prediction accuracy and speed
            # throughout the prediction process.
            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0

            sys.stdout.write("\rProgress:" + str(100 * i/float(len(testing_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct) + " #Tested:" + str(i+1) \
                             + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")

    def run(self, review):
        """
        Returns a POSITIVE or NEGATIVE prediction for the given review.
        """
        # Hidden layer: add up the input weights of the known words in the review
        self.layer_1 *= 0
        for index in self._review_to_indices(review):
            self.layer_1 += self.weights_0_1[index]

        # Output layer
        layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))

        # Return POSITIVE for values greater than or equal to 0.5 in the output layer;
        # return NEGATIVE for other values
        if(layer_2[0] >= 0.5):
            return "POSITIVE"
        else:
            return "NEGATIVE"


In [29]:
# NOTE(review): `epoach` only repeats the lists handed to the constructor, which
# multiplies the word counts used by the min_count / polarity filtering; the
# train() call below still makes a single pass over the un-repeated reviews.
# Confirm whether several training epochs were intended.
epoach =3
# The last 1000 reviews are held out from both vocabulary building and training.
mlp = SentimentNetwork(reviews[:-1000]*epoach,labels[:-1000]*epoach,min_count=20,polarity_cutoff=0.05,learning_rate=0.01)
mlp.train(reviews[:-1000],labels[:-1000])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:10.4% Speed(reviews/sec):1643. #Correct:1996 #Trained:2501 Training Accuracy:79.8%
Progress:20.8% Speed(reviews/sec):1635. #Correct:4066 #Trained:5001 Training Accuracy:81.3%
Progress:31.2% Speed(reviews/sec):1643. #Correct:6177 #Trained:7501 Training Accuracy:82.3%
Progress:41.6% Speed(reviews/sec):1645. #Correct:8343 #Trained:10001 Training Accuracy:83.4%
Progress:52.0% Speed(reviews/sec):1646. #Correct:10508 #Trained:12501 Training Accuracy:84.0%
Progress:62.5% Speed(reviews/sec):1598. #Correct:12652 #Trained:15001 Training Accuracy:84.3%
Progress:72.9% Speed(reviews/sec):1590. #Correct:14792 #Trained:17501 Training Accuracy:84.5%
Progress:83.3% Speed(reviews/sec):1597. #Correct:16962 #Trained:20001 Training Accuracy:84.8%
Progress:93.7% Speed(reviews/sec):1595. #Correct:19157 #Trained:22501 Training Accuracy:85.1%
Progress:99.9% Speed(reviews/sec):1596. #Correct:20479 #Trained:24000 Training

In [30]:
# Evaluate on the last 1000 reviews, which were not passed to the constructor or to train().
mlp.test(reviews[-1000:],labels[-1000:])

Progress:99.9% Speed(reviews/sec):1995. #Correct:859 #Tested:1000 Testing Accuracy:85.9%

In [31]:
# Same setup as the previous run but with polarity_cutoff raised to 0.8.
# NOTE(review): `epoach` only repeats the lists handed to the constructor, which
# multiplies the word counts used by the min_count / polarity filtering; the
# train() call below still makes a single pass over the un-repeated reviews.
# Confirm whether several training epochs were intended.
epoach = 3
mlp = SentimentNetwork(reviews[:-1000]*epoach,labels[:-1000]*epoach,min_count=20,polarity_cutoff=0.8,learning_rate=0.01)
mlp.train(reviews[:-1000],labels[:-1000])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:10.4% Speed(reviews/sec):4720. #Correct:2126 #Trained:2501 Training Accuracy:85.0%
Progress:20.8% Speed(reviews/sec):4309. #Correct:4253 #Trained:5001 Training Accuracy:85.0%
Progress:31.2% Speed(reviews/sec):4833. #Correct:6385 #Trained:7501 Training Accuracy:85.1%
Progress:41.6% Speed(reviews/sec):4689. #Correct:8531 #Trained:10001 Training Accuracy:85.3%
Progress:52.0% Speed(reviews/sec):4489. #Correct:10667 #Trained:12501 Training Accuracy:85.3%
Progress:62.5% Speed(reviews/sec):4671. #Correct:12830 #Trained:15001 Training Accuracy:85.5%
Progress:72.9% Speed(reviews/sec):4458. #Correct:14953 #Trained:17501 Training Accuracy:85.4%
Progress:83.3% Speed(reviews/sec):4449. #Correct:17134 #Trained:20001 Training Accuracy:85.6%
Progress:93.7% Speed(reviews/sec):4446. #Correct:19324 #Trained:22501 Training Accuracy:85.8%
Progress:99.9% Speed(reviews/sec):4403. #Correct:20635 #Trained:24000 Training

In [32]:
# Evaluate on the last 1000 reviews, which were not passed to the constructor or to train().
mlp.test(reviews[-1000:],labels[-1000:])

Progress:99.9% Speed(reviews/sec):4100. #Correct:827 #Tested:1000 Testing Accuracy:82.7%

# Conclusion 

This model doesn't consider word order, which will surely affect its performance. Take the example "people love this movie, but i don't like it": without word order, all we are doing is summing the positive and negative words, so if we change the review to "people don't like this movie, but i like it" the result will be the same.

It seems we still have a long way to go~

In [35]:
# Classify a review typed in by the user.
# NOTE(review): input() blocks and makes this cell depend on what is typed, so
# its output cannot be reproduced by re-running the notebook unattended.
myreview = input()
print(mlp.run(myreview))

people don t love this movie, but i like it
POSITIVE


In [36]:
# Classify a second review typed in by the user.
# NOTE(review): input() blocks and makes this cell depend on what is typed, so
# its output cannot be reproduced by re-running the notebook unattended.
myreview = input()
print(mlp.run(myreview))

people love this movie, but i don t like it
POSITIVE


In [47]:
def get_most_similar_words(focus = "horrible"):
    """Rank the vocabulary of the module-level `mlp` by similarity to `focus`.

    Similarity is the dot product between a word's row of `mlp.weights_0_1`
    and the row of the focus word.

    Args:
        focus(string) - Word to compare against; must be in `mlp.word2index`.
    Returns:
        List of (word, similarity) pairs, most similar first.
    Raises:
        KeyError - if `focus` is not in the model's vocabulary.
    """
    # The focus word's weight row is the same for every comparison, so it is
    # looked up once instead of once per vocabulary word.
    focus_vector = mlp.weights_0_1[mlp.word2index[focus]]

    most_similar = Counter()
    for word, index in mlp.word2index.items():
        most_similar[word] = np.dot(mlp.weights_0_1[index], focus_vector)

    return most_similar.most_common()

In [48]:
# Words whose input-weight rows have the largest dot product with the row of "excellent".
get_most_similar_words("excellent")

[('excellent', 0.069107075737698087),
 ('perfect', 0.068953383810783281),
 ('wonderful', 0.064133330405019534),
 ('amazing', 0.059332116352504113),
 ('funniest', 0.056472104175322634),
 ('favorite', 0.056019684247257458),
 ('today', 0.055223597843822296),
 ('fantastic', 0.052224167466387472),
 ('refreshing', 0.052204438921500243),
 ('gem', 0.050779292879830983),
 ('heart', 0.049215228923702087),
 ('wonderfully', 0.047423213806853944),
 ('superb', 0.045319706271143678),
 ('rare', 0.04454880209984384),
 ('pleasantly', 0.044202197934326531),
 ('awesome', 0.043541187393714251),
 ('recommended', 0.043170700227525463),
 ('perfectly', 0.042639476041453142),
 ('enjoyed', 0.042483073983882833),
 ('great', 0.042194818749135304),
 ('captures', 0.040850142643997736),
 ('fascinating', 0.040136442498059872),
 ('touching', 0.039591670779449008),
 ('incredible', 0.039211481443152132),
 ('solid', 0.038841504670561716),
 ('vhs', 0.038797083052315523),
 ('sweet', 0.038279480140434252),
 ('subtle', 0.0376

In [49]:
# Words whose input-weight rows have the largest dot product with the row of "terrible".
get_most_similar_words("terrible")

[('worst', 0.074664347811613963),
 ('waste', 0.074227033291572861),
 ('awful', 0.070830866823746202),
 ('poorly', 0.05982116348030387),
 ('fails', 0.055201038978064873),
 ('terrible', 0.053430441889746549),
 ('dull', 0.052766999263105616),
 ('horrible', 0.052058867859236524),
 ('mess', 0.051319748733534135),
 ('disappointment', 0.05129630318637185),
 ('wasted', 0.051055163062972912),
 ('lacks', 0.047867283004409268),
 ('disappointing', 0.047048668982458693),
 ('worse', 0.04434195115302518),
 ('avoid', 0.042435773257135732),
 ('pointless', 0.041456070562567712),
 ('mediocre', 0.04054583957036835),
 ('redeeming', 0.040422744855945228),
 ('poor', 0.040203907225305119),
 ('pathetic', 0.039552573136484769),
 ('ridiculous', 0.039382583267819529),
 ('annoying', 0.039328912329139154),
 ('laughable', 0.039000299769561403),
 ('wooden', 0.038759952953299578),
 ('badly', 0.038484359714607119),
 ('boring', 0.038045492582859254),
 ('crap', 0.036701822637005013),
 ('unfunny', 0.036664918831800501),
 

In [52]:
import matplotlib.colors as colors

# Take the 500 highest-scoring and the 500 lowest-scoring words of the
# module-level pos_neg_ratios, keeping only those in the trained model's vocabulary.
top_scored = pos_neg_ratios.most_common(500)
bottom_scored = list(reversed(pos_neg_ratios.most_common()))[0:500]

words_to_visualize = [word for word, ratio in top_scored if word in mlp.word2index]
words_to_visualize.extend(word for word, ratio in bottom_scored if word in mlp.word2index)

In [55]:
# Collect, for every selected word, its input-weight row and a colour that
# encodes the sign of its score; pos / neg count how many fall on each side.
pos = 0
neg = 0

colors_list = []
vectors_list = []
for word in words_to_visualize:
    if word not in pos_neg_ratios:
        continue
    vectors_list.append(mlp.weights_0_1[mlp.word2index[word]])
    if pos_neg_ratios[word] > 0:
        pos += 1
        colors_list.append("#00ff00")
    else:
        neg += 1
        colors_list.append("#000000")
    

In [56]:
from sklearn.manifold import TSNE
# Project the collected weight rows down to 2 components with t-SNE;
# random_state is fixed at 0.
tsne = TSNE(n_components=2, random_state=0)
words_top_ted_tsne = tsne.fit_transform(vectors_list)

Our final result should come out soon~

In [57]:
# Scatter plot of the 2-D t-SNE coordinates, one labelled point per word.
p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="vector T-SNE for most polarized words")

# NOTE(review): the columns come from three separately built sequences
# (t-SNE rows, words_to_visualize, colors_list); this presumably relies on
# them all having the same length and order -- confirm.
source = ColumnDataSource(data=dict(x1=words_top_ted_tsne[:,0],
                                    x2=words_top_ted_tsne[:,1],
                                    names=words_to_visualize,
                                    color=colors_list))

# The fill colour of each point is read from the "color" column of the source.
p.scatter(x="x1", y="x2", size=8, source=source, fill_color="color")

# Text label for each point, taken from the "names" column and offset by y_offset=6.
word_labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
p.add_layout(word_labels)

show(p)

# green indicates positive words, black indicates negative words