## Author: Pritish Yuvraj
#### Sentiment Analysis using Deep Learning
#### Natural Language Processing (NLP) using LSTM

In [1]:
import re 
from pprint import pprint 
import numpy as np 

In [2]:
# Toy corpus. The preprocessing cell strips it, splits it on '.' into
# sentences and lower-cases/tokenises each sentence with a \w+ regex.
Data = """
I study at University of Massachusetts Amherst. 
I love staying here. Its fun here. 
It is quite cold out here. But I love this weather. 
I wish I could be pursuing my UnderGrad here.
"""

In [3]:
def display(X, Y):
    """Print every sentence next to its sentiment label, one pair per line.

    Args:
        X: sequence of sentences (each one a list of tokens).
        Y: labels addressable by sentence position -- either a dict keyed
            0..len(X)-1 or a plain list.

    Raises:
        KeyError / IndexError: if a sentence position has no label in Y.
    """
    # Look the label up by position rather than zipping over Y itself:
    # iterating a dict yields its keys in an order Python 2 does not
    # guarantee, and iterating a list would have used each label *value*
    # as an index into Y.
    for position, sentence in enumerate(X):
        # One pre-formatted string prints identically on Python 2 and 3;
        # the bare `print a, b` statement is a SyntaxError on Python 3.
        print("{} {}".format(sentence, Y[position]))

In [4]:
import re

# Label legend -- 0: negative, 1: Neutral, 2: Positive

# Cut the corpus into sentences on '.', skip empty fragments, and reduce
# each remaining sentence to its lower-cased word tokens.
X = []
for raw_sentence in Data.strip().split('.'):
    if raw_sentence == '':
        continue
    X.append(re.findall(r"\w+", raw_sentence.lower()))

# Every sentence starts out labelled 0; the hand-assigned labels for the
# six sentences of the corpus then overwrite that default.
Y = dict.fromkeys(range(len(X)), 0)
Y.update({0: 1, 1: 2, 2: 2, 3: 0, 4: 2, 5: 1})

display(X, Y)

# Keep the data reachable under descriptive names for the following cells.
sentence_x = X
sentiment_y = Y

['i', 'study', 'at', 'university', 'of', 'massachusetts', 'amherst'] 1
['i', 'love', 'staying', 'here'] 2
['its', 'fun', 'here'] 2
['it', 'is', 'quite', 'cold', 'out', 'here'] 0
['but', 'i', 'love', 'this', 'weather'] 2
['i', 'wish', 'i', 'could', 'be', 'pursuing', 'my', 'undergrad', 'here'] 1


In [5]:
# Padding: force every sentence to exactly `length` tokens.
length = 10
for sent_index, sentence in enumerate(sentence_x):
    # Compare against `length` (not a second hard-coded 10) so that changing
    # the target length cannot leave sentences of different sizes behind.
    if len(sentence) < length:
        # Too short: append "<pad>" tokens in place until it is `length` long.
        sentence.extend(["<pad>"] * (length - len(sentence)))
    else:
        # Long enough: keep only the first `length` tokens.
        sentence_x[sent_index] = sentence[:length]


In [6]:
# Padding check: show the token count of every sentence, then the padded
# sentences themselves. Parenthesised single-argument print produces the
# same output on Python 2 and Python 3 (the bare `print x` statement is
# Python-2-only).
print([len(sent) for sent in sentence_x])
print(sentence_x)

[10, 10, 10, 10, 10, 10]
[['i', 'study', 'at', 'university', 'of', 'massachusetts', 'amherst', '<pad>', '<pad>', '<pad>'], ['i', 'love', 'staying', 'here', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>'], ['its', 'fun', 'here', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>'], ['it', 'is', 'quite', 'cold', 'out', 'here', '<pad>', '<pad>', '<pad>', '<pad>'], ['but', 'i', 'love', 'this', 'weather', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>'], ['i', 'wish', 'i', 'could', 'be', 'pursuing', 'my', 'undergrad', 'here', '<pad>']]


In [7]:
# Finding the unique vocabulary: each distinct token (including "<pad>")
# gets the next free integer id, in order of first appearance.
word2index = {}
count = 0
for sentence in sentence_x:
    for word in sentence:
        if word not in word2index:
            word2index[word] = count
            count += 1
# Reverse lookup (id -> token). `.items()` exists on both Python 2 and 3,
# whereas `.iteritems()` is Python-2-only.
index2words = {index: word for word, index in word2index.items()}
#pprint(word2index)
#pprint(index2words)

In [8]:
# Converting words to indexes. The inner lists of sentence_x are rewritten
# in place, so afterwards sentence_x holds vocabulary ids instead of tokens.
# Re-running this cell without re-running the preprocessing cells raises
# KeyError, because the ids are not keys of word2index.
for sent in sentence_x:
    sent[:] = [word2index[word] for word in sent]
# Parenthesised print works on Python 2 and 3 alike.
print(sentence_x)

[[0, 1, 2, 3, 4, 5, 6, 7, 7, 7], [0, 8, 9, 10, 7, 7, 7, 7, 7, 7], [11, 12, 10, 7, 7, 7, 7, 7, 7, 7], [13, 14, 15, 16, 17, 10, 7, 7, 7, 7], [18, 0, 8, 19, 20, 7, 7, 7, 7, 7], [0, 21, 0, 22, 23, 24, 25, 26, 10, 7]]


In [9]:
#One hot encoding
def one_hot_2D(array, list_length):
    """Replace every integer in a list of lists by its one-hot vector.

    The nested list is modified IN PLACE and the same object is returned, so
    the caller's original id lists are gone afterwards; calling the function
    a second time on the same data fails because the elements are then lists.

    Args:
        array: list of sentences, each a list of integer ids in
            [0, list_length).
        list_length: length of every one-hot vector (vocabulary size).

    Returns:
        `array` itself, with each id replaced by a list of `list_length`
        zeros carrying a single 1 at the id's position.

    Raises:
        IndexError: if an id is >= list_length.
    """
    for sentence in array:
        for position, word_id in enumerate(sentence):
            # A fresh list per word, so no two vectors share storage.
            # (`[0] * n` replaces the Python-2-only `xrange` comprehension.)
            one_hot = [0] * list_length
            one_hot[word_id] = 1
            sentence[position] = one_hot
    return array
# One-hot encode every word id with a vector of vocabulary size.
# one_hot_2D rewrites its argument in place and returns it, so train_x and
# sentence_x refer to the same nested list after this line.
train_x = one_hot_2D(sentence_x, len(word2index))

In [10]:
def one_hot_1D(array, list_length):
    """One-hot encode a collection of integer class labels.

    Args:
        array: the labels, either as a dict mapping sample position to label
            (as built by the preprocessing cell) or as a plain sequence of
            labels.
        list_length: number of classes, i.e. the length of each one-hot row.

    Returns:
        A new list with one row per label; each row is a list of
        `list_length` zeros with a single 1 at the label's index. For a dict
        the rows follow the sorted keys, so row i belongs to sample i.

    Raises:
        IndexError: if a label is >= list_length.
    """
    # The previous loop enumerated the container but then indexed it with the
    # iterated element. That only worked for a dict keyed 0..n-1 whose keys
    # happened to iterate in order, and produced wrong rows for a list.
    if isinstance(array, dict):
        labels = [array[key] for key in sorted(array)]
    else:
        labels = list(array)

    return_array = []
    for label in labels:
        # `[0] * n` replaces the Python-2-only `xrange` comprehension.
        one_hot = [0] * list_length
        one_hot[label] = 1
        return_array.append(one_hot)
    return return_array
# 3 = number of sentiment classes (0: negative, 1: neutral, 2: positive).
train_y = one_hot_1D(sentiment_y, 3)

In [11]:
# Convert the nested Python lists into NumPy arrays so they can be fed to
# the TensorFlow placeholders.
train_x = np.asarray(train_x)
train_y = np.asarray(train_y)
# train_x: (sentences, tokens per sentence, vocabulary size)
# train_y: (sentences, classes)
# A single formatted string prints the same on Python 2 and 3; the bare
# `print a, b` statement is Python-2-only.
print("{} {}".format(train_x.shape, train_y.shape))

(6, 10, 27) (6, 3)


In [12]:
import tensorflow as tf 
from tensorflow.contrib import rnn

In [13]:
#Training Parameters
# Step size handed to the gradient-descent optimizer.
learning_rate = 0.001
# Number of optimisation steps executed by the training loop.
training_steps = 1000
# NOTE(review): not referenced by any other visible cell -- the training loop
# feeds the whole of train_x/train_y on every step.
batch_size = 128
# Loss and accuracy are printed at step 1 and every `display_step` steps.
display_step = 200

In [14]:
#Network parameters
# Length of each one-hot word vector (last axis of train_x).
num_input = train_x.shape[2]
# Tokens per padded sentence (middle axis of train_x).
timesteps = train_x.shape[1]
# Number of units passed to the LSTM cell in RNN().
num_hidden = 128
# Width of a one-hot label row (last axis of train_y).
num_classes = train_y.shape[1]

In [15]:
#tf Graph input
# NOTE: this rebinds X and Y, which the preprocessing cell used for the
# tokenised sentences and the label dict (still reachable as sentence_x /
# sentiment_y). If that preprocessing cell is re-run later, re-run this cell
# and the graph cells below before training, otherwise X/Y are no longer the
# placeholders used as feed_dict keys.
# X: batch of sentences, one one-hot vector of size num_input per timestep.
X = tf.placeholder("float", [None, timesteps, num_input])
# Y: batch of one-hot sentiment labels.
Y = tf.placeholder("float", [None, num_classes])

In [16]:
# Define the output-layer parameters that map an LSTM output vector of size
# num_hidden to num_classes logits. Both are drawn from tf.random_normal
# with no seed given here, so the starting values differ from run to run.
weights = {
    'out': tf.Variable(tf.random_normal([num_hidden, num_classes]))
}
biases = {
    'out': tf.Variable(tf.random_normal([num_classes]))
}

In [19]:
def RNN(x, weights, biases):
    """Build the LSTM classifier graph and return its class logits.

    Args:
        x: input tensor laid out as (batch_size, timesteps, n_input).
        weights: dict whose 'out' entry is the output projection matrix.
        biases: dict whose 'out' entry is the output bias vector.

    Returns:
        The linear (un-normalised) projection of the LSTM output at the last
        timestep: matmul(last_output, weights['out']) + biases['out'].
    """
    # static_rnn wants a Python list with one (batch_size, n_input) tensor
    # per timestep, so split the input along axis 1.
    step_inputs = tf.unstack(x, timesteps, 1)

    # Single LSTM layer with num_hidden units.
    cell = rnn.BasicLSTMCell(num_hidden, forget_bias=1.0)

    # Unroll the cell over all timesteps; the final state is not needed.
    step_outputs, _final_state = rnn.static_rnn(cell, step_inputs, dtype=tf.float32)

    # Classify from the output produced at the last timestep only.
    last_output = step_outputs[-1]
    return tf.matmul(last_output, weights['out']) + biases['out']


In [20]:
# Build the model: raw class scores, then softmax probabilities.
logits = RNN(X, weights, biases)
prediction = tf.nn.softmax(logits)

# Define loss and optimizer: mean softmax cross-entropy between the logits
# and the one-hot labels, minimised with plain gradient descent.
loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
    logits=logits, labels=Y))
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(loss_op)

# Evaluate model: a sample counts as correct when the most probable class
# equals the labelled class; accuracy is the mean over the fed batch.
# (No dropout is defined in this graph, so training and evaluation use the
# same ops.)
correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(Y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# Op that initialises every variable created above; it must be run inside a
# session before training.
init = tf.global_variables_initializer()

In [23]:
# Start training
with tf.Session() as sess:

    # Variables have to be initialised before any other op is run.
    sess.run(init)

    # The whole (tiny) dataset is fed on every step.
    feed = {X: train_x, Y: train_y}

    for step in range(1, training_steps + 1):
        # One gradient-descent update (backprop).
        sess.run(train_op, feed_dict=feed)

        # Report only on the first step and on every display_step-th step.
        if not (step % display_step == 0 or step == 1):
            continue

        # Loss and accuracy measured on the same data that was just trained on.
        loss, acc = sess.run([loss_op, accuracy], feed_dict=feed)
        report = "".join([
            "Step ", str(step),
            ", Minibatch Loss= ", "{:.4f}".format(loss),
            ", Training Accuracy= ", "{:.3f}".format(acc),
        ])
        print(report)

    print("Optimization Finished!")

Step 1, Minibatch Loss= 1.2209, Training Accuracy= 0.500
Step 200, Minibatch Loss= 0.7177, Training Accuracy= 0.667
Step 400, Minibatch Loss= 0.5367, Training Accuracy= 0.833
Step 600, Minibatch Loss= 0.4097, Training Accuracy= 0.833
Step 800, Minibatch Loss= 0.3137, Training Accuracy= 1.000
Step 1000, Minibatch Loss= 0.2399, Training Accuracy= 1.000
Optimization Finished!
