# RNN based Sentiment Analysis

### Task for the student
- Read the data set provided
- Spilt the data into train and test
- Perform the seniment analysis using RNN
- Report the accuracy of the model

### Import relevant modules

In [1]:
from __future__ import print_function
#import io
import gzip
import pyprind
import pandas as pd

from string import punctuation

import re
from nltk.stem.porter import PorterStemmer
import nltk
from nltk.corpus import stopwords
import numpy as np
import os

from collections import Counter
import tensorflow as tf
import sys
from sklearn.utils import shuffle
#sys.stdout= open("output_rrn_sentiment.txt","w")

### Read the data set

In [2]:
df = pd.read_csv('./news_test.csv',encoding='utf-8',
                  names = ["sentiment", "title", "review"])
print (len(df.columns))
print(list(df.columns.values))
print(df.loc[0,'sentiment'])
print(df.loc[0,'review'])
print(len(df['review']))


#print(df.head(3).values)    


3
['sentiment', 'title', 'review']
3
Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.
7600


### Create one hot vector label

In [3]:
senti_label = np.unique(df['sentiment'])
print(senti_label)
print('label count',len(senti_label))
label_stack = []


for i, j in enumerate(df['sentiment']):  
    one_hot_vector =np.zeros(len(senti_label),dtype=np.int16)
    one_hot_vector[j-1]=1
    label_stack.append(one_hot_vector)
   
    
#y_train = df.loc[0:20, 'sentiment'].values

[1 2 3 4]
label count 4


###  Preprocessing the data

In [4]:
counts = Counter()
pbar = pyprind.ProgBar(len(df['review']), title='Counting words occurences')
for i,review in enumerate(df['review']):
    text = ''.join([c if c not in punctuation else ' '+c+' ' for c in review]).lower() 
    df.loc[i,'review'] = text
    pbar.update()
    counts.update(text.split())
    
print("Finished Counting")
#print(df.head(2).values)
print(counts)
#print(df['sentiment']

Counting words occurences
0% [##############################] 100% | ETA: 00:00:00

Finished Counting



Total time elapsed: 00:00:05


### Map each unique word to an integer

In [5]:
word_counts = sorted(counts, key=counts.get, reverse=True)
print(word_counts[:5])
print("complete")
word_to_int = {word: ii for ii, word in enumerate(word_counts, 1)}

mapped_reviews = []
pbar = pyprind.ProgBar(len(df['review']), title='Map reviews to ints')
for review in df['review']:
    mapped_reviews.append([word_to_int[word] for word in review.split()]) 
    pbar.update()

print(mapped_reviews[:5])

Map reviews to ints


['.', 'the', ',', '-', 'a']
complete


0% [##############################] 100% | ETA: 00:00:00

[[1631, 2880, 377, 24, 4186, 8055, 210, 70, 49, 18, 4726, 18, 39, 240, 19, 11090, 2355, 424, 180, 5418, 1], [245, 1, 114, 4, 808, 3, 665, 4, 4, 5, 94, 17, 120, 7, 11091, 3113, 13, 2, 14, 469, 10, 161, 91, 3391, 1495, 1153, 3, 5, 2356, 13, 17, 2512, 5419, 8056, 245, 1044, 3, 29, 1632, 122, 2, 48, 17, 401, 1711, 13, 23, 5420, 1193, 1], [53, 4, 5, 60, 4187, 28, 5, 6498, 3392, 24, 2, 736, 7, 2686, 195, 5, 3749, 6, 1116, 5, 4727, 7, 1990, 494, 8057, 3, 98, 49, 677, 11092, 7, 8058, 6499, 3, 2, 987, 3750, 7, 11093, 1], [53, 4, 27, 18, 11, 2357, 5421, 89, 959, 11094, 1496, 34, 2687, 19, 5, 11095, 7, 8059, 6500, 3, 1194, 9, 6501, 8060, 3, 59, 459, 51, 3114, 199, 2, 115, 37, 1195, 1, 8061, 37, 831, 8, 2216, 51, 1391, 1, 2116, 37, 1883, 63, 3, 11096, 2216, 37, 4188, 9, 5422, 37, 11097, 1], [53, 4, 460, 531, 18, 11, 11098, 4, 751, 370, 809, 39, 3115, 7, 2, 11099, 4728, 71, 3, 8062, 2, 383, 18, 11, 48, 1117, 6, 1331, 305, 4729, 31, 3393, 3751, 11100, 1]]



Total time elapsed: 00:00:00


### Define hyper parameters

In [6]:
sequence_length = 400
batch_size = 64
chkpoint_epoch = 1
num_epochs = 2 #try 200, 400, 600
cutoff = 5000

###  Get X and Y Data ready

In [7]:
sequences = np.zeros((len(mapped_reviews), sequence_length), dtype=int)
for i, row in enumerate(mapped_reviews):
    review_arr = np.array(row)
    sequences[i, -len(row):] = review_arr[-sequence_length:]

X_train = sequences[:cutoff, :]
y_train = label_stack[:cutoff]
#y_train = df.loc[:cutoff, 'sentiment'].values

X_test = sequences[cutoff:, :]
y_test = label_stack[cutoff:]
print(len(y_test))
#y_test = df.loc[cutoff:, 'sentiment'].values


np.random.seed(123) # for reproducibility


2600


### Function to generate minibatches

In [8]:
def create_batch_generator(x, y=None, batch_size=64):
    n_batches = len(x)//batch_size
    x= x[:n_batches*batch_size]
    if y is not None:
        y = y[:n_batches*batch_size]
    for ii in range(0, len(x), batch_size):
        if y is not None:
            yield x[ii:ii+batch_size], y[ii:ii+batch_size]
        else:
            yield x[ii:ii+batch_size]

### Embedding &  Building the RNN model

In [9]:
class SentimentRNN(object):
    def __init__(self,n_words,seq_len=sequence_length,lstm_size=256,num_layers=1, 
                 batch_size=batch_size,learning_rate=0.0001, embed_size=200,
                 hot_vector_size=len(senti_label)):
        
        self.n_words = n_words
        self.seq_len = seq_len       ## Max length of sentence  
        self.lstm_size = lstm_size   ## number of hidden units
        self.num_layers = num_layers ##for now it is one layered
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.embed_size = embed_size ##size of embedding vector
        self.hot_vector_size= hot_vector_size

        self.g = tf.Graph() 
        with self.g.as_default():
            tf.set_random_seed(123)
            self.build()
            self.saver = tf.train.Saver() 
            self.init_op = tf.global_variables_initializer()

    def build(self):
        ## Define the placeholders
        tf_x = tf.placeholder(tf.int32, shape=(self.batch_size, self.seq_len), name='tf_x')
        #tf_y = tf.placeholder(tf.float32, shape=(self.batch_size), name='tf_y') 
        tf_y = tf.placeholder(tf.float32, shape=(self.batch_size, self.hot_vector_size), 
                              name='tf_y')
        
        tf_keepprob = tf.placeholder(tf.float32, name='tf_keepprob')
        
        ## Create the embedding layer
        embedding = tf.Variable(tf.random_uniform((self.n_words, self.embed_size),
                                                  minval=-1, maxval=1), name='embedding')
        
        embed_x = tf.nn.embedding_lookup( embedding, tf_x,  name='embeded_x')

        
        ## Define LSTM cell and stack them together 
        cells = tf.contrib.rnn.MultiRNNCell(
                [tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.BasicLSTMCell(self.lstm_size), 
                                               output_keep_prob=tf_keepprob)
                 for i in range(self.num_layers)])

        
        ## Define the initial state:
        self.initial_state = cells.zero_state(self.batch_size, tf.float32)
        print('  << initial state >> ', self.initial_state)

        lstm_outputs, self.final_state = tf.nn.dynamic_rnn(cells, embed_x, 
                                                           initial_state=self.initial_state)
        ## Note: lstm_outputs shape:#  [batch_size, max_time, cells.output_size]
        print('\n  << lstm_output   >> ', lstm_outputs)
        print('\n  << final state   >> ', self.final_state)

        ## Apply a FC layer after on top of RNN output:
        #logits = tf.layers.dense( inputs=lstm_outputs[:, -1], units=1, 
        #                         activation=None, name='logits')  
        
        logits = tf.layers.dense( inputs=lstm_outputs[:, -1], units=self.hot_vector_size, 
                                 activation=None, name='logits')   
       
        logits = tf.squeeze(logits, name='logits_squeezed')#quirkyness of tensorflow 
        print ('\n  << logits squeezed        >> ', logits)
        
        y_proba = tf.nn.sigmoid(logits, name='probabilities')
        predictions = {'probabilities': y_proba, 
                       'labels' : tf.cast(tf.round(y_proba), tf.int32, name='labels') }
        
        print('\n  << predictions   >> ', predictions)

        ## Define the cost function. Logit is passed through sigmoid before calculating cross entropy
        cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits
                              ( labels=tf_y, logits=logits), name='cost')
        
        ## Define the optimizer
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        train_op = optimizer.minimize(cost, name='train_op')

    def train(self, X_train, y_train, num_epochs, chkpoint_epoch):
        with tf.Session(graph=self.g) as sess:
            sess.run(self.init_op) 
            iteration = 1
            for epoch in range(num_epochs):
                state = sess.run(self.initial_state)
                
                for batch_x, batch_y in create_batch_generator(
                    X_train, y_train, self.batch_size):
                    feed = {'tf_x:0': batch_x, 
                            'tf_y:0': batch_y, 
                            'tf_keepprob:0': 0.5, 
                            self.initial_state : state}
                    
                    loss, _, state = sess.run(
                        ['cost:0', 'train_op', 
                         self.final_state], 
                        feed_dict=feed)

                    if iteration % 5 == 0:
                        print("Epoch: %d/%d Iteration: %d "
                              "| Train loss: %.5f" % (
                               epoch + 1, num_epochs,
                               iteration, loss))                        
                        
                    iteration +=1
                    
                if (epoch+1)%chkpoint_epoch == 0:
                    self.saver.save(sess, "model/sentiment-%d.ckpt" % epoch)

                    
    def predict(self, X_data, return_proba=False):
        
        preds = []
        with tf.Session(graph = self.g) as sess:
            self.saver.restore(
                sess, tf.train.latest_checkpoint('model/'))  
            
            test_state = sess.run(self.initial_state)
            
                    
            for ii, batch_x in enumerate(
                create_batch_generator(
                    X_data, None, batch_size=self.batch_size), 1):
                
                
                feed = {'tf_x:0' : batch_x,
                        'tf_keepprob:0': 1.0, 
                        self.initial_state : test_state}
                
                if return_proba:
                    pred, test_state = sess.run( 
                        ['probabilities:0', self.final_state], 
                        feed_dict=feed)
                else:
                    pred, test_state = sess.run(
                        ['labels:0', self.final_state], 
                        feed_dict=feed)     
                    
                preds.append(pred)
                  
        return np.concatenate(preds)

### Main: Define multilayer LSTM cells, Define Initial state, Create the network. Train & Test

In [10]:
n_words = max(list(word_to_int.values())) + 1

rnn = SentimentRNN(n_words=n_words, seq_len=sequence_length, embed_size=256, lstm_size=128, 
                   num_layers=1, batch_size=100, learning_rate=0.001)


##Train the network
if (num_epochs < chkpoint_epoch+1):
    num_epochs = chkpoint_epoch+1
    
rnn.train(X_train, y_train, num_epochs,chkpoint_epoch)

## Test the network: 
preds = rnn.predict(X_test)

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
  << initial state >>  (LSTMStateTuple(c=<tf.Tensor 'MultiRNNCellZeroState/DropoutWrapperZeroState/BasicLSTMCellZeroState/zeros:0' shape=(100, 128) dtype=float32>, h=<tf.Tensor 'MultiRNNCellZeroState/DropoutWrapperZeroState/BasicLSTMCellZeroState/zeros_1:0' shape=(100, 128) dtype=float32>),)
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equiv

### Print Accuracy of Test Set

In [11]:
print(len(preds))
print(np.shape(preds))
print(np.shape(y_test))
print(np.sum(np.multiply(preds,y_test)))
num= float(np.sum(np.multiply(preds,y_test)))
den = float(len(y_test))
print('Test Acc.: %.3f' % (num/den))

2600
(2600, 4)
(2600, 4)
1494
Test Acc.: 0.575
