# IMDB Sentiment Analysis in Tensorflow

## Importing modules

In [1]:
import numpy as np
import re
import urllib.request as req 
import tarfile
import os
import zipfile
from collections import defaultdict
import tensorflow as tf

## URL of database

In [2]:
# Download URL of the aclImdb v1 sentiment dataset archive (gzip-compressed tar).
imdb_url = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'

## Creating files and folders from dataset

In [3]:
# Local filename the dataset archive is downloaded to.
save_filename = 'aclImdb_v1.tar.gz'

# Download only when the archive is not already on disk, so re-running the
# cell does not fetch it again.
if not os.path.exists(save_filename):
    req.urlretrieve(imdb_url, save_filename)
    
# Folder used by get_reviews() as the dataset root; presumably the top-level
# directory created by unpacking the archive.
imdb_folder = 'aclImdb'

# Unpack only when the dataset folder is absent.
if not os.path.exists(imdb_folder):
    with tarfile.open(save_filename) as tar:
        # NOTE(review): extractall() writes every archive member into the
        # current working directory without validating member paths; only
        # use it on archives from a trusted source.
        tar.extractall()

## Creating a function to get all reviews

In [4]:
#This function gets all reviews and cleans each one into words

def get_reviews(data_folder = '/train', base_folder = None):
    """Load one split of the dataset as cleaned word lists plus one-hot labels.

    Every file under ``<base_folder><data_folder>/neg/`` and ``.../pos/`` is
    read, lower-cased, stripped of ``<br />`` tags and of every character
    that is not ``a``-``z``, and split into words.

    Args:
        data_folder: sub-folder of the dataset to read, with a leading
            slash (e.g. ``'/train'``).
        base_folder: dataset root folder. Defaults to the module-level
            ``imdb_folder``.

    Returns:
        A tuple ``(reviews, labels)``. ``reviews`` is a list with one list
        of words per file, negative reviews first, each group in sorted
        filename order. ``labels`` is an integer array of shape
        ``(len(reviews), 2)`` holding ``[1, 0]`` for negative and ``[0, 1]``
        for positive reviews.
    """
    if base_folder is None:
        base_folder = imdb_folder

    # Compiled once instead of once per file.
    non_letters = re.compile(r"[^a-z]")

    reviews = []
    labels = []
    for index, sentiment in enumerate(['/neg/', '/pos/']):
        path = base_folder + data_folder + sentiment
        for filename in sorted(os.listdir(path)):
            # Explicit encoding: the platform default is not UTF-8 everywhere.
            with open(path + filename, 'r', encoding='utf-8') as f:
                review = f.read()

            review = review.lower().replace('<br />', ' ')
            # split() without an argument collapses runs of blanks and drops
            # the empty strings that split(" ") left at the start and end of
            # a review.
            reviews.append(non_letters.sub(' ', review).split())

            label = [0, 0]
            label[index] = 1
            labels.append(label)

    return reviews, np.array(labels)
                

## Creating training data

In [5]:
# Load the default ('/train') split: tokenised reviews and their one-hot labels.
train_reviews, train_labels = get_reviews()

In [6]:
# Sanity check: number of reviews loaded, then the words and the one-hot
# label of the first review.
print(len(train_reviews))
print(train_reviews[0])
print(train_labels[0])


25000
['story', 'of', 'a', 'man', 'who', 'has', 'unnatural', 'feelings', 'for', 'a', 'pig', 'starts', 'out', 'with', 'a', 'opening', 'scene', 'that', 'is', 'a', 'terrific', 'example', 'of', 'absurd', 'comedy', 'a', 'formal', 'orchestra', 'audience', 'is', 'turned', 'into', 'an', 'insane', 'violent', 'mob', 'by', 'the', 'crazy', 'chantings', 'of', 'it', 's', 'singers', 'unfortunately', 'it', 'stays', 'absurd', 'the', 'whole', 'time', 'with', 'no', 'general', 'narrative', 'eventually', 'making', 'it', 'just', 'too', 'off', 'putting', 'even', 'those', 'from', 'the', 'era', 'should', 'be', 'turned', 'off', 'the', 'cryptic', 'dialogue', 'would', 'make', 'shakespeare', 'seem', 'easy', 'to', 'a', 'third', 'grader', 'on', 'a', 'technical', 'level', 'it', 's', 'better', 'than', 'you', 'might', 'think', 'with', 'some', 'good', 'cinematography', 'by', 'future', 'great', 'vilmos', 'zsigmond', 'future', 'stars', 'sally', 'kirkland', 'and', 'frederic', 'forrest', 'can', 'be', 'seen', 'briefly', '']


## Embedding words

Since neural networks work better with numbers than with words, we need to convert these words into integers.

We will use GloVe: Global Vectors for Word Representation


In [7]:
# Download URL of the GloVe 6B word-vector archive (zip).
# NOTE(review): this is a plain-http URL — consider https if the host supports it.
glove_url = 'http://nlp.stanford.edu/data/glove.6B.zip'

In [8]:
# Local filename the GloVe archive is downloaded to. This rebinds
# save_filename, which previously named the IMDB archive.
save_filename = 'glove.6B.zip'

# Download only when the archive is not already on disk.
if not os.path.exists(save_filename):
    req.urlretrieve(glove_url, save_filename)

# Dimensionality of the word vectors to use.
embedding_size = 50

# Fail here with a clear message instead of skipping extraction and failing
# later when the vector file cannot be opened.
if embedding_size not in (50, 100, 200, 300):
    raise ValueError(
        f'unsupported embedding_size {embedding_size}; expected 50, 100, 200 or 300'
    )

glove_filename = f'glove.6B.{embedding_size}d.txt'

if not os.path.exists(glove_filename):
    with zipfile.ZipFile(save_filename, 'r') as z:
        # Extract only the vector file that is actually used rather than
        # every file in the archive.
        z.extract(glove_filename)

KeyboardInterrupt: 

In [None]:
def load_embeddings(filename = None, vector_size = None):
    """Read a GloVe text file into word->index and index->vector tables.

    Each non-empty line of the file holds a word followed by the components
    of its vector, separated by whitespace. Words are numbered from 1 in
    file order; index 0 is left free so it can stand for "unknown word" and
    for padding.

    Args:
        filename: path of the vector file. Defaults to the module-level
            ``glove_filename``.
        vector_size: length of the zero vector returned for unknown
            indices. Defaults to the module-level ``embedding_size``.

    Returns:
        A tuple ``(word_to_int, int_to_vec)``. ``word_to_int`` is a
        ``defaultdict`` that yields 0 for unknown words. ``int_to_vec`` is a
        ``defaultdict`` that yields a float32 zero vector for unknown
        indices (including 0).
    """
    if filename is None:
        filename = glove_filename
    if vector_size is None:
        vector_size = embedding_size

    word_to_int = defaultdict(int)
    # float32 zeros match the dtype of the loaded vectors, so stacking them
    # later does not promote the whole array to float64.
    int_to_vec = defaultdict(lambda: np.zeros([vector_size], dtype=np.float32))

    index = 1
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(filename, 'r', encoding='utf-8') as glove_vectors:
        for line in glove_vectors:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            word_to_int[fields[0]] = index
            int_to_vec[index] = np.asarray(fields[1:], np.float32)
            index += 1

    return word_to_int, int_to_vec

In [None]:
# Build the word->index and index->vector lookup tables from the GloVe file.
word_to_int, int_to_vec = load_embeddings()

In [None]:
def review_words_to_ints(train_review, vocabulary = None):
    """Replace every word of every review by its vocabulary index.

    Args:
        train_review: iterable of reviews, each an iterable of words.
        vocabulary: mapping from word to integer index. Defaults to the
            module-level ``word_to_int``.

    Returns:
        A list with one list of integers per review. Words missing from the
        vocabulary are encoded as 0.
    """
    if vocabulary is None:
        vocabulary = word_to_int

    # Iterate the argument, not the global train_reviews, so the function
    # converts whatever data it is given.
    # .get() keeps unknown words from being inserted into a defaultdict.
    return [[vocabulary.get(word, 0) for word in review] for review in train_review]

In [None]:
# Replace the word lists by lists of vocabulary indices (rebinds train_reviews),
# then show the first encoded review as a sanity check.
train_reviews = review_words_to_ints(train_reviews)
print(train_reviews[0])

## distribution plot of reviews by length of each review

In [None]:
import seaborn as sns

# Number of tokens in each integer-encoded review (not yet padded at this
# point in the notebook).
train_reviews_lens = [len(review) for review in train_reviews]
# Plot the distribution of review lengths to help choose a maximum length.
# NOTE(review): distplot is reported as deprecated in newer seaborn releases —
# confirm against the installed version before relying on it.
sns.distplot(train_reviews_lens)

In [None]:
# Fixed number of tokens per review: longer reviews are truncated and shorter
# ones zero-padded by zero_pad_reviews(); also the time dimension of the
# model input in define_graph().
max_review_len = 500 #max length of each review

In [None]:
def zero_pad_reviews(train_reviews):
    """Force every review to exactly ``max_review_len`` entries.

    Reviews longer than the limit lose their tail; shorter ones are extended
    with trailing zeros.

    Args:
        train_reviews: iterable of integer-encoded reviews (lists).

    Returns:
        A new list of lists, each of length ``max_review_len``.
    """
    fixed_length = []
    for sequence in train_reviews:
        head = list(sequence[:max_review_len])
        filler = [0] * (max_review_len - len(head))
        fixed_length.append(head + filler)
    return fixed_length
        

In [None]:
# Truncate / zero-pad every encoded review to max_review_len (rebinds train_reviews).
train_reviews = zero_pad_reviews(train_reviews)

In [None]:
# Sanity check: the first review after padding.
print(train_reviews[0])

In [None]:
def review_ints_to_vecs(train_review, vectors = None):
    """Replace every word index of every review by its embedding vector.

    Args:
        train_review: iterable of reviews, each an iterable of integer
            word indices.
        vectors: mapping from word index to embedding vector. Defaults to
            the module-level ``int_to_vec``.

    Returns:
        A list with one list of vectors per review.
    """
    if vectors is None:
        vectors = int_to_vec

    # Iterate the argument, not the global train_reviews, so the function
    # converts whatever data it is given.
    return [[vectors[index] for index in review] for review in train_review]

In [None]:
# Turn the padded index lists into one dense array of embedding vectors
# (rebinds train_reviews). dtype is forced to float32: the padding vectors
# produced by int_to_vec's default factory use NumPy's default float64, which
# would otherwise promote the whole array to float64 and double its memory
# footprint, while the model input placeholder is float32 anyway.
train_reviews = np.array(review_ints_to_vecs(train_reviews), dtype=np.float32)
print(train_reviews.shape)

In [None]:
def define_graph(output_size = 2, l_rate = 0.001):
    """Build the TensorFlow 1.x graph of a GRU sentiment classifier.

    Layers: a 125-unit GRU (ReLU activation) with output dropout, the output
    of the last time step, a 100-unit ReLU dense layer with dropout, and a
    linear layer producing ``output_size`` logits.

    The input shape is taken from the module-level ``max_review_len`` and
    ``embedding_size``.

    Args:
        output_size: number of classes, i.e. the width of the one-hot labels.
        l_rate: learning rate of the Adam optimizer.

    Returns:
        A tuple ``(X, Y, keep_prob, optimizer, loss, accuracy)``:
        the input placeholder ``[batch, max_review_len, embedding_size]``,
        the one-hot label placeholder ``[batch, output_size]``, the dropout
        keep-probability placeholder (defaults to 1.0, i.e. no dropout),
        the training op, the summed cross-entropy loss and the mean accuracy.
    """
    X = tf.placeholder(tf.float32, [None, max_review_len, embedding_size])
    Y = tf.placeholder(tf.int32, [None, output_size])
    keep_prob = tf.placeholder_with_default(1.0, shape = ())

    # Fixed typo: the class is GRUCell (letters "ll"), not "GRUCe11".
    rnn = tf.contrib.rnn.GRUCell(125, activation = tf.nn.relu)
    drop0 = tf.contrib.rnn.DropoutWrapper(rnn, output_keep_prob = keep_prob)
    outputs, final = tf.nn.dynamic_rnn(drop0, X, dtype = tf.float32)
    # Only the output of the last time step feeds the classifier head.
    dense = tf.layers.dense(outputs[:,-1], 100, activation = tf.nn.relu)
    # tf.layers.dropout defaults to training=False, which makes it a no-op.
    # Dropout is switched on and off through keep_prob instead (the default
    # keep_prob of 1.0 gives rate 0), so the layer itself must stay in
    # training mode.
    drop1 = tf.layers.dropout(dense, rate = 1 - keep_prob, training = True)
    logits = tf.layers.dense(drop1, output_size, activation = None)

    error = tf.nn.softmax_cross_entropy_with_logits_v2(logits = logits, labels = Y)
    # Summed (not averaged) over the batch, so the value scales with batch size.
    loss = tf.reduce_sum(error)
    optimizer = tf.train.AdamOptimizer(learning_rate = l_rate).minimize(loss)

    preds = tf.nn.softmax(logits)
    correct = tf.equal(tf.argmax(preds, axis = 1), tf.argmax(Y, axis = 1))
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

    return X, Y, keep_prob, optimizer, loss, accuracy


## Splitting training set

### First we create a permutation of the indices that alternates negative and positive reviews, so that any slice of the data contains an equal number of each sentiment

In [None]:
# get_reviews() returns all negative reviews followed by all positive ones.
# Alternate between the first and the second half of the data so that
# consecutive samples switch class: 0, half, 1, half + 1, ...
# The half size is derived from the data instead of the hard-coded 12500;
# this assumes both classes are the same size (TODO confirm for other splits).
class_size = len(train_reviews) // 2
permutation = [(i // 2) + class_size * (i % 2) for i in range(len(train_reviews))]

In [None]:
# Reorder the review array with the interleaving permutation.
train_reviews = train_reviews[permutation]

In [None]:
# Apply the same permutation to the labels so they stay aligned with the reviews.
train_labels = train_labels[permutation]

### Choosing size of training data

In [None]:
# Number of samples held out from the end of the data for validation.
validation_size = 1000

In [None]:
# Everything that is not held out for validation is used for training.
train_size = len(train_reviews) - validation_size

### Splitting and validating

In [None]:
# The first train_size samples form the training set, the remaining
# validation_size samples the validation set.
X_train = train_reviews[:train_size]
y_train = train_labels[:train_size]
X_val = train_reviews[train_size:]
y_val = train_labels[train_size:]

In [None]:
# Sanity check: shapes of the training and validation inputs and labels.
print(X_train.shape)
print(y_train.shape)
print(X_val.shape)
print(y_val.shape)

In [None]:
# Train the classifier for 8 epochs with mini-batches, logging the batch
# loss/accuracy and the validation accuracy at regular intervals.

# Number of samples per training step.
batch_size = 50

num_samples = len(X_train)
# NOTE(review): num_batches is not used anywhere in the visible code.
num_batches = int(num_samples//batch_size)

# Accuracy histories, one entry per logging step: accT for the current
# training batch, accV for the whole validation set.
accT = []
accV = []

# Start from an empty default graph before building the model.
tf.reset_default_graph()
X, Y, keep_prob, optimizer, loss, accuracy = define_graph()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    for epoch in range(8):
        # i is the offset of the first sample of the current batch.
        for i in range(0, num_samples, batch_size):
            X_train_batch = X_train[i:i+batch_size]
            y_train_batch = y_train[i:i+batch_size]
            
            # One optimisation step; keep_prob 0.5 enables dropout while training.
            _, train_loss, train_acc = sess.run(
                [optimizer, loss, accuracy],
                feed_dict = {
                    X:X_train_batch,
                    Y:y_train_batch,
                    keep_prob:0.5
                }
            )
            
            # i advances in steps of batch_size (50), so this fires every 20 batches.
            if(i%1000) == 0:
                # keep_prob is not fed here, so its placeholder default of 1.0
                # applies during validation.
                val_acc = sess.run(
                    accuracy, 
                    feed_dict = {
                        X:X_val,
                        Y:y_val
                    }
                )
                
                # train_loss and train_acc describe the current batch only.
                print(
                    "Epoch {0}:{1:2d}, Train loss: {2:2.2f}, Train acc: {3:.3f}, Val acc: {4:.3f}".format(
                        epoch,
                        i//1000,
                        train_loss,
                        train_acc,
                        val_acc
                    )
                )
                
                accT.append(train_acc)
                accV.append(val_acc)
            