# IMDB Movie Review Sentiment Analysis

## Introduction<br>
Sentiment Analysis : Given a review predict whether the review is positive or negative.<br>

Link to dataset: https://ai.stanford.edu/~amaas/data/sentiment/<br>

## Importing libraries

In [55]:
import sys
import os
import re
import pickle
from collections import defaultdict

import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Embedding, Dense, Input, Flatten, Concatenate
from keras.layers import Conv1D, MaxPooling1D, Dropout, LSTM, GRU, Bidirectional, TimeDistributed 
from keras.models import Model

from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers

from nltk import tokenize

## Load the dataset

In [56]:
def load_dataset(path):
    
    train_texts,train_labels = [],[]
    test_texts,test_labels = [],[]
    
    for dataset in ['train', 'test']:
        for category in ['pos', 'neg']:
            dataset_path = os.path.join(path, dataset, category)
            for filename in sorted(os.listdir(dataset_path)):
                if filename.endswith('.txt'):
                    with open(os.path.join(dataset_path, filename), encoding="utf8") as f:

                        label = 1 if category == 'pos' else 1
                        if dataset == 'train': 
                            train_texts.append(f.read())
                            train_labels.append(label)
                        else: 
                            test_texts.append(f.read())
                            test_labels.append(label)

    # Converting to np.array
    train_texts = np.array(train_texts)
    train_labels = np.array(train_labels)
    test_texts = np.array(test_texts)
    test_labels = np.array(test_labels)

    # Shuffle the training data
    permutation = np.random.permutation(len(train_texts))
    train_texts, train_labels = train_texts[permutation], train_labels[permutation]

    # Shuffle the testing data
    permutation = np.random.permutation(len(test_texts))
    test_texts, test_labels = test_texts[permutation], test_labels[permutation]
    
    # Return the dataset
    return train_texts, train_labels, test_texts, test_labels

In [57]:
train_texts, train_labels, test_texts, test_labels = load_dataset("aclImdb")

print ('Train samples shape :', train_texts.shape)
print ('Train labels shape  :', train_labels.shape)
print ('Test samples shape  :', test_texts.shape)
print ('Test labels shape   :', test_labels.shape)

Train samples shape : (25000,)
Train labels shape  : (25000,)
Test samples shape  : (25000,)
Test labels shape   : (25000,)


In [58]:
train_index=[i for i in range(25000)]
data_train = pd.DataFrame({'review':train_texts.astype(str),
                           'sentiment':train_labels
                          },index=train_index)

test_index=[i for i in range(25000,50000)]
data_test = pd.DataFrame({'review':test_texts.astype(str),
                          'sentiment':test_labels
                         },index=test_index)

print("Train data shape : ",data_train.shape)
print("Test data shape  : ",data_test.shape)

Train data shape :  (25000, 2)
Test data shape  :  (25000, 2)


In [59]:
data_train.head()

Unnamed: 0,review,sentiment
0,Hard to describe this one -- if you were a fan...,1
1,To make a good movie you either need excellent...,1
2,"I saw this ""hot"" movie when it came out in 198...",1
3,"Plot Synopsis: When his wife, a news reporter,...",1
4,In the fifties the age restrictions for films ...,1


## Text cleaning

In [60]:
MAX_SENT_LENGTH = 300
MAX_SENTS = 10
EMBEDDING_DIM = 300
MAX_WORDS = 10000
VALIDATION_SPLIT = 0.4

In [44]:
puncts = [',', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

def clean_text(x):
    x = str(x)
    for punct in puncts:
        x = x.replace(punct, r'')
    return x

def clean_numbers(x):
    x = re.sub('[0-9]{5,}', '#####', x)
    x = re.sub('[0-9]{4}', '####', x)
    x = re.sub('[0-9]{3}', '###', x)
    x = re.sub('[0-9]{2}', '##', x)
    return x

In [45]:
from tqdm import tqdm
tqdm.pandas()
# lower case
data_train['review'] = data_train['review'].progress_apply(lambda x: x.lower())
data_test['review'] = data_test['review'].progress_apply(lambda x: x.lower())

# clean text train and test
data_train['review'] = data_train['review'].progress_apply(lambda x: clean_text(x))
data_test['review'] = data_test['review'].progress_apply(lambda x: clean_text(x))

# clean numbers
data_train['review'] = data_train['review'].progress_apply(lambda x: clean_numbers(x))
data_test['review'] = data_test['review'].progress_apply(lambda x: clean_numbers(x))

100%|██████████| 25000/25000 [00:00<00:00, 301971.82it/s]
100%|██████████| 25000/25000 [00:00<00:00, 329210.83it/s]
100%|██████████| 25000/25000 [00:00<00:00, 28346.28it/s]
100%|██████████| 25000/25000 [00:00<00:00, 28951.77it/s]
100%|██████████| 25000/25000 [00:02<00:00, 9430.31it/s]
100%|██████████| 25000/25000 [00:02<00:00, 9684.23it/s]


## Tokenize text and create embedding dictionary

In [61]:
print('Average word length of review in train is {0:.0f}.'.format(np.mean(data_train['review'].apply(lambda x: len(x.split())))))
print('Average word length of review in train is {0:.0f}.'.format(np.mean(data_test['review'].apply(lambda x: len(x.split())))))

Average word length of review in train is 234.
Average word length of review in train is 229.


In [62]:
tokenizer = Tokenizer(lower = True,num_words=MAX_WORDS)
full_text = list(data_train['review'].values) + list(data_test['review'].values)
tokenizer.fit_on_texts(full_text)

In [63]:
print('Max word length of review in train is {0:.0f}.'.format(np.max(data_train['review'].apply(lambda x: len(x.split())))))
print('Max word length of review in train is {0:.0f}.'.format(np.max(data_test['review'].apply(lambda x: len(x.split())))))

Max word length of review in train is 2470.
Max word length of review in train is 2278.


In [64]:
train_tokenized = tokenizer.texts_to_sequences(data_train['review'].fillna('missing'))
test_tokenized = tokenizer.texts_to_sequences(data_test['review'].fillna('missing'))
X_train = pad_sequences(train_tokenized,maxlen = MAX_SENT_LENGTH, padding = "post", truncating = "post")
X_test = pad_sequences(test_tokenized, maxlen = MAX_SENT_LENGTH, padding = "post", truncating = "post")
y = data_train['sentiment']

In [65]:
word_index = tokenizer.word_index
nb_words = min(MAX_WORDS, len(word_index))

## Splitting the data into train, test and validation datasets.

In [66]:
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(X_train, y, test_size = VALIDATION_SPLIT, random_state=42)

# Word Embedding


In [67]:
embedding_path = "glove.840B.300d/glove.840B.300d.txt"
# function to load the glove embedding into the memory
def get_coefs(word,*arr): 
    return word, np.asarray(arr, dtype='float32')

embedding_index = dict(get_coefs(*o.split(" ")) for o in open(embedding_path, encoding='utf-8', errors='ignore'))
all_embs = np.stack(list(embedding_index.values()))
emb_mean,emb_std = all_embs.mean(), all_embs.std()


embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_WORDS: continue
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None: 
        embedding_matrix[i] = embedding_vector

## Building Convolutional Neural Network

In [68]:
def CNN_Model():
    embedding_layer = Embedding(nb_words+1,EMBEDDING_DIM,weights=[embedding_matrix],input_length=MAX_SENT_LENGTH,trainable=False)
    sequence_input = Input(shape=(MAX_SENT_LENGTH,))
    embedded_sequences = embedding_layer(sequence_input)
    filter_sizes = [1,2,3]
    num_filters = 128
    conv_0 = Conv1D(num_filters, filter_sizes[0], padding='valid', kernel_initializer='normal', activation='relu')(embedded_sequences)
    conv_1 = Conv1D(num_filters, filter_sizes[1], padding='valid', kernel_initializer='normal', activation='relu')(embedded_sequences)
    conv_2 = Conv1D(num_filters, filter_sizes[2], padding='valid', kernel_initializer='normal', activation='relu')(embedded_sequences)

    maxpool_0 = MaxPooling1D(pool_size=(MAX_SENT_LENGTH - filter_sizes[0] + 1), strides=(1), padding='valid')(conv_0)
    maxpool_1 = MaxPooling1D(pool_size=(MAX_SENT_LENGTH - filter_sizes[1] + 1), strides=(1), padding='valid')(conv_1)
    maxpool_2 = MaxPooling1D(pool_size=(MAX_SENT_LENGTH - filter_sizes[2] + 1), strides=(1), padding='valid')(conv_2)

    concatenated_tensor = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2])
    flatten = Flatten()(concatenated_tensor)
    x = flatten
    x = Dropout(0.3)(x)
    x = Dense(64, activation = 'relu')(x)
    outp = Dense(1, activation='sigmoid')(x)
    model = Model(inputs=sequence_input, outputs=outp)
    return model

## Model compilation and fitting

In [54]:
model = CNN_Model()
model.compile(loss='binary_crossentropy',optimizer='rmsprop',metrics=['acc'])
print("Model fitting - Convolutional Neural network")
model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=3, batch_size=512)

Model fitting - Convolutional Neural network
Train on 15000 samples, validate on 10000 samples
Epoch 1/3
Epoch 2/3
 3328/15000 [=====>........................] - ETA: 34s - loss: 5.2496e-06 - acc: 1.0000

KeyboardInterrupt: 

## References

https://www.kaggle.com/iarunava/google-text-classification-notebook <br>
https://github.com/richliao/textClassifier/blob/master/textClassifierHATT.py <br>
https://richliao.github.io/supervised/classification/2016/12/26/textclassifier-HATN/ <br>
https://medium.com/jatana/report-on-text-classification-using-cnn-rnn-han-f0e887214d5f <br>