# IMDB Movie Reviews - Sentiment Analysis

This is a Binary Classification task.
Analyze the written text reviews to find out whether a review is of type positive or negative.

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from nltk import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from tensorflow.keras.models import Model

In [None]:
import nltk
# Download NLTK's Punkt tokenizer data; word_tokenize (used below to measure
# review lengths) relies on it.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Load IMDB Dataset

In [None]:
!ls

imdb_labelled.txt  sample_data


In [None]:
# Name of the labelled-reviews file, expected in the current working directory
file="imdb_labelled.txt"

In [None]:
import csv

# Load the tab-separated file: one review per line followed by its 0/1 label.
# quoting=csv.QUOTE_NONE matters: with the default quoting a stray double quote
# inside a review opens a "quoted field" and pandas swallows the following lines
# (labels included) into a single huge review. The symptoms are visible in the
# recorded outputs of this notebook: fewer rows than lines in the file, a review
# of 1616 tokens, and the label digits '1' and '0' among the most frequent words.
imdb = pd.read_csv(
    file,
    sep='\t',
    header=None,
    names=['review', 'sentiment'],
    quoting=csv.QUOTE_NONE,
)

## Split into Training and Test Set

In [None]:
# Hold out 20% of the rows as the test set; random_state fixes the shuffle so
# the split is reproducible.
imdb_train, imdb_test = train_test_split(imdb, test_size=0.2, random_state=2)

In [None]:
# Separate the review text (model input) from the sentiment label (target)
x_train, y_train = imdb_train['review'], imdb_train['sentiment']
x_test, y_test = imdb_test['review'], imdb_test['sentiment']

### Length of the Reviews in terms of the number of words

In [None]:
# Number of NLTK word tokens in every training review
sent_lens = [len(word_tokenize(text)) for text in imdb_train['review']]

In [None]:
# Length (in tokens) of the longest training review
max(sent_lens)

1616

### 95th Percentile of the Review Length

In [None]:
# 95th percentile of the review lengths: 95% of the training reviews are at
# most this many tokens long
np.quantile(sent_lens,0.95)

40.0

#### We can see that 95% of the reviews have a length of 40 words or fewer.

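### We'll keep the max length at 40 -- from each review (text paragraph) we will take at most 40 words and ignore the rest. Note that `pad_sequences`, with its default `truncating='pre'`, keeps the *last* 40 words of a longer review; pass `truncating='post'` to keep the first 40 instead.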

In [None]:
# Fixed sequence length fed to the network: every review is padded or truncated
# to this many token indices (value taken from the 95th-percentile length above)
max_len = 40

### Tokenize the Words from a Review

In [None]:
# Word-level Keras tokenizer (char_level=False) that splits the text on spaces
tok = Tokenizer(char_level=False, split=' ')

#### Get a Token Index for Every Word in the current Vocabulary

#### Fit the Tokenizer object with the training data. 

After this, the tokenizer knows the total number of unique words in the training set. A dictionary of all the unique words is formed; this is called the vocabulary. Every word is assigned a unique numeric code/index. This information is kept as a dictionary in which the numeric indices are the keys and the individual words are the values.

In [None]:
# Build the vocabulary from the training reviews only
tok.fit_on_texts(x_train)

#### Check the Words and the corresponding Numeric Index

In [None]:
# A dictionary where the numeric indices are the keys and the individual words are the values
tok.index_word

{1: 'the',
 2: 'a',
 3: 'and',
 4: 'of',
 5: 'is',
 6: 'this',
 7: 'it',
 8: 'i',
 9: 'to',
 10: 'was',
 11: 'movie',
 12: 'in',
 13: 'film',
 14: 'that',
 15: '1',
 16: '0',
 17: 'but',
 18: 'for',
 19: 'as',
 20: 'with',
 21: 'are',
 22: 'on',
 23: 'not',
 24: 'you',
 25: 'one',
 26: 'very',
 27: 'bad',
 28: 'just',
 29: 'so',
 30: 'good',
 31: 'all',
 32: 'an',
 33: "it's",
 34: 'there',
 35: 'be',
 36: 'by',
 37: 'about',
 38: 'at',
 39: 'if',
 40: 'out',
 41: 'great',
 42: 'his',
 43: 'from',
 44: 'like',
 45: 'have',
 46: 'time',
 47: 'were',
 48: 'well',
 49: 'has',
 50: 'even',
 51: 'really',
 52: 'my',
 53: 'or',
 54: 'who',
 55: 'acting',
 56: 'he',
 57: 'when',
 58: 'most',
 59: 'see',
 60: 'how',
 61: 'more',
 62: 'characters',
 63: 'would',
 64: 'no',
 65: 'only',
 66: 'ever',
 67: 'made',
 68: 'also',
 69: 'best',
 70: '10',
 71: 'plot',
 72: 'some',
 73: 'your',
 74: 'do',
 75: 'its',
 76: 'character',
 77: 'real',
 78: 'because',
 79: 'love',
 80: "didn't",
 81: 'movies

### Vocabulary Size

Number of unique/distinct words in the corpus.

The index of the first word in this dictionary is 1.

In [None]:
# Indices start at 1, so the number of entries is also the largest index
last_index_in_vocab = len(tok.index_word)

In [None]:
# First word
tok.index_word[1]

'the'

In [None]:
# Last word index
last_index_in_vocab

2688

In [None]:
# Last word
tok.index_word[last_index_in_vocab]

'passion'

In [None]:
# Vocabulary size = number of distinct words the tokenizer has indexed
vocab_size = len(tok.index_word)
print(vocab_size)

2688


### Now we can convert any arbitrary text to a sequence of integers.

In [None]:
# Words that are not in the fitted vocabulary are dropped - e.g. 'person' here
# Punctuation is dropped as well (the period at the end)
twt = tok.texts_to_sequences(['He is a lazy person.'])
print (twt)

[[56, 5, 2, 613]]


In [None]:
# Words that are not in the fitted vocabulary are dropped - e.g. 'Egyptian', 'Mou'
# Punctuation is dropped as well (the period at the end)
twt = tok.texts_to_sequences(['The Egyptian Mou is crazy.'])
print (twt)

[[1, 5, 821]]


In [None]:
# All four words are in the vocabulary; only the punctuation (the period at
# the end) is dropped
twt = tok.texts_to_sequences(['The movie was great.'])
print (twt)

[[1, 11, 10, 41]]


### Convert Each Review to a Sequence

#### Convert the Training Data

In [None]:
# Convert the words in a review to numeric sequences
sequences_train = tok.texts_to_sequences(x_train)

In [None]:
y_train

456    1
231    0
250    1
16     1
490    0
      ..
534    1
584    1
493    1
527    0
168    1
Name: sentiment, Length: 598, dtype: int64

In [None]:
# x_train keeps the row labels of the original DataFrame, so [] looks a review
# up by label, not by position. 456 is the label of the first training row
# (see the y_train listing); x_train.iloc[0] would be the positional equivalent.
x_train[456]

'There still are good actors around!  '

In [None]:
# Actual review words (their numeric index in the vocab) 
# of the first review in the training set
sequences_train[0]

[34, 146, 21, 30, 90, 426]

In [None]:
# Map the index sequence of one training review back to its words
index = 0
review = sequences_train[index]

review_words = [tok.index_word[token_id] for token_id in review]
review_words

['there', 'still', 'are', 'good', 'actors', 'around']

In [None]:
# Print "index - word" for every word of the selected review.
# tok.word_index is the word -> index mapping (the inverse of tok.index_word),
# so each word needs a single dictionary lookup instead of a scan over the
# whole vocabulary.
for word in review_words:
    key = tok.word_index.get(word)
    # A word missing from the vocabulary prints nothing, exactly as before
    if key is not None:
        print('{} - {}'.format(key, word))

34 - there
146 - still
21 - are
30 - good
90 - actors
426 - around


#### Pad Sequences to make them the same size (40 words in this case)

In [None]:
# Bring every sequence to exactly max_len entries. With pad_sequences' defaults
# (padding='pre', truncating='pre') shorter reviews get zeros in front and
# longer reviews lose their leading tokens, i.e. the LAST max_len tokens are kept.
sequences_matrix_train = sequence.pad_sequences(sequences_train, maxlen = max_len)

In [None]:
sequences_matrix_train.shape

(598, 40)

In [None]:
sequences_matrix_train

array([[  0,   0,   0, ...,  30,  90, 426],
       [  0,   0,   0, ...,  10, 275,  94],
       [  0,   0,   0, ...,  18,   1,  71],
       ...,
       [  0,   0,   0, ...,  44,  64,  84],
       [  0,   0,   0, ..., 284, 284, 284],
       [  0,   0,   0, ..., 383,  20, 307]], dtype=int32)

In [None]:
# Inspect the first padded review: it has exactly max_len (40) entries, with
# the zero padding placed at the beginning
print(sequences_matrix_train[0])

print('\n Every input vector is of length : ')
print(sequences_matrix_train[0].shape)

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  34 146
  21  30  90 426]

 Every input vector is of length : 
(40,)


#### Convert the Test Data

In [None]:
# Convert the test reviews with the tokenizer that was fitted on the training
# data (words it has not seen are dropped)
sequences_test = tok.texts_to_sequences(x_test)
# Pad / truncate to max_len, same settings as for the training sequences
sequences_matrix_test = sequence.pad_sequences(sequences_test, maxlen = max_len)

## Build the Recurrent Net with LSTM

### **Keras Embedding Layer**

Embedding layers are almost identical to dense layers but very important to talk about as they are extensively used in preparing text input. 
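Conceptually, an embedding layer is a dense layer without bias parameters and with the identity as the activation function, applied to one-hot encoded word indices. In fact, all the layer does is a matrix multiplication (implemented as a row lookup) where the matrix entries are learnt during training. 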

**Embedding layers are used in text processing to come up with numerical vector representations of words.**


Keras offers an Embedding layer that can be used for neural networks on text data.

It requires that the input data be integer encoded, so that each word is represented by a unique integer. This data preparation step can be performed using the Tokenizer API also provided with Keras.

**The Embedding layer is initialized with random weights and will learn an embedding for all of the words in the training dataset.**

It is a flexible layer that can be used in a variety of ways, such as:

i) It can be used alone to learn a word embedding that can be saved and used in another model later.

ii) It can be used as part of a deep learning model where the embedding is learned along with the model itself.

iii) It can be used to load a pre-trained word embedding model, a type of transfer learning.

The Embedding layer is defined as the **first hidden layer of a network.**

It is configured with 3 main arguments (`input_dim` and `output_dim` are required, `input_length` is optional):

**input_dim**: This is the size of the vocabulary in the text data. For example, if your data is integer encoded to values between 0-10, then the size of the vocabulary would be 11 words. (i.e. The value of this parameter is an integer which is the Size of the vocabulary, i.e. maximum integer index + 1)

**output_dim**: This is the size of the vector space in which words will be embedded. It defines the size of the output vectors from this layer for each word. For example, it could be 32 or 100 or even larger. Test different values for your problem.

**input_length**: This is the length of input sequences, as you would define for any input layer of a Keras model. For example, if all of your input documents are comprised of 1000 words, this would be 1000.


### Build a Custom Function

In [None]:
def RNN(embedding_dim=500, lstm_units=64, dense_units=256, dropout_rate=0.5):
    """Build the (uncompiled) LSTM sentiment classifier.

    Architecture: Input -> Embedding -> LSTM -> Dense + ReLU -> Dropout ->
    Dense(1) + sigmoid. The sequence length and the vocabulary size are read
    from the module-level ``max_len`` and ``vocab_size``. Called without
    arguments it builds exactly the network this notebook was written for.

    Args:
        embedding_dim: Size of the vector every word index is embedded into.
        lstm_units: Number of units of the LSTM layer.
        dense_units: Width of the fully connected layer ``FC1``.
        dropout_rate: Fraction of the ``FC1`` activations dropped in training.

    Returns:
        A Keras ``Model`` that maps a ``(batch, max_len)`` integer matrix to a
        ``(batch, 1)`` sigmoid output.
    """
    inputs = Input(name='inputs', shape=[max_len])

    # Turn every word index into a dense vector of size embedding_dim.
    # Index 0 is the padding value and real words use 1..vocab_size, hence
    # input_dim = vocab_size + 1 (every input index must be < input_dim).
    # mask_zero=True marks the zero-padded positions so the LSTM skips them.
    # Output shape: (None, max_len, embedding_dim); `None` is the batch dimension.
    # num_params = input_dim * output_dim
    #   (with vocab_size 2688 and the defaults: 2689 * 500 = 1344500)
    layer = Embedding(
        input_dim=vocab_size + 1,
        output_dim=embedding_dim,
        input_length=max_len,
        mask_zero=True,
    )(inputs)

    # num_params = 4 * [(num_units + input_dim + 1) * num_units]
    #   (defaults: 4 * [(64 + 500 + 1) * 64] = 144640)
    layer = LSTM(lstm_units)(layer)

    layer = Dense(dense_units, name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(dropout_rate)(layer)
    layer = Dense(1, name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    return Model(inputs=inputs, outputs=layer)

### Call the Custom Function to Build the Model

In [None]:
# Instantiate the network and print its layers with their parameter counts
model = RNN()
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          [(None, 40)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 40, 500)           1344500   
_________________________________________________________________
lstm (LSTM)                  (None, 64)                144640    
_________________________________________________________________
FC1 (Dense)                  (None, 256)               16640     
_________________________________________________________________
activation (Activation)      (None, 256)               0         
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
out_layer (Dense)            (None, 1)                 257   

In [None]:
# Binary cross-entropy fits the single sigmoid output; accuracy is reported
# alongside the loss during training
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

## Train the Model

In [None]:
# Train for a fixed 50 epochs in mini-batches of 50 reviews.
# NOTE(review): the held-out test set doubles as validation data here, so any
# decision based on these validation numbers leaks test information.
model.fit(sequences_matrix_train, y_train.values, batch_size=50,epochs=50, 
          validation_data = (sequences_matrix_test, y_test.values))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f3328889ed0>

If you look at the validation loss, it actually starts increasing after some epochs; the best model appears much earlier than the 50th epoch. Therefore, we could have stopped training earlier. We can keep track of this with the Keras `EarlyStopping` callback.


### Test the Model

In [None]:
# Sigmoid output of the model for every padded test review
predictions = model.predict(sequences_matrix_test)

#### Evaluate with ROC-AUC Score

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# Area under the ROC curve, computed from the raw model outputs (no thresholding)
roc_auc_score(y_test,predictions)

0.845679012345679

### Make a Prediction

In [None]:
# Pick any review of the test set by its position
# (90 is a positive example, 20 would be a negative one)
index = 90

review = sequences_test[index]

# Translate the token indices back into words
review_words = [tok.index_word[token_id] for token_id in review]
review_words

['this',
 'early',
 'film',
 'from',
 'future',
 'is',
 'a',
 'very',
 'good',
 'addition',
 'to',
 'the',
 'giallo',
 'genre']

In [None]:
# Score the single selected review.
# The model expects a 2-D batch, so the 1-D padded sequence is reshaped to
# (1, sequence_length); -1 lets NumPy infer the length instead of repeating the
# hard-coded 40, which would break as soon as max_len is changed.
pred = model.predict(sequences_matrix_test[index].reshape(1, -1))[0][0]
print(pred)

0.9999925
