# IMDB Ratings Sentimental Analysis

In [4]:
import os 
import numpy as np
import re
import string 
import shutil
import tensorflow as tf
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, Conv2D, AveragePooling1D
from tensorflow.keras.preprocessing import text_dataset_from_directory
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras import Sequential
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.layers import LSTM, GlobalMaxPool1D, Bidirectional
from keras.layers import Flatten, Conv1D
from keras.layers.convolutional import MaxPooling1D
from tensorflow.keras.layers import Dropout

## Utilities/Definitions used for simplification of tasks

In [7]:
def getIMDBData():
    url = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
    dataset = tf.keras.utils.get_file("aclImdb_v1.tar.gz", 
                                      url, 
                                      untar=True, 
                                      cache_dir='.',
                                      cache_subdir='')
    return os.path.join(os.path.dirname(dataset), 'aclImdb')

def removeRedundantDirectory(datasetDir):
    trainDir = os.path.join(datasetDir, 'train')
    unsupDir = os.path.join(trainDir, 'unsup')
    shutil.rmtree(unsupDir)

def makeTextsLowerCase(inputData):
    return tf.strings.lower(inputData)

def removeHTMLTags(inputData):
    return tf.strings.regex_replace(inputData, '<br />', 
                                    ' ')

def removePunctuations(inputData):
    return tf.strings.regex_replace(inputData,
                                  '[%s]' % re.escape(string.punctuation), 
                                  '')

def getWordPrepocessingPredicates(inputData):
    processedData = makeTextsLowerCase(inputData)
    processedData = removePunctuations(processedData)
    processedData = removeHTMLTags(processedData)
    return processedData
    

## Fetch the data

In [3]:
datasetDir = getIMDBData() ## Dataset won't be downloaded if already present
removeRedundantDirectory(datasetDir)

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


## Segregating Training and Validation set

In [8]:
trainSet = text_dataset_from_directory(
    'aclImdb/train', batch_size=300, 
    seed=100, label_mode='int')
testSet = text_dataset_from_directory(
    'aclImdb/test', label_mode='int')

Found 25000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [9]:
vocabularies = 10000
lengthCapOfWords = 250
wordVectorizePredicates = TextVectorization(
    standardize=getWordPrepocessingPredicates,
    max_tokens=vocabularies,
    output_mode='int',
    output_sequence_length=lengthCapOfWords)

textSet = trainSet.map(lambda texts, labels: texts)
wordVectorizePredicates.adapt(textSet)
embeddingDimension=500

## Designing the model

### Case Study 1: All Dense layer architecture

In [10]:
SentimentModel = Sequential([
  wordVectorizePredicates,
  Embedding(vocabularies, embeddingDimension, name="embedding"),
  Dense(30, activation='relu'),
  Dropout(0.2),
  Flatten(),
  Dense(1, activation='sigmoid')
])

In [11]:
SentimentModel.compile(optimizer='adam',
              loss=BinaryCrossentropy(from_logits=False),
              metrics=['accuracy'])

In [15]:
SentimentModel.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization (TextVect (None, 250)               0         
_________________________________________________________________
embedding (Embedding)        (None, 250, 500)          5000000   
_________________________________________________________________
dense (Dense)                (None, 250, 30)           15030     
_________________________________________________________________
dropout (Dropout)            (None, 250, 30)           0         
_________________________________________________________________
module_wrapper (ModuleWrappe (None, 7500)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 7501      
Total params: 5,022,531
Trainable params: 5,022,531
Non-trainable params: 0
______________________________________________

In [12]:
SentimentModel.fit(
    trainSet,
    validation_data=testSet,
    epochs=2)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f84832f2150>

In [16]:
scores = SentimentModel.evaluate(testSet, verbose=0)
print("Accuracy on Validation set: %.2f%%" % (scores[1]*100))

Accuracy on Validation set: 86.64%


### Case Study 2: Convolutional-Dense layer architecture

In [30]:
SentimentModel = Sequential([
  wordVectorizePredicates,
  Embedding(vocabularies, embeddingDimension, name="embedding"),
  Conv1D(filters=40, kernel_size=3, padding='same', activation='relu'),
  Dropout(0.2),
  Flatten(),
  Dense(20, activation='relu'),
  Dense(1, activation='sigmoid')
])

In [31]:
SentimentModel.compile(optimizer='adam',
              loss=BinaryCrossentropy(from_logits=False),
              metrics=['accuracy'])

In [34]:
SentimentModel.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization (TextVect (None, 250)               0         
_________________________________________________________________
embedding (Embedding)        (None, 250, 500)          5000000   
_________________________________________________________________
module_wrapper_15 (ModuleWra (None, 250, 40)           60040     
_________________________________________________________________
dropout_8 (Dropout)          (None, 250, 40)           0         
_________________________________________________________________
module_wrapper_16 (ModuleWra (None, 10000)             0         
_________________________________________________________________
dense_12 (Dense)             (None, 20)                200020    
_________________________________________________________________
dense_13 (Dense)             (None, 1)                

In [32]:
SentimentModel.fit(
    trainSet,
    validation_data=testSet,
    epochs=2)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f8483298ad0>

In [33]:
scores = SentimentModel.evaluate(testSet, verbose=0)
print("Accuracy on Validation set: %.2f%%" % (scores[1]*100))

Accuracy on Validation set: 86.86%


In [None]:
SentimentModel.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization (TextVect (None, 250)               0         
_________________________________________________________________
embedding (Embedding)        (None, 250, 500)          5000000   
_________________________________________________________________
module_wrapper_8 (ModuleWrap (None, 250, 40)           60040     
_________________________________________________________________
dropout_6 (Dropout)          (None, 250, 40)           0         
_________________________________________________________________
module_wrapper_9 (ModuleWrap (None, 10000)             0         
_________________________________________________________________
dense_17 (Dense)             (None, 20)                200020    
_________________________________________________________________
dense_18 (Dense)             (None, 1)                

### Case Study 3: LSTM-Dense layer architecture

In [38]:
SentimentModel = Sequential([
  wordVectorizePredicates,
  Embedding(vocabularies, embeddingDimension, name="embedding"),
  Bidirectional(LSTM(40, return_sequences = True)),
  Dropout(0.2),
  AveragePooling1D(),
  Dense(20, activation="relu"),
  Dense(1, activation='sigmoid')
])

In [39]:
SentimentModel.compile(optimizer='adam',
              loss=BinaryCrossentropy(from_logits=False),
              metrics=['accuracy'])

In [40]:
SentimentModel.fit(
    trainSet,
    validation_data=testSet,
    epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f847e61d310>

In [None]:
scores = SentimentModel.evaluate(testSet, verbose=0)
print("Accuracy on Validation set: %.2f%%" % (scores[1]*100))

Accuracy: 85.78%


In [None]:
SentimentModel.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization_1 (TextVe (None, 250)               0         
_________________________________________________________________
embedding (Embedding)        (None, 250, 500)          5000000   
_________________________________________________________________
bidirectional (Bidirectional (None, 250, 64)           136448    
_________________________________________________________________
global_average_pooling1d (Gl (None, 64)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 20)                1300      
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 21        
Total params: 5,137,769
Trainable params: 5,137,769
Non-trainable params: 0
____________________________________________