# Binary Sentiment Analysis on IMDB Reviews Dataset

# Libraries & Modules

In [1]:
# Manage Warnings
import warnings
warnings.filterwarnings("ignore")

# Libraris & Modules
import os
import sys
import shutil
import string
import re as regex
import matplotlib.pyplot as plt

# TensorFlow Log Level
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# TensorFlow Libraries
import tensorflow as tf

# Environment/Versions
print("Python Env : ", sys.version)
print("TensorFlow : ", tf.__version__)

# Random Generator Seed
random_seed = 47

Python Env :  3.10.12 | packaged by conda-forge | (main, Jun 23 2023, 22:40:32) [GCC 12.3.0]
TensorFlow :  2.12.0


# Downloading & Exploring Dataset

In [2]:
# Dataset Download Path
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

# Retrieve File (and untar)
dataset = tf.keras.utils.get_file(fname="aclImdb_v1", 
                                  origin=url, 
                                  untar=True, 
                                  cache_dir=".", 
                                  cache_subdir="")

# Dataset Directory
dataset_dir = os.path.join(os.path.dirname(dataset), "aclImdb")

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [3]:
# List Dataset Sub-Directories & Files
display(os.listdir(dataset_dir))

['train', 'imdbEr.txt', 'README', 'test', 'imdb.vocab']

In [4]:
# Review Readme File
with open(os.path.join(dataset_dir, 'README'), 'r') as f:
    print(f.read())

Large Movie Review Dataset v1.0

Overview

This dataset contains movie reviews along with their associated binary
sentiment polarity labels. It is intended to serve as a benchmark for
sentiment classification. This document outlines how the dataset was
gathered, and how to use the files provided. 

Dataset 

The core dataset contains 50,000 reviews split evenly into 25k train
and 25k test sets. The overall distribution of labels is balanced (25k
pos and 25k neg). We also include an additional 50,000 unlabeled
documents for unsupervised learning. 

In the entire collection, no more than 30 reviews are allowed for any
given movie because reviews for the same movie tend to have correlated
ratings. Further, the train and test sets contain a disjoint set of
movies, so no significant performance is obtained by memorizing
movie-unique terms and their associated with observed labels.  In the
labeled train/test sets, a negative review has a score <= 4 out of 10,
and a positive review has a scor

In [5]:
# Train & Test Directories
display(os.listdir(f"{dataset_dir}/train"))
display(os.listdir(f"{dataset_dir}/test"))

['urls_neg.txt',
 'unsup',
 'pos',
 'urls_pos.txt',
 'neg',
 'labeledBow.feat',
 'urls_unsup.txt',
 'unsupBow.feat']

['urls_neg.txt', 'pos', 'urls_pos.txt', 'neg', 'labeledBow.feat']

In [6]:
# Training Data Directory
train_dir = os.path.join(dataset_dir, "train")
train_dir_items = os.listdir(train_dir)
train_dir_items

['urls_neg.txt',
 'unsup',
 'pos',
 'urls_pos.txt',
 'neg',
 'labeledBow.feat',
 'urls_unsup.txt',
 'unsupBow.feat']

In [7]:
# Testing Data Directory
test_dir = os.path.join(dataset_dir, "test")
os.listdir(test_dir)

['urls_neg.txt', 'pos', 'urls_pos.txt', 'neg', 'labeledBow.feat']

In [8]:
# View a sample positive review.
train_files = os.listdir(os.path.join(train_dir, "pos"))
sample_fname = train_files[2]
print(f"File : {sample_fname}\n")
with open(os.path.join(train_dir, f"pos/{sample_fname}")) as f:
    print(f.read())

File : 8303_10.txt

This film quite literally has every single action movie cliche and all of them work to its advantage. Straight from Lethal Weapon Gary Busey wisecracks, shoots and chuckles through this film with such reckless abandonment it can't help but amuse and entertain. There are tanks, helicopters, machine gun battles, grenades and ice cream vans and if they aren't good enough reasons to watch this film then how about the best one...Danny Trejo. And if you don't know who Danny Trejo is then you probably won't like this film.


In [9]:
tf.keras.utils.text_dataset_from_directory?

In [10]:
# Prepare Training Directory as per 'text_dataset_from_directory' Requirements.
# -----------------------------------------------------------------------------
# Create New Directory for Unsupervised Data
train_unsup_dir = os.path.join(dataset_dir, "train_unsup")
if not os.path.exists(train_unsup_dir):
    os.makedirs(train_unsup_dir)
else:
    pass
display(os.listdir(dataset_dir))
# Collect Unsupervised Data Files and Directories
file_list = [_ for _ in train_dir_items if "unsup" in _]
# Source & Destination Directories
source_dir = train_dir
destination_dir = train_unsup_dir
# Move Files and Directories
for file in file_list:
    source_file = os.path.join(source_dir, file)
    destination_file = os.path.join(destination_dir, file)
    # Move as per Directory/File
    if os.path.isdir(source_file):
        shutil.copytree(source_file, destination_file)
        shutil.rmtree(source_file)
    else:
        shutil.move(source_file, destination_file)

# Review Training & Unsupervised Training Data Directories
display(os.listdir(train_dir))
display(os.listdir(train_unsup_dir))

['train_unsup', 'train', 'imdbEr.txt', 'README', 'test', 'imdb.vocab']

['urls_neg.txt', 'pos', 'urls_pos.txt', 'neg', 'labeledBow.feat']

['unsup', 'urls_unsup.txt', 'unsupBow.feat']

# Training, Validation & Testing Datasets

In [11]:
# Training Dataset
print("Training Dataset :")
train_ds = tf.keras.utils.text_dataset_from_directory(directory=train_dir, 
                                                      batch_size=8, 
                                                      validation_split=0.2, 
                                                      subset='training', 
                                                      seed=random_seed)

# Validation Dataset
print("\n")
print("Validation Dataset :")
val_ds = tf.keras.utils.text_dataset_from_directory(directory=train_dir, 
                                                    batch_size=8, 
                                                    validation_split=0.2, 
                                                    subset='validation', 
                                                    seed=random_seed)

# Testing Dataset
print("\n")
print("Testing Dataset :")
test_ds = tf.keras.utils.text_dataset_from_directory(directory=test_dir, 
                                                     batch_size=8)

Training Dataset :
Found 25000 files belonging to 2 classes.
Using 20000 files for training.


Validation Dataset :
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


Testing Dataset :
Found 25000 files belonging to 2 classes.


In [12]:
# Review Dataset
for x, y in train_ds.take(1):
    print(f"Dataset Element 1 (Labels)  : {tf.shape(y)}")
    print(f"Dataset Element 1 (Reviews) : {tf.shape(x)}\n")
    for i in range(5):
        tf.print(f"LABEL  : {y[i]}")
        tf.print(f"REVIEW : {x[i]}")
        tf.print("-"*80)

Dataset Element 1 (Labels)  : [8]
Dataset Element 1 (Reviews) : [8]

LABEL  : 0
REVIEW : b"I wouldn't say this is a *bad* movie. Unfortunately for me, I get the feeling that the more you know about fencing, the worse it gets simply due to the fact that it becomes totally unrealistic. I've been fencing since i was 14 years old, and this movie portrays it very poorly. F. Murray Abraham is good (and appears to have some fencing background), but most of the other actors--especially the students--just seem to be lost."
--------------------------------------------------------------------------------
LABEL  : 1
REVIEW : b'Family problems abound in real life and that is what this movie is about. Love can hold the members together through out the ordeals and trials and that is what this movie is about. One man, Daddy, has the maturity and fortitude to sustain the family in the face of adversity. The kids grow up,one all be it, in the hard way, to realize that no matter how old they or a parent 

In [13]:
# Class Names Corresponding to Index (as Numerical Label)
display(train_ds.class_names)

['neg', 'pos']

# Data Preparation

## Custom Standardization Function

Removing Punctuations & HTML Elements.

In [14]:
# Function Definition : Remove HTML Tags & Punctuations
def custom_standardize(input_data):
    """Remove HTML Tags & Punctuations"""
    
    # Convert to Lowercase
    lowercase = tf.strings.lower(input_data)
    
    # Remove the '<br />' Tag
    stripped_html = tf.strings.regex_replace(lowercase, "<br />", " ")
    
    # Remove Punctuations
    stripped_html_punc = tf.strings.regex_replace(stripped_html,
        "[%s]" % regex.escape(string.punctuation), ""
    )
    
    return stripped_html_punc

## Tokenization & Vectorization

- Custom function for Standardization
- Splitting strings into tokens. --> tf.keras.layers.TextVectorization()
- Converting tokens into numbers for neural network.

In [15]:
# Calculate Maximum Features Value
# --------------------------------
# Collect all Word Tokens provided in Dataset
vocab_tokens = list()
with open(os.path.join(dataset_dir, "imdb.vocab"), "r") as f:
    for line in f:
        vocab_tokens.append(line.strip())
vocab_tokens = set(vocab_tokens)       

# Calc. Max. Features rounded to next hundred
max_features = round(len(vocab_tokens)+100, -2)

# Delete Token Object
del vocab_tokens

# Display Max. Features
print("Maximum Features(Tokens) in Entire Dataset : ", max_features)

# Set Maximum Sequence Length
# ---------------------------
max_seq_len = 250
print("Maximum Output Sequence Length of Vectorization Layer : ", max_seq_len)

Maximum Features(Tokens) in Entire Dataset :  89600
Maximum Output Sequence Length of Vectorization Layer :  250


In [16]:
# Vectorization Layer