# Binary Sentiment Analysis on IMDB Reviews Dataset

# Libraries & Modules

In [1]:
# Manage Warnings
# Installs a blanket "ignore" filter, so every Python warning raised after this
# point is hidden for the rest of the session.
# NOTE(review): this also hides deprecation warnings from libraries imported
# below; narrow the filter if those should stay visible.
import warnings
warnings.filterwarnings("ignore")

# Modules
import os
import sys
import shutil
import string
import re as regex
import matplotlib.pyplot as plt

# TensorFlow Log Level
# Set in the process environment *before* `import tensorflow` below, so the
# variable is already present when the library is loaded.
# NOTE(review): level '2' is presumably meant to hide INFO and WARNING messages
# from TensorFlow's native logging -- confirm against the TensorFlow docs.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# TensorFlow Libraries
import tensorflow as tf

# Environment/Versions
# Report the interpreter and TensorFlow versions this run was produced with.
for label, version in (("Python Env : ", sys.version),
                       ("TensorFlow : ", tf.__version__)):
    print(label, version)

Python Env :  3.10.12 | packaged by conda-forge | (main, Jun 23 2023, 22:40:32) [GCC 12.3.0]
TensorFlow :  2.12.0


# Dataset

## Downloading & Exploring

In [2]:
# Dataset Download Path
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

# Retrieve File (and untar)
# The archive is cached in the current working directory (cache_dir="." with an
# empty cache_subdir) and unpacked because untar=True.
download_options = dict(
    fname="aclImdb_v1",
    origin=url,
    untar=True,
    cache_dir=".",
    cache_subdir="",
)
dataset = tf.keras.utils.get_file(**download_options)

# Dataset Directory
# The "aclImdb" folder is looked up next to the path returned by get_file.
download_root = os.path.dirname(dataset)
dataset_dir = os.path.join(download_root, "aclImdb")

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [3]:
# List Dataset Sub-Directories & Files
# Top-level entries of the extracted dataset folder.
dataset_entries = os.listdir(dataset_dir)
display(dataset_entries)

['train', 'test', 'README', 'imdbEr.txt', 'imdb.vocab']

In [4]:
# Review Readme File
# Decode explicitly as UTF-8 so the text read here does not depend on the
# platform's default encoding (which differs between operating systems).
readme_path = os.path.join(dataset_dir, 'README')
with open(readme_path, 'r', encoding='utf-8') as f:
    print(f.read())

Large Movie Review Dataset v1.0

Overview

This dataset contains movie reviews along with their associated binary
sentiment polarity labels. It is intended to serve as a benchmark for
sentiment classification. This document outlines how the dataset was
gathered, and how to use the files provided. 

Dataset 

The core dataset contains 50,000 reviews split evenly into 25k train
and 25k test sets. The overall distribution of labels is balanced (25k
pos and 25k neg). We also include an additional 50,000 unlabeled
documents for unsupervised learning. 

In the entire collection, no more than 30 reviews are allowed for any
given movie because reviews for the same movie tend to have correlated
ratings. Further, the train and test sets contain a disjoint set of
movies, so no significant performance is obtained by memorizing
movie-unique terms and their associated with observed labels.  In the
labeled train/test sets, a negative review has a score <= 4 out of 10,
and a positive review has a scor

In [5]:
# Train & Test Directories
# Show the contents of each split folder, training split first.
for split_name in ("train", "test"):
    display(os.listdir(f"{dataset_dir}/{split_name}"))

['labeledBow.feat',
 'neg',
 'pos',
 'urls_unsup.txt',
 'unsup',
 'urls_neg.txt',
 'unsupBow.feat',
 'urls_pos.txt']

['labeledBow.feat', 'neg', 'pos', 'urls_neg.txt', 'urls_pos.txt']

In [6]:
# Training Data Directory
# `train_dir` is the path of the "train" folder inside the dataset directory.
# `train_dir_items` is a snapshot of that folder's entries taken when this cell
# runs; it is not refreshed if the folder's contents change afterwards.
train_dir = os.path.join(dataset_dir, "train")
train_dir_items = os.listdir(train_dir)
# Bare last expression: shown as this cell's output.
train_dir_items

['labeledBow.feat',
 'neg',
 'pos',
 'urls_unsup.txt',
 'unsup',
 'urls_neg.txt',
 'unsupBow.feat',
 'urls_pos.txt']

In [7]:
# Testing Data Directory
# Path of the "test" folder inside the dataset directory, followed by a listing
# of its entries as the cell's output.
test_dir = os.path.join(dataset_dir, "test")
test_dir_items = os.listdir(test_dir)
test_dir_items

['labeledBow.feat', 'neg', 'pos', 'urls_neg.txt', 'urls_pos.txt']

In [8]:
# View a sample positive review.
train_pos_dir = os.path.join(train_dir, "pos")
train_files = os.listdir(train_pos_dir)
# NOTE: os.listdir returns entries in arbitrary order, so the file found at
# this index can differ between machines and filesystems.
sample_fname = train_files[2]
print(f"File : {sample_fname}\n")
# Reviews are free-form user text and may contain non-ASCII characters, so
# decode explicitly as UTF-8 instead of relying on the platform's default
# encoding (which can mis-decode or raise UnicodeDecodeError).
with open(os.path.join(train_pos_dir, sample_fname), encoding="utf-8") as f:
    print(f.read())

File : 1751_8.txt

The 40 Year Old Virgin, is about Andy Stitzer, a forty year old man who works in an electronic store and doesn't have much of a social life and is very awkward around women. Some of his co-workers at the store invite him out one night and they discover that Andy, is still a virgin so they plan to help him lose his virginity. One day in the store Andy, meets a woman named Trish, who gives him her phone number and eventually Andy, works up enough courage to go on a date with her and they start to really like each other but Andy, is still very awkward when it comes to sex and he is going to have to tell this to Trish, much to his embarrassment if he can actually get up enough courage to tell her before things get awkward. The 40 Year Old Virgin, has good direction, a good script, good comedic performances by the whole cast, good cinematography and good film editing. The film stars and is co-written by Steve Carell, who does a very good comedic breakthrough performance a

In [9]:
# IPython-only introspection (`obj?` shows the object's docstring); this is not
# valid plain-Python syntax.
# NOTE(review): development aid -- consider removing it from the finished notebook.
tf.keras.utils.text_dataset_from_directory?

In [10]:
# Prepare Training Directory as per 'text_dataset_from_directory' Requirements.
# -----------------------------------------------------------------------------
# Every entry of the training directory whose name contains "unsup" is moved to
# a sibling "train_unsup" directory, leaving only the labelled data in place.
# The cell is safe to re-run: once everything has been moved, nothing matches
# and the loop does no work.

# Create New Directory for Unsupervised Data (no error if it already exists)
train_unsup_dir = os.path.join(dataset_dir, "train_unsup")
os.makedirs(train_unsup_dir, exist_ok=True)
display(os.listdir(dataset_dir))

# Source & Destination Directories
source_dir = train_dir
destination_dir = train_unsup_dir

# Collect Unsupervised Data Files and Directories
# List the directory now rather than reusing a snapshot taken in an earlier
# cell, which would be stale after the first successful run.
file_list = [name for name in os.listdir(source_dir) if "unsup" in name]

# Move Files and Directories
for name in file_list:
    source_file = os.path.join(source_dir, name)
    destination_file = os.path.join(destination_dir, name)
    if os.path.isdir(destination_file):
        # shutil.move would place the source *inside* an existing directory;
        # fail loudly instead (e.g. leftovers from an interrupted earlier run).
        raise FileExistsError(
            f"Destination already exists: {destination_file}"
        )
    # shutil.move handles both files and directories, and renames in place when
    # source and destination are on the same filesystem instead of copying
    # every file and deleting the originals.
    shutil.move(source_file, destination_file)

# Review Training & Unsupervised Training Data Directories
display(os.listdir(train_dir))
display(os.listdir(train_unsup_dir))

['train', 'test', 'README', 'imdbEr.txt', 'train_unsup', 'imdb.vocab']

['labeledBow.feat', 'neg', 'pos', 'urls_neg.txt', 'urls_pos.txt']

['urls_unsup.txt', 'unsup', 'unsupBow.feat']