<a href="https://colab.research.google.com/github/rhiosutoyo/Teaching-Deep-Learning-and-Its-Applications/blob/main/8_1_sentiment_analysis_using_imdb_movie_reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Step 1: Install Required Libraries
!pip install torch torchvision torchtext spacy torchdata portalocker>=2.0.0

In [2]:
# Step 2: Import Libraries
import os
import urllib.request
import tarfile
import torch
import torchdata
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, random_split
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence



In [3]:
# Step 3: Set the Seed for Reproducibility
SEED = 88

# Seed every random number generator this notebook relies on with the same
# value: Python's `random`, NumPy's legacy global generator, and PyTorch's
# CPU and current-CUDA-device generators.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

In [4]:
# Step 4: Load and Preprocess the Data
def download_imdb_dataset(url="https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
                          filename="aclImdb_v1.tar.gz",
                          dataset_folder="aclImdb"):
    """Download and unpack the IMDB review archive unless it is already present.

    Args:
        url: Location of the ``.tar.gz`` archive to fetch.
        filename: Local path the archive is temporarily saved to.
        dataset_folder: Folder whose existence means the dataset has already
            been extracted; when it exists the function does nothing.

    The archive is extracted into the current working directory and the
    downloaded file is deleted afterwards, even if the download or the
    extraction fails, so a broken archive never lingers on disk.
    """
    if os.path.exists(dataset_folder):
        return

    try:
        urllib.request.urlretrieve(url, filename)
        with tarfile.open(filename, "r:gz") as tar:
            # The archive comes from the network, so use the safe "data"
            # extraction filter (rejects absolute paths, path traversal and
            # special files) on Python versions that provide it.
            if hasattr(tarfile, "data_filter"):
                tar.extractall(filter="data")
            else:
                tar.extractall()
    finally:
        # Clean up the archive on success and on failure alike.
        if os.path.exists(filename):
            os.remove(filename)

# Fetch and unpack the dataset; this is a no-op when the "aclImdb" folder already exists.
download_imdb_dataset()

def read_imdb_split(split, root='aclImdb'):
    """Read one split of the extracted IMDB dataset from disk.

    Args:
        split: Name of the sub-folder of ``root`` to read (the notebook uses
            ``'train'`` and ``'test'``).
        root: Folder that contains the split sub-folders.

    Returns:
        A list of ``(label, text)`` tuples where ``label`` is ``'pos'`` or
        ``'neg'``. All ``'pos'`` reviews come first, then all ``'neg'``
        reviews, each group ordered by file name.
    """
    samples = []
    for label in ('pos', 'neg'):
        dir_path = f'{root}/{split}/{label}'
        # os.listdir returns entries in arbitrary, filesystem-dependent order.
        # Sorting makes the sample order (and therefore the seeded
        # train/validation split built on top of it) reproducible.
        for fname in sorted(os.listdir(dir_path)):
            if not fname.endswith('.txt'):
                continue
            with open(os.path.join(dir_path, fname), 'r', encoding='utf-8') as f:
                samples.append((label, f.read()))
    return samples

# Load both splits from disk as lists of (label, text) pairs.
test_data = read_imdb_split('test')
train_data = read_imdb_split('train')

# Hold out 5,000 of the training reviews for validation. A dedicated, seeded
# generator keeps the split independent of any other random draws.
split_rng = torch.Generator().manual_seed(SEED)
train_data, valid_data = random_split(train_data, [20000, 5000], generator=split_rng)

# spaCy-backed tokenizer using the small English model.
tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

def yield_tokens(data_iter):
    """Lazily yield the token list of every ``(label, text)`` pair in ``data_iter``."""
    yield from (tokenizer(review) for _label, review in data_iter)

# Build the vocabulary from the training reviews only. The two special tokens
# are registered up front, and any token missing from the vocabulary maps to
# the index of "<unk>".
special_tokens = ["<unk>", "<pad>"]
vocab = build_vocab_from_iterator(yield_tokens(train_data), specials=special_tokens)
vocab.set_default_index(vocab["<unk>"])

In [5]:
# Step 5: Define Text and Label Processing Functions
def text_pipeline(x):
    """Tokenize the string ``x`` and return the list of its vocabulary indices."""
    token_ids = []
    for token in tokenizer(x):
        token_ids.append(vocab[token])
    return token_ids

def label_pipeline(x):
    """Map the label ``'pos'`` to 1 and every other value to 0."""
    return int(x == 'pos')

In [6]:
# Step 6: Create DataLoader
def collate_batch(batch):
    """Turn a list of ``(label, text)`` samples into padded tensors on ``device``.

    Returns:
        A ``(labels, texts, lengths)`` tuple:
        labels  - float32 tensor with one 0/1 target per sample,
        texts   - int64 tensor of token indices, batch-first, right-padded
                  with the ``<pad>`` index up to the longest sample,
        lengths - int64 tensor holding each sample's unpadded token count.
    """
    # Encode every sample exactly once, keeping label and token ids together.
    encoded = [
        (label_pipeline(raw_label), torch.tensor(text_pipeline(raw_text), dtype=torch.int64))
        for raw_label, raw_text in batch
    ]
    targets = torch.tensor([target for target, _ in encoded], dtype=torch.float32)
    sequences = [token_ids for _, token_ids in encoded]
    seq_lengths = torch.tensor([token_ids.size(0) for token_ids in sequences], dtype=torch.int64)
    padded = pad_sequence(sequences, batch_first=True, padding_value=vocab['<pad>'])
    return targets.to(device), padded.to(device), seq_lengths.to(device)

BATCH_SIZE = 64
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Only the training loader is shuffled. The evaluation loaders keep a fixed
# order: evaluate() averages per-batch metrics and the last batch is smaller
# than the others, so shuffling would make the reported validation/test numbers
# depend on which samples happen to land in that last batch (and would consume
# extra draws from the global RNG).
train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(valid_data, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)
test_dataloader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)

In [7]:
# Step 7: Build the Model
class RNN(nn.Module):
    """LSTM sentence classifier: embedding -> (bi)LSTM -> dropout -> linear.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Hidden size of the LSTM (per direction).
        output_dim: Number of output logits per sample.
        n_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM reads the sequence in both directions.
        dropout: Dropout probability applied to the embeddings, between LSTM
            layers, and to the final hidden state.
        pad_idx: Vocabulary index of the padding token.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout, pad_idx):
        super().__init__()
        self.bidirectional = bool(bidirectional)
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        # nn.LSTM only applies dropout between stacked layers and warns when a
        # non-zero value is given for a single-layer network, so disable it there.
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                           bidirectional=self.bidirectional,
                           dropout=dropout if n_layers > 1 else 0.0)
        # The classifier input is one final hidden state per direction.
        num_directions = 2 if self.bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Return one row of ``output_dim`` logits per sample.

        Args:
            text: Batch-first tensor of padded token indices.
            text_lengths: Unpadded length of each sample in ``text``.
        """
        embedded = self.dropout(self.embedding(text))
        # Packing lets the LSTM skip padded positions; lengths must live on the CPU.
        packed_embedded = pack_padded_sequence(embedded, text_lengths.cpu(), batch_first=True, enforce_sorted=False)
        _, (hidden, _) = self.rnn(packed_embedded)
        if self.bidirectional:
            # Last layer: hidden[-2] is the forward direction, hidden[-1] the backward one.
            final_hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            final_hidden = hidden[-1, :, :]
        return self.fc(self.dropout(final_hidden))

In [8]:
# Step 8: Initialize the Model
# Hyper-parameters; the vocabulary fixes the embedding-table size and the padding index.
INPUT_DIM = len(vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = vocab['<pad>']

model = RNN(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)

In [9]:
# Step 9: Set Up the Loss and Optimizer
# Move the model to the target device before building the optimizer, as the
# PyTorch optimizer documentation recommends, so the optimizer is guaranteed to
# reference the parameters that are actually trained.
model = model.to(device)
# Binary cross-entropy on raw logits (the sigmoid is applied inside the loss).
criterion = nn.BCEWithLogitsLoss().to(device)
optimizer = optim.Adam(model.parameters())

In [10]:
# Step 10: Training Loop
def binary_accuracy(preds, y):
    """Return the fraction of logits in ``preds`` whose rounded sigmoid equals ``y``.

    The result is a 0-dim tensor.
    """
    predicted_labels = torch.sigmoid(preds).round()
    hits = predicted_labels.eq(y).float()
    return hits.sum() / len(hits)

def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator``.

    Each batch is a ``(labels, text, text_lengths)`` triple. Returns a
    ``(loss, accuracy)`` pair, each being the mean of the per-batch values.
    """
    model.train()
    running_loss, running_acc = 0, 0
    for labels, text, text_lengths in iterator:
        optimizer.zero_grad()
        # The model emits one logit per sample in a trailing dimension of size 1.
        logits = model(text, text_lengths).squeeze(1)
        batch_loss = criterion(logits, labels)
        batch_acc = binary_accuracy(logits, labels)
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()
        running_acc += batch_acc.item()
    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

def evaluate(model, iterator, criterion):
    """Score ``model`` on ``iterator`` without updating its weights.

    Each batch is a ``(labels, text, text_lengths)`` triple. Returns a
    ``(loss, accuracy)`` pair, each being the mean of the per-batch values.
    """
    model.eval()
    running_loss, running_acc = 0, 0
    # Gradients are not needed for scoring.
    with torch.no_grad():
        for labels, text, text_lengths in iterator:
            logits = model(text, text_lengths).squeeze(1)
            batch_loss = criterion(logits, labels)
            batch_acc = binary_accuracy(logits, labels)
            running_loss += batch_loss.item()
            running_acc += batch_acc.item()
    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

N_EPOCHS = 5
for epoch in range(N_EPOCHS):
    # One pass over the training set, then a scoring pass over the validation set.
    train_loss, train_acc = train(model, train_dataloader, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_dataloader, criterion)
    report_lines = (
        f'Epoch: {epoch+1:02}',
        f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%',
        f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%',
    )
    print(*report_lines, sep='\n')

Epoch: 01
	Train Loss: 0.682 | Train Acc: 56.37%
	 Val. Loss: 0.681 |  Val. Acc: 55.20%
Epoch: 02
	Train Loss: 0.654 | Train Acc: 60.81%
	 Val. Loss: 0.638 |  Val. Acc: 60.70%
Epoch: 03
	Train Loss: 0.603 | Train Acc: 67.15%
	 Val. Loss: 0.537 |  Val. Acc: 73.32%
Epoch: 04
	Train Loss: 0.496 | Train Acc: 76.48%
	 Val. Loss: 0.408 |  Val. Acc: 81.61%
Epoch: 05
	Train Loss: 0.433 | Train Acc: 80.29%
	 Val. Loss: 0.389 |  Val. Acc: 82.73%


In [11]:
# Step 11: Evaluate on Test Data
# Score the trained model once on the held-out test split.
test_metrics = evaluate(model, test_dataloader, criterion)
test_loss, test_acc = test_metrics
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.400 | Test Acc: 82.53%


In [12]:
# Step 12: Test with New Sentences
# Hand-written example reviews (clearly positive, clearly negative, and mixed)
# that are scored one by one with the trained model further below.
test_reviews = [
    "The movie was fantastic! I really enjoyed it.",
    "Absolutely terrible. Worst movie I've seen in years.",
    "It was an okay movie. Nothing special, but not bad either.",
    "I loved the acting and the storyline. Highly recommend!",
    "The plot was very predictable and boring.",
    "A masterpiece of cinema. Truly inspiring and well-made.",
    "I didn't like the film at all. The characters were flat and uninteresting.",
    "Great visuals, but the story lacked depth.",
    "An excellent film with a powerful message.",
    "Not my cup of tea. I found it quite dull and unengaging."
]

def predict_sentiment(model, sentence):
    """Score a single sentence with ``model``.

    Args:
        model: Module called as ``model(text, text_lengths)`` that returns a
            single logit for a batch of one sentence.
        sentence: Raw text to classify.

    Returns:
        The sigmoid of the model's logit as a Python float in [0, 1];
        the notebook treats values >= 0.5 as positive.
    """
    model.eval()
    tokens = text_pipeline(sentence)
    # Batch of one: shape (1, number_of_tokens), with an explicit index dtype.
    text_tensor = torch.tensor(tokens, dtype=torch.int64).unsqueeze(0).to(device)
    text_lengths = torch.tensor([len(tokens)], dtype=torch.int64).to(device)
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        prediction = torch.sigmoid(model(text_tensor, text_lengths))
    return prediction.item()

# Score every example review and report it with a 0.5 decision threshold.
for review in test_reviews:
    sentiment = predict_sentiment(model, review)
    if sentiment >= 0.5:
        label = 'positive'
    else:
        label = 'negative'
    print(f'Review: {review}\nSentiment Score: {sentiment:.4f} ({label})\n')

Review: The movie was fantastic! I really enjoyed it.
Sentiment Score: 0.7823 (positive)

Review: Absolutely terrible. Worst movie I've seen in years.
Sentiment Score: 0.0620 (negative)

Review: It was an okay movie. Nothing special, but not bad either.
Sentiment Score: 0.0143 (negative)

Review: I loved the acting and the storyline. Highly recommend!
Sentiment Score: 0.8376 (positive)

Review: The plot was very predictable and boring.
Sentiment Score: 0.0231 (negative)

Review: A masterpiece of cinema. Truly inspiring and well-made.
Sentiment Score: 0.8245 (positive)

Review: I didn't like the film at all. The characters were flat and uninteresting.
Sentiment Score: 0.0378 (negative)

Review: Great visuals, but the story lacked depth.
Sentiment Score: 0.8064 (positive)

Review: An excellent film with a powerful message.
Sentiment Score: 0.9218 (positive)

Review: Not my cup of tea. I found it quite dull and unengaging.
Sentiment Score: 0.4882 (negative)

