In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
import re

# Load IMDb dataset from HuggingFace
dataset = load_dataset("imdb")

# Tokenizer
def simple_tokenizer(text):
    text = text.lower()
    text = re.sub(r"<.*?>", " ", text)  # remove HTML
    text = re.sub(r"[^\w\s]", "", text)  # remove punctuation
    return text.split()

# Build vocabulary
counter = Counter()
for example in dataset["train"]:
    tokens = simple_tokenizer(example["text"])
    counter.update(tokens)

vocab = {"<pad>": 0, "<unk>": 1}
for word, freq in counter.items():
    if freq >= 5:  # remove rare words
        vocab[word] = len(vocab)

# Encode text
def encode(text):
    return [vocab.get(token, vocab["<unk>"]) for token in simple_tokenizer(text)]

# IMDb Dataset class
class IMDBDataset(Dataset):
    def __init__(self, split):
        self.data = dataset[split]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = encode(self.data[idx]["text"])
        label = torch.tensor(self.data[idx]["label"], dtype=torch.float)
        return torch.tensor(text, dtype=torch.long), label

# Collate function to pad sequences
def collate_batch(batch):
    texts, labels = zip(*batch)
    padded = pad_sequence(texts, batch_first=True, padding_value=0)
    labels = torch.tensor(labels)
    return padded, labels

# DataLoaders
train_loader = DataLoader(IMDBDataset("train"), batch_size=64, shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(IMDBDataset("test"), batch_size=64, shuffle=False, collate_fn=collate_batch)


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
class VanillaRNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super(VanillaRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        embedded = self.embedding(x)
        output, hidden = self.rnn(embedded)   ## h_t = wx x_t + wh h_t-1 + b
        out = self.fc(hidden.squeeze(0))
        return self.sigmoid(out)


In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = VanillaRNN(vocab_size=len(vocab), embed_dim=100, hidden_dim=128).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(3):  # Reduce epochs to avoid long training time
    model.train()
    total_loss = 0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs).squeeze()
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")

# Evaluation
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs).squeeze()
        preds = (outputs >= 0.5).float()
        correct += (preds == labels).sum().item()
        total += labels.size(0)

print(f"Test Accuracy: {correct / total:.4f}")


Epoch 1, Loss: 0.6932
Epoch 2, Loss: 0.6971
Epoch 3, Loss: 0.6954
Test Accuracy: 0.5097


In [13]:
import random

# Get a few random samples from the test set
sample_indices = random.sample(range(len(test_loader.dataset)), 5)
samples = [test_loader.dataset[i] for i in sample_indices]

model.eval()
print("\n🧪 Sample Test Predictions:\n")
for i, (text_tensor, true_label) in enumerate(samples):
    input_ids = text_tensor.unsqueeze(0).to(device)
    with torch.no_grad():
        output = model(input_ids).item()
        pred_label = 1 if output >= 0.5 else 0

    # Decode the input back to readable text
    inv_vocab = {idx: word for word, idx in vocab.items()}
    words = [inv_vocab.get(idx.item(), "<unk>") for idx in text_tensor[:50]]  # show only first 50 tokens

    print(f"Sample {i+1}")
    print(f"📄 Review (truncated): {' '.join(words)}")
    print(f"✅ Predicted: {'positive' if pred_label else 'negative'}")
    print(f"🟡 Actual:    {'positive' if int(true_label) else 'negative'}")
    print("-" * 60)



🧪 Sample Test Predictions:

Sample 1
📄 Review (truncated): this hugely entertaining short is considered one of the best shorts ever and i certainly wont argue with that even in a country where topnotch animated shorts are created with <unk> this film still manages to stand out if you ever get the chance to view this film please do
✅ Predicted: negative
🟡 Actual:    positive
------------------------------------------------------------
Sample 2
📄 Review (truncated): i watched it because my friend said we could try it when my father asked if wed watch it i didnt want to because it was such an old film how could that be good i finally did watch with that friend and my father my friend and i loved
✅ Predicted: negative
🟡 Actual:    positive
------------------------------------------------------------
Sample 3
📄 Review (truncated): <unk> of hand is my favorite rockford files episode of the entire series this episode shows a side of jim rockford that is usually ignored to wit jim is genuinel