In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
import spacy
from collections import Counter
import random
import numpy as np
from tqdm import tqdm
from sklearn.metrics import accuracy_score



In [None]:
!wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


--2026-01-21 21:24:36--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2026-01-21 21:24:44 (10.6 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [None]:
!tar -xzf aclImdb_v1.tar.gz


In [None]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m77.0 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Set random seed for reproducibility
SEED = 1234
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

# Load spaCy tokenizer
nlp = spacy.load("en_core_web_sm")

PAD_IDX = 0
UNK_IDX = 1

def tokenize(text):
    return [token.text.lower() for token in nlp(text)]

# Build vocabulary
def build_vocab(texts, max_vocab_size=25_000):
    counter = Counter(token for text in texts for token in tokenize(text))
    vocab = {word: idx + 2 for idx, (word, _) in enumerate(counter.most_common(max_vocab_size))}
    vocab["<pad>"] = PAD_IDX
    vocab["<unk>"] = UNK_IDX
    return vocab

# Numericalize text
def numericalize(texts, vocab):
    return [[vocab.get(token, UNK_IDX) for token in tokenize(text)] for text in texts]

# Load IMDB dataset
def load_imdb_data(data_dir):
    texts, labels = [], []
    for label_type in ["pos", "neg"]:
        folder = f"{data_dir}/{label_type}"
        for file in os.listdir(folder):
            with open(f"{folder}/{file}", "r", encoding="utf-8") as f:
                texts.append(f.read())
                labels.append(1 if label_type == "pos" else 0)
    return texts, labels


In [None]:
class IMDBDataset(Dataset):
    def __init__(self, texts, labels, vocab):
        self.texts = numericalize(texts, vocab)
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return torch.tensor(self.texts[idx], dtype=torch.long), torch.tensor(self.labels[idx], dtype=torch.float)

def collate_fn(batch):
    texts, labels = zip(*batch)
    lengths = torch.tensor([len(text) for text in texts])
    padded_texts = pad_sequence(texts, batch_first=True, padding_value=PAD_IDX)
    labels = torch.tensor(labels, dtype=torch.float)
    return padded_texts, labels, lengths


In [None]:
class LogisticRegression(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super(LogisticRegression, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=PAD_IDX)
        self.fc = nn.Linear(embed_dim, 1)

    def forward(self, x):
        embedded = self.embedding(x)  # Shape: [batch_size, seq_len, embed_dim]
        pooled = embedded.mean(dim=1)  # Average over the sequence length
        return self.fc(pooled).squeeze(1)  # Shape: [batch_size]


In [None]:
def train_model(model, dataloader, optimizer, criterion, device):
    model.train()
    epoch_loss = 0
    for texts, labels, _ in tqdm(dataloader):
        texts, labels = texts.to(device), labels.to(device)
        optimizer.zero_grad()
        predictions = model(texts)
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(dataloader)

def evaluate_model(model, dataloader, criterion, device):
    model.eval()
    epoch_loss = 0
    all_preds, all_labels = [], []
    with torch.no_grad():
        for texts, labels, _ in dataloader:
            texts, labels = texts.to(device), labels.to(device)
            predictions = model(texts)
            loss = criterion(predictions, labels)
            epoch_loss += loss.item()
            all_preds.extend(torch.round(torch.sigmoid(predictions)).cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    accuracy = accuracy_score(all_labels, all_preds)
    return epoch_loss / len(dataloader), accuracy


In [None]:
# Load data
import os
train_texts, train_labels = load_imdb_data("./aclImdb/train")
test_texts, test_labels = load_imdb_data("./aclImdb/test")



In [None]:

# Split data
train_texts, valid_texts = train_texts[:20000], train_texts[20000:]
train_labels, valid_labels = train_labels[:20000], train_labels[20000:]

# Build vocabulary
vocab = build_vocab(train_texts)



In [None]:
# Create datasets and dataloaders
train_dataset = IMDBDataset(train_texts, train_labels, vocab)
valid_dataset = IMDBDataset(valid_texts, valid_labels, vocab)
test_dataset = IMDBDataset(test_texts, test_labels, vocab)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)


In [None]:

# Model, optimizer, and loss function
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LogisticRegression(len(vocab), embed_dim=100).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

# Train the model
N_EPOCHS = 10
for epoch in range(N_EPOCHS):
    train_loss = train_model(model, train_loader, optimizer, criterion, device)
    valid_loss, valid_acc = evaluate_model(model, valid_loader, criterion, device)
    print(f"Epoch {epoch+1}: Train Loss = {train_loss:.4f}, Valid Loss = {valid_loss:.4f}, Valid Accuracy = {valid_acc:.4f}")

100%|██████████| 625/625 [00:17<00:00, 35.39it/s]


Epoch 1: Train Loss = 0.6441, Valid Loss = 0.8067, Valid Accuracy = 0.1746


100%|██████████| 625/625 [00:15<00:00, 39.93it/s]


Epoch 2: Train Loss = 0.5264, Valid Loss = 0.6686, Valid Accuracy = 0.5788


100%|██████████| 625/625 [00:15<00:00, 40.28it/s]


Epoch 3: Train Loss = 0.4083, Valid Loss = 0.5558, Valid Accuracy = 0.7046


100%|██████████| 625/625 [00:15<00:00, 39.33it/s]


Epoch 4: Train Loss = 0.3346, Valid Loss = 0.4895, Valid Accuracy = 0.7594


100%|██████████| 625/625 [00:15<00:00, 40.12it/s]


Epoch 5: Train Loss = 0.2895, Valid Loss = 0.4304, Valid Accuracy = 0.8016


100%|██████████| 625/625 [00:15<00:00, 39.97it/s]


Epoch 6: Train Loss = 0.2563, Valid Loss = 0.4324, Valid Accuracy = 0.8064


100%|██████████| 625/625 [00:16<00:00, 38.94it/s]


Epoch 7: Train Loss = 0.2297, Valid Loss = 0.4098, Valid Accuracy = 0.8224


100%|██████████| 625/625 [00:15<00:00, 39.73it/s]


Epoch 8: Train Loss = 0.2076, Valid Loss = 0.4004, Valid Accuracy = 0.8286


100%|██████████| 625/625 [00:15<00:00, 39.59it/s]


Epoch 9: Train Loss = 0.1887, Valid Loss = 0.3518, Valid Accuracy = 0.8544


100%|██████████| 625/625 [00:15<00:00, 39.54it/s]


Epoch 10: Train Loss = 0.1709, Valid Loss = 0.3804, Valid Accuracy = 0.8448
