In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")

print("Path to dataset files:", path)


Path to dataset files: /kaggle/input/imdb-dataset-of-50k-movie-reviews


In [1]:
# !pip install pandas scikit-learn torch torchvision torchaudio tqdm kagglehub --quiet

import os, re, random, gc, glob
import pandas as pd
import numpy as np
from collections import Counter

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm

# ---- Kaggle dataset path ----
import kagglehub
DATASET_DIR = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")
print("DATASET_DIR:", DATASET_DIR)

# Try to auto-detect the CSV
candidates = glob.glob(os.path.join(DATASET_DIR, "**", "*.csv"), recursive=True)
CSV_FILE = os.path.basename(candidates[0]) if candidates else "IMDB Dataset.csv"
print("CSV_FILE:", CSV_FILE)

# ---- General config ----
TEXT_COL, LABEL_COL = "review", "sentiment"
MIN_FREQ, MAX_VOCAB_SIZE = 2, 50000
MAX_LEN = 250

EMBED_DIM = 100          # use 100 if you load glove.6B.100d.txt
HIDDEN_DIM = 128
BATCH_SIZE = 64
EPOCHS, LR = 5, 2e-3
VAL_SPLIT, TEST_SPLIT = 0.1, 0.1
SEED = 42

# GloVe (set to file path or keep None for learned embeddings)
GLOVE_TXT_PATH = None  # e.g., "/content/glove.6B.100d.txt"

random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device


DATASET_DIR: /kaggle/input/imdb-dataset-of-50k-movie-reviews
CSV_FILE: IMDB Dataset.csv


device(type='cuda')

In [2]:
def basic_clean(text: str) -> str:
    text = text.lower()
    text = re.sub(r"<br\s*/?>", " ", text)
    text = re.sub(r"<.*?>", " ", text)
    text = re.sub(r"[^a-z0-9' ]+", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

df = pd.read_csv(os.path.join(DATASET_DIR, CSV_FILE))
df = df[[TEXT_COL, LABEL_COL]].dropna()

label_map = {"negative": 0, "positive": 1}
df["label"] = df[LABEL_COL].map(label_map).astype(int)
df["clean"] = df[TEXT_COL].map(basic_clean)

df.head(3)


Unnamed: 0,review,sentiment,label,clean
0,One of the other reviewers has mentioned that ...,positive,1,one of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,positive,1,a wonderful little production the filming tech...
2,I thought this was a wonderful way to spend ti...,positive,1,i thought this was a wonderful way to spend ti...


In [3]:
def tokenize(s: str):
    return s.split()

counter = Counter()
for s in df["clean"]:
    counter.update(tokenize(s))

PAD, UNK, BOS, EOS = "<PAD>", "<UNK>", "<BOS>", "<EOS>"

most_common = [w for w, c in counter.most_common() if c >= MIN_FREQ]
most_common = most_common[:MAX_VOCAB_SIZE - 4]

itos = [PAD, UNK, BOS, EOS] + most_common
stoi = {w: i for i, w in enumerate(itos)}
vocab_size = len(itos)
vocab_size


50000

In [4]:
def encode(tokens, add_bos_eos=True, max_len=MAX_LEN):
    ids = [stoi.get(t, stoi[UNK]) for t in tokens]
    if add_bos_eos:
        ids = [stoi[BOS]] + ids + [stoi[EOS]]
    if len(ids) < max_len:
        ids += [stoi[PAD]] * (max_len - len(ids))
    else:
        ids = ids[:max_len]
    return ids

df["ids"] = df["clean"].map(lambda s: encode(tokenize(s)))

X = np.stack(df["ids"].values)
y = df["label"].values

X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=TEST_SPLIT, random_state=SEED, stratify=y
)
val_size = VAL_SPLIT / (1 - TEST_SPLIT)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=val_size, random_state=SEED, stratify=y_temp
)

len(X_train), len(X_val), len(X_test)


(40000, 5000, 5000)

In [5]:
class IMDBDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.float32)
    def __len__(self): return len(self.X)
    def __getitem__(self, idx): return self.X[idx], self.y[idx]

train_dl = DataLoader(IMDBDataset(X_train, y_train), batch_size=BATCH_SIZE, shuffle=True,  num_workers=2, pin_memory=True)
val_dl   = DataLoader(IMDBDataset(X_val,   y_val),   batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)
test_dl  = DataLoader(IMDBDataset(X_test,  y_test),  batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)


In [6]:
def load_glove(path, embed_dim=EMBED_DIM):
    word2vec = {}
    with open(path, "r", encoding="utf8") as f:
        for line in f:
            parts = line.rstrip().split(" ")
            w, vec = parts[0], np.asarray(parts[1:], dtype=np.float32)
            if vec.size == embed_dim: word2vec[w] = vec
    return word2vec

def build_embedding_matrix(itos, glove_dict, embed_dim=EMBED_DIM):
    mat = np.random.normal(0, 0.05, (len(itos), embed_dim)).astype(np.float32)
    mat[0] = 0.0  # PAD row zeros
    for i, w in enumerate(itos):
        if w in (PAD, UNK, BOS, EOS): continue
        if w in glove_dict: mat[i] = glove_dict[w]
    return torch.tensor(mat)

embedding_matrix = None
if GLOVE_TXT_PATH and os.path.exists(GLOVE_TXT_PATH):
    glove = load_glove(GLOVE_TXT_PATH, EMBED_DIM)
    embedding_matrix = build_embedding_matrix(itos, glove, EMBED_DIM)
    print("GloVe loaded; matrix:", embedding_matrix.shape)
else:
    print("GloVe not provided → using learned embeddings.")


GloVe not provided → using learned embeddings.


In [7]:
class RNNClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, use_glove=False, embedding_matrix=None):
        super().__init__()
        if use_glove and embedding_matrix is not None:
            self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False, padding_idx=0)
        else:
            self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True)
        self.fc  = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        x = self.embedding(x)
        _, h = self.rnn(x)            # h: (1,B,H)
        return self.fc(h.squeeze(0)).squeeze(1)

class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, use_glove=False, embedding_matrix=None):
        super().__init__()
        if use_glove and embedding_matrix is not None:
            self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False, padding_idx=0)
        else:
            self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc   = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        x = self.embedding(x)
        _, (h, _) = self.lstm(x)      # h: (1,B,H)
        return self.fc(h.squeeze(0)).squeeze(1)


In [8]:
def run_epoch(model, loader, criterion, optimizer=None):
    train = optimizer is not None
    model.train(train)
    losses, preds, trues = [], [], []
    for xb, yb in loader:
        xb, yb = xb.to(device), yb.to(device)
        with torch.set_grad_enabled(train):
            logits = model(xb)
            loss = criterion(logits, yb)
            if train:
                optimizer.zero_grad(set_to_none=True)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
                optimizer.step()
        losses.append(loss.item())
        preds.extend((torch.sigmoid(logits) > 0.5).long().cpu().numpy())
        trues.extend(yb.long().cpu().numpy())
    return float(np.mean(losses)), accuracy_score(trues, preds), f1_score(trues, preds)

def train_model(model, train_dl, val_dl, epochs=EPOCHS, lr=LR):
    model = model.to(device)
    crit = nn.BCEWithLogitsLoss()
    opt  = torch.optim.Adam(model.parameters(), lr=lr)

    best_f1, best_state = -1, None
    for ep in range(1, epochs+1):
        tr_l, tr_a, tr_f = run_epoch(model, train_dl, crit, opt)
        va_l, va_a, va_f = run_epoch(model, val_dl,   crit, None)
        print(f"Epoch {ep:02d} | train loss={tr_l:.4f} acc={tr_a:.4f} f1={tr_f:.4f} || "
              f"val loss={va_l:.4f} acc={va_a:.4f} f1={va_f:.4f}")
        if va_f > best_f1:
            best_f1, best_state = va_f, {k: v.cpu() for k, v in model.state_dict().items()}
    if best_state:
        model.load_state_dict(best_state)
        model = model.to(device)
    return model

def test_model(model, test_dl):
    crit = nn.BCEWithLogitsLoss()
    te_l, te_a, te_f = run_epoch(model, test_dl, crit, None)
    print(f"TEST  | loss={te_l:.4f} acc={te_a:.4f} f1={te_f:.4f}")
    return {"loss": te_l, "acc": te_a, "f1": te_f}



In [9]:
def run_experiment(model_cls, use_glove, name):
    model = model_cls(
        vocab_size=vocab_size,
        embed_dim=EMBED_DIM,
        hidden_dim=HIDDEN_DIM,
        use_glove=use_glove and (embedding_matrix is not None),
        embedding_matrix=embedding_matrix if use_glove else None
    )
    print(f"\n=== {name} ===")
    model = train_model(model, train_dl, val_dl, epochs=EPOCHS, lr=LR)
    return test_model(model, test_dl)

results = {}
results["RNN+GloVe"]     = run_experiment(RNNClassifier,  True,  "RNN (GloVe)")
results["LSTM+GloVe"]    = run_experiment(LSTMClassifier, True,  "LSTM (GloVe)")
results["RNN+Learned"]   = run_experiment(RNNClassifier,  False, "RNN (Learned Embedding)")
results["LSTM+Learned"]  = run_experiment(LSTMClassifier, False, "LSTM (Learned Embedding)")
results



=== RNN (GloVe) ===
Epoch 01 | train loss=0.6963 acc=0.5011 f1=0.4700 || val loss=0.6976 acc=0.5014 f1=0.6549
Epoch 02 | train loss=0.6973 acc=0.5049 f1=0.5201 || val loss=0.7008 acc=0.5002 f1=0.0008
Epoch 03 | train loss=0.6942 acc=0.5111 f1=0.5064 || val loss=0.6997 acc=0.5020 f1=0.0518
Epoch 04 | train loss=0.6889 acc=0.5266 f1=0.5215 || val loss=0.6938 acc=0.5092 f1=0.6321
Epoch 05 | train loss=0.6713 acc=0.5495 f1=0.5468 || val loss=0.7105 acc=0.5068 f1=0.6562
TEST  | loss=0.7135 acc=0.5012 f1=0.6530

=== LSTM (GloVe) ===
Epoch 01 | train loss=0.6926 acc=0.5115 f1=0.4856 || val loss=0.6930 acc=0.5244 f1=0.4467
Epoch 02 | train loss=0.6808 acc=0.5595 f1=0.5502 || val loss=0.6858 acc=0.5492 f1=0.3178
Epoch 03 | train loss=0.6255 acc=0.6688 f1=0.6603 || val loss=0.6308 acc=0.6794 f1=0.6498
Epoch 04 | train loss=0.5450 acc=0.7435 f1=0.7251 || val loss=0.5741 acc=0.6768 f1=0.7308
Epoch 05 | train loss=0.4382 acc=0.8100 f1=0.8095 || val loss=0.4621 acc=0.8028 f1=0.7916
TEST  | loss=0.4

{'RNN+GloVe': {'loss': 0.7135331653341462,
  'acc': 0.5012,
  'f1': 0.6530328324986088},
 'LSTM+GloVe': {'loss': 0.46826197414458554,
  'acc': 0.7944,
  'f1': 0.7851170568561872},
 'RNN+Learned': {'loss': 0.6961947264550608,
  'acc': 0.5098,
  'f1': 0.47994907702100575},
 'LSTM+Learned': {'loss': 0.32107965384103077,
  'acc': 0.8722,
  'f1': 0.8748776189543763}}

In [10]:
pd.DataFrame(results).T.sort_values("f1", ascending=False)

Unnamed: 0,loss,acc,f1
LSTM+Learned,0.32108,0.8722,0.874878
LSTM+GloVe,0.468262,0.7944,0.785117
RNN+GloVe,0.713533,0.5012,0.653033
RNN+Learned,0.696195,0.5098,0.479949
