# BiLSTM + GloVe — Datathon Track 2 (Run 2)
**A100 GPU. Key fixes vs Run 1:**
- `EPOCHS=30`, `PATIENCE=5` — model was still improving at epoch 15
- `POS_WEIGHT_CAP=10` — fixes threshold stuck at 0.90–0.94
- `HIDDEN_DIM=512` — doubles model capacity

Drive folder already mounted from Run 1 — just re-run Cell 3 onward.

In [7]:
# Cell 1 — Check GPU
import torch
print('torch:', torch.__version__)
print('CUDA :', torch.cuda.is_available())
if torch.cuda.is_available():
    print('GPU  :', torch.cuda.get_device_name(0))
    print('VRAM :', round(torch.cuda.get_device_properties(0).total_memory/1e9,1), 'GB')
else:
    raise RuntimeError('No GPU — Runtime > Change runtime type > A100')

torch: 2.10.0+cu128
CUDA : True
GPU  : NVIDIA A100-SXM4-80GB
VRAM : 85.1 GB


In [8]:
# Cell 2 — Deps
import subprocess, sys
subprocess.run([sys.executable, '-m', 'pip', 'install', '-q',
                'scikit-learn', 'pandas', 'numpy'])
print('Deps ready.')

Deps ready.


In [9]:
# Cell 3 — Mount Google Drive and copy data files
from google.colab import drive
import shutil, os

drive.mount('/content/drive')

DRIVE_FOLDER = '/content/drive/MyDrive/datathon'   # <-- edit if needed

for fname in ['train.csv', 'val.csv', 'label_list.txt']:
    src = os.path.join(DRIVE_FOLDER, fname)
    if os.path.exists(src):
        shutil.copy(src, fname)
        print(f'  OK  {fname}  ({os.path.getsize(fname)/1e6:.1f} MB)')
    else:
        print(f'  MISSING: {src}')

missing = [f for f in ['train.csv', 'val.csv', 'label_list.txt']
           if not os.path.exists(f)]
if missing:
    raise FileNotFoundError(f'Missing: {missing} — check DRIVE_FOLDER above')
print('All data files ready.')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
  OK  train.csv  (12.5 MB)
  OK  val.csv  (2.7 MB)
  OK  label_list.txt  (0.0 MB)
All data files ready.


In [10]:
# Cell 4 — GloVe (skips download if already present from Run 1)
import os
os.makedirs('data/glove', exist_ok=True)
glove_path = 'data/glove/glove.6B.300d.txt'

if not os.path.exists(glove_path):
    print('Downloading glove.6B.zip (~820 MB) ...')
    os.system('wget -q --show-progress https://nlp.stanford.edu/data/glove.6B.zip '
              '-O data/glove/glove.6B.zip')
    os.system("unzip -j data/glove/glove.6B.zip 'glove.6B.300d.txt' -d data/glove/")
    os.remove('data/glove/glove.6B.zip')
    print('Done.')
else:
    print('GloVe already present, skipping download.')
print(f'Size: {os.path.getsize(glove_path)/1e9:.2f} GB')

GloVe already present, skipping download.
Size: 1.04 GB


In [12]:
# Cell 5 — Train BiLSTM (Run 2 — tuned hyperparameters)
import os, json, pickle
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score

# ── Config (key changes from Run 1 marked with <--) ──────────────────────────
TRAIN_PATH      = 'train.csv'
VAL_PATH        = 'val.csv'
LABEL_LIST_PATH = 'label_list.txt'
GLOVE_PATH      = 'data/glove/glove.6B.300d.txt'
MODEL_DIR       = 'model/'
VOCAB_SIZE      = 50_000
MAX_LEN         = 300
EMBED_DIM       = 300
HIDDEN_DIM      = 512      # <-- doubled (was 256)
NUM_LAYERS      = 2
DROPOUT         = 0.3
BATCH_SIZE      = 128
EPOCHS          = 30       # <-- more epochs (was 15, model wasn't converged)
LR              = 1e-3
PATIENCE        = 5        # <-- more patience (was 3)
POS_WEIGHT_CAP  = 10.0     # <-- lower cap (was 50) — fixes threshold stuck at 0.90+

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device: {DEVICE}')

# Helpers
def load_label_list(path):
    labels = [ln.strip().lower() for ln in open(path, encoding='utf-8') if ln.strip()]
    if 'none' not in labels: labels.append('none')
    seen = set()
    return [x for x in labels if not (x in seen or seen.add(x))]

def parse_topics(x, known=None):
    x = (x or 'none').strip().lower()
    parts = [t.strip() for t in x.split('|') if t.strip()]
    if known: parts = [t for t in parts if t in known]
    return parts if parts else ['none']

def combine_text(df):
    return (df['title'].fillna('').astype(str) + '. ' +
            df['text'].fillna('').astype(str)).tolist()

def build_vocab(texts, max_vocab):
    from collections import Counter
    c = Counter()
    for t in texts: c.update(t.lower().split())
    v = {'<PAD>': 0, '<UNK>': 1}
    for w, _ in c.most_common(max_vocab - 2): v[w] = len(v)
    return v

def texts_to_sequences(texts, vocab, max_len):
    UNK = vocab.get('<UNK>', 1)
    seqs = []
    for t in texts:
        ids = [vocab.get(tok, UNK) for tok in t.lower().split()[:max_len]]
        ids += [0] * (max_len - len(ids))
        seqs.append(ids)
    return np.array(seqs, dtype=np.int64)

def load_glove(path, vocab, embed_dim):
    print('Loading GloVe ...')
    rng = np.random.default_rng(42)
    mat = rng.uniform(-0.05, 0.05, (len(vocab), embed_dim)).astype(np.float32)
    mat[0] = 0.0
    found = 0
    with open(path, encoding='utf-8') as fh:
        for line in fh:
            p = line.rstrip().split(' ')
            if len(p) != embed_dim + 1: continue
            if p[0] in vocab:
                mat[vocab[p[0]]] = np.array(p[1:], dtype=np.float32)
                found += 1
    print(f'  {found:,}/{len(vocab):,} words matched')
    return mat

class TextDataset(Dataset):
    def __init__(self, X, Y):
        self.X = torch.from_numpy(X); self.Y = torch.from_numpy(Y)
    def __len__(self): return len(self.X)
    def __getitem__(self, i): return self.X[i], self.Y[i]

class BiLSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, embed_matrix,
                 hidden_dim, num_layers, num_labels, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.embedding.weight = nn.Parameter(
            torch.tensor(embed_matrix, dtype=torch.float32))
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=num_layers,
                            bidirectional=True, batch_first=True,
                            dropout=dropout if num_layers > 1 else 0.0)
        self.drop = nn.Dropout(dropout)
        self.fc   = nn.Linear(hidden_dim * 2, num_labels)

    def forward(self, x):
        mask    = (x != 0).float()
        lengths = mask.sum(1, keepdim=True).clamp(min=1)
        emb     = self.drop(self.embedding(x))
        out, _  = self.lstm(emb)
        pooled  = (out * mask.unsqueeze(-1)).sum(1) / lengths
        return self.fc(self.drop(pooled))

@torch.no_grad()
def get_val_probs(model, loader, device):
    model.eval()
    P, L = [], []
    for X, Y in loader:
        P.append(torch.sigmoid(model(X.to(device))).cpu().numpy())
        L.append(Y.numpy())
    return np.vstack(P), np.vstack(L)

def sweep_threshold(probs, labels):
    best_t, best_m = 0.5, -1.0
    for t in np.arange(0.10, 0.95, 0.01):   # wider sweep — catches low-prob models
        m = f1_score(labels, (probs >= t).astype(int),
                     average='micro', zero_division=0)
        if m > best_m: best_m, best_t = m, float(t)
    return best_t, best_m

# Run
os.makedirs(MODEL_DIR, exist_ok=True)
train_df   = pd.read_csv(TRAIN_PATH, dtype=str).fillna('')
val_df     = pd.read_csv(VAL_PATH,   dtype=str).fillna('')
labels     = load_label_list(LABEL_LIST_PATH)
known      = set(labels)
num_labels = len(labels)
print(f'Train {len(train_df):,}  Val {len(val_df):,}  Labels {num_labels}')

mlb     = MultiLabelBinarizer(classes=labels)
Y_train = mlb.fit_transform(
    train_df['topics'].apply(lambda x: parse_topics(x, known))).astype(np.float32)
Y_val   = mlb.transform(
    val_df['topics'].apply(lambda x: parse_topics(x, known))).astype(np.float32)

train_texts = combine_text(train_df)
val_texts   = combine_text(val_df)
print('Building vocab ...')
vocab        = build_vocab(train_texts, VOCAB_SIZE)
X_train      = texts_to_sequences(train_texts, vocab, MAX_LEN)
X_val        = texts_to_sequences(val_texts,   vocab, MAX_LEN)
embed_matrix = load_glove(GLOVE_PATH, vocab, EMBED_DIM)

train_loader = DataLoader(TextDataset(X_train, Y_train), batch_size=BATCH_SIZE,
                          shuffle=True,  num_workers=2, pin_memory=True)
val_loader   = DataLoader(TextDataset(X_val,   Y_val),   batch_size=BATCH_SIZE*4,
                          shuffle=False, num_workers=2, pin_memory=True)

model = BiLSTMClassifier(len(vocab), EMBED_DIM, embed_matrix,
                         HIDDEN_DIM, NUM_LAYERS, num_labels, DROPOUT).to(DEVICE)
print(f'Params: {sum(p.numel() for p in model.parameters()):,}')

pos_counts = Y_train.sum(0).clip(min=1)
pos_weight = torch.tensor(
    np.clip((len(Y_train) - pos_counts) / pos_counts, None, POS_WEIGHT_CAP),
    dtype=torch.float32).to(DEVICE)
criterion  = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer  = torch.optim.Adam(model.parameters(), lr=LR)
scheduler  = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='max', factor=0.5, patience=2)

best_micro, best_thr, patience_ctr = -1.0, 0.5, 0
print(f'\n{"-"*58}')
print(f'  {"Ep":>3}  {"loss":>8}  {"Micro F1":>9}  {"thr":>5}')
print(f'{"-"*58}')

for epoch in range(1, EPOCHS + 1):
    model.train()
    total, n = 0.0, 0
    for X, Y in train_loader:
        X, Y = X.to(DEVICE), Y.to(DEVICE)
        optimizer.zero_grad(set_to_none=True)
        loss = criterion(model(X), Y)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        total += loss.item(); n += 1

    probs, lbls = get_val_probs(model, val_loader, DEVICE)
    thr, micro  = sweep_threshold(probs, lbls)
    scheduler.step(micro)

    star = ' ★' if micro > best_micro + 1e-5 else ''
    print(f'  {epoch:3d}  {total/n:8.4f}  {micro:9.4f}  {thr:5.2f}{star}', flush=True)

    if micro > best_micro + 1e-5:
        best_micro, best_thr = micro, thr
        torch.save(model.state_dict(), os.path.join(MODEL_DIR, 'bilstm.pt'))
        patience_ctr = 0
    else:
        patience_ctr += 1
        if patience_ctr >= PATIENCE:
            print(f'  Early stopping at epoch {epoch}')
            break

print(f'{"-"*58}')
print(f'Best val Micro F1: {best_micro:.4f}  @ thr={best_thr:.2f}')

with open(os.path.join(MODEL_DIR, 'vocab.pkl'), 'wb') as f: pickle.dump(vocab, f)
with open(os.path.join(MODEL_DIR, 'mlb.pkl'),   'wb') as f: pickle.dump(mlb,   f)
meta = dict(best_threshold=best_thr, num_labels=num_labels,
            max_len=MAX_LEN, embed_dim=EMBED_DIM, hidden_dim=HIDDEN_DIM,
            num_layers=NUM_LAYERS, dropout=DROPOUT, vocab_size=len(vocab),
            val_micro_f1=round(best_micro,4), labels=labels)
with open(os.path.join(MODEL_DIR, 'meta_bilstm.json'), 'w') as f:
    json.dump(meta, f, indent=2)
print('Artifacts saved to model/')

Device: cuda
Train 15,104  Val 3,236  Labels 116
Building vocab ...
Loading GloVe ...
  25,825/50,000 words matched
Params: 24,752,692

----------------------------------------------------------
   Ep      loss   Micro F1    thr
----------------------------------------------------------
    1    0.1599     0.3995   0.43 ★
    2    0.1320     0.3062   0.37
    3    0.1078     0.4947   0.62 ★
    4    0.0891     0.5926   0.63 ★
    5    0.0679     0.6681   0.71 ★
    6    0.0552     0.6926   0.80 ★
    7    0.0466     0.7244   0.80 ★
    8    0.0399     0.7344   0.80 ★
    9    0.0339     0.7457   0.78 ★
   10    0.0285     0.7719   0.81 ★
   11    0.0243     0.7549   0.81
   12    0.0217     0.7765   0.78 ★
   13    0.0187     0.7798   0.83 ★
   14    0.0169     0.7804   0.83 ★
   15    0.0158     0.7915   0.81 ★
   16    0.0144     0.7931   0.77 ★
   17    0.0136     0.7939   0.80 ★
   18    0.0118     0.7956   0.78 ★
   19    0.0107     0.7941   0.76
   20    0.0096     0.8003   0.76 

In [13]:
# Cell 6 — Zip and save model artifacts to Google Drive
import zipfile, shutil, os

zip_path = 'bilstm_model.zip'
with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zf:
    for fname in ['bilstm.pt', 'vocab.pkl', 'mlb.pkl', 'meta_bilstm.json']:
        fpath = os.path.join('model', fname)
        zf.write(fpath, fname)
        print(f'  {fname}  ({os.path.getsize(fpath)/1e6:.1f} MB)')

drive_out = os.path.join(DRIVE_FOLDER, zip_path)
shutil.copy(zip_path, drive_out)
print(f'\nSaved to Google Drive: {drive_out}')
print('Download from Drive, unzip locally: unzip bilstm_model.zip -d model/')

  bilstm.pt  (99.0 MB)
  vocab.pkl  (0.7 MB)
  mlb.pkl  (0.0 MB)
  meta_bilstm.json  (0.0 MB)

Saved to Google Drive: /content/drive/MyDrive/datathon/bilstm_model.zip
Download from Drive, unzip locally: unzip bilstm_model.zip -d model/
