In [None]:


import os, re, html, random, gc, time, math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, roc_curve, auc)
from sklearn.model_selection import train_test_split, learning_curve

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import os
os.makedirs("figures", exist_ok=True)   # ← create the folder if missing


# Reproducibility: seed Python's `random`, NumPy's global RNG and PyTorch with
# the same constant before any random draw happens further down.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# ---------------------------------------------------------
# 1. Load data -------------------------------------------------
# Directory that holds the three CSV splits read below.
DATA_PATH = "/kaggle/input/ai-2-dl-for-nlp-2025-homework-2"
# Later cells read the "Text" and "ID" columns from every split and the
# "Label" column from the train and validation splits only.
train_df = pd.read_csv(f"{DATA_PATH}/train_dataset.csv")
val_df   = pd.read_csv(f"{DATA_PATH}/val_dataset.csv")
test_df  = pd.read_csv(f"{DATA_PATH}/test_dataset.csv")
# Quick sanity check of (rows, columns) per split.
print("Train / Val / Test:", train_df.shape, val_df.shape, test_df.shape)

# ---------------------------------------------------------
# 2. Pre‑processing --------------------------------------------
# Whole-word slang / abbreviation expansions applied after lower-casing.
slang_dict = {
    "u":"you","r":"are","im":"i am","ur":"your","ure":"you are","idk":"i dont know",
    "brb":"be right back","btw":"by the way","lmk":"let me know","tbh":"to be honest",
    "ftw":"for the win","fyi":"for your information","diy":"do it yourself",
    "gonna":"going to","wanna":"want to","da":"the","omg":"oh my god","gr8":"great"
}
URL_RE  = re.compile(r"http\S+|www\.\S+")
MENTION = re.compile(r"@\w+")
# A *letter* repeated >=3 times. Restricting the class to letters keeps digit
# runs intact ("1000" must not become "100"); repeated punctuation and
# whitespace are removed / collapsed by the later steps anyway.
ELONG   = re.compile(r"([a-z])\1{2,}")
NON_ALPHANUM = re.compile(r"[^a-z0-9\s]")
# One pre-compiled alternation over every slang key. A single pass gives the
# same result as substituting key by key because every match is a whole word
# and no replacement text contains another key.
SLANG_RE = re.compile(
    r"\b(?:"
    + "|".join(re.escape(s) for s in sorted(slang_dict, key=len, reverse=True))
    + r")\b"
)

def preprocess(text:str)->list[str]:
    """Normalise one raw tweet and split it into lower-case tokens.

    Steps, in order: HTML-unescape, lower-case, drop URLs and @mentions,
    strip the '#' sign (the hashtag word is kept), expand slang from
    ``slang_dict``, shorten letter elongations to two repeats
    ("sooo" -> "soo"), replace everything outside ``[a-z0-9]`` and whitespace
    with a space, collapse whitespace and split.

    Args:
        text: raw tweet text. Any non-string value (for example the float NaN
            pandas produces for an empty CSV cell) is treated as an empty
            tweet instead of raising.

    Returns:
        The list of tokens, or ``["<pad>"]`` when nothing is left so that
        callers never receive an empty sequence.
    """
    if not isinstance(text, str):
        return ["<pad>"]
    text = html.unescape(text)
    text = text.lower()
    text = URL_RE.sub(" ", text)
    text = MENTION.sub(" ", text)
    text = text.replace("#", "")            # keep hashtag word
    # slang expansion (single pass, whole words only)
    text = SLANG_RE.sub(lambda m: slang_dict[m.group(0)], text)
    # normalize elongations (sooo -> soo)
    text = ELONG.sub(r"\1\1", text)
    text = NON_ALPHANUM.sub(" ", text)
    tokens = text.split()                   # split() already collapses whitespace runs
    return tokens if tokens else ["<pad>"]

# Tokenise every split: each frame gains a "tokens" column holding the
# list returned by preprocess() for the row's "Text" value.
for df in (train_df, val_df, test_df):
    df["tokens"] = df["Text"].map(preprocess)

# ---------------------------------------------------------
# 3. Build vocabulary & load GloVe vectors ----------------
EMBED_DIM = 200
GLOVE_PATH = "/kaggle/input/glove-twitter/glove.twitter.27B.200d.txt"

# Flatten the token lists of the train and validation splits into one stream
# and count how often each token occurs.
all_tokens = []
for row in pd.concat([train_df["tokens"], val_df["tokens"]]):
    all_tokens.extend(row)
vocab_cnt = Counter(all_tokens)

# Vocabulary candidates: every token seen at least twice, except the
# "<pad>" placeholder. Anything else is looked up as <unk> later on.
tokens = []
for t, c in vocab_cnt.items():
    if t == "<pad>" or c < 2:
        continue
    tokens.append(t)

# Contiguous ids: 0 = <pad>, 1 = <unk>, then the kept tokens from 2 upwards.
specials = ["<pad>", "<unk>"]
word2idx = dict(zip(specials, range(len(specials))))
for tok in tokens:
    word2idx[tok] = len(word2idx)

# Reverse lookup and final size.
idx2word = dict((i, w) for w, i in word2idx.items())
vocab_size = len(word2idx)
print("Vocab size:", vocab_size)
# Load GloVe
def load_glove(path, dim=100, vocab=None):
    """Read space-separated word vectors from ``path``, keeping only needed words.

    Every usable line has the form ``word v1 v2 ... v_dim`` with single spaces
    between fields. Lines that do not split into exactly ``dim + 1`` fields are
    skipped silently.

    Args:
        path: location of the UTF-8 text file with the vectors.
        dim: number of vector components expected per line.
        vocab: any container supporting ``in``; only words contained in it are
            stored. When omitted, the module-level ``word2idx`` is used, which
            is the behaviour existing callers rely on.

    Returns:
        dict mapping each kept word to a ``np.float32`` array of length ``dim``.

    Raises:
        ValueError: if a kept line holds a field that is not a number.
    """
    if vocab is None:
        vocab = word2idx
    vectors = {}
    with open(path, "r", encoding="utf8") as fh:
        for line in fh:
            # Peel off the word first: the membership test is cheap and lets
            # us skip splitting the full vector for words we do not need.
            word, sep, rest = line.rstrip().partition(" ")
            if word not in vocab:
                continue
            values = rest.split(" ") if sep else []
            if len(values) != dim:          # malformed / wrong-dimension line
                continue
            vectors[word] = np.asarray(values, dtype=np.float32)
    return vectors
# Read the pretrained vectors; load_glove only keeps words present in word2idx.
glove = load_glove(GLOVE_PATH, EMBED_DIM)
print("Loaded glove vectors:", len(glove))

# Create embedding matrix
# Every row starts as Gaussian noise (std 0.6), so words without a pretrained
# vector, including <unk>, keep a random initialisation.
emb_matrix = np.random.normal(scale=0.6, size=(vocab_size, EMBED_DIM)).astype(np.float32)
# The <pad> row is zeroed out.
emb_matrix[word2idx["<pad>"]] = np.zeros(EMBED_DIM)
# Overwrite the rows of words for which a pretrained vector was loaded.
for word, vec in glove.items():
    emb_matrix[word2idx[word]] = vec

# Alias used by TweetDataset for token -> id lookups.
word2idx_frozen = word2idx                 # same object; vocab won’t change
# One embedding row per vocabulary entry.
assert emb_matrix.shape[0] == vocab_size, "vocab/emb mismatch"

# ---------------------------------------------------------
# 4. PyTorch Dataset & DataLoader -------------------------
class TweetDataset(Dataset):
    """Dataset over one pre-tokenised split.

    Reads the "tokens" and "ID" columns of ``df`` and, when
    ``label_available`` is true, the "Label" column as well. Indexing returns
    ``(token_ids, label)`` for labelled data and ``(token_ids, id)`` otherwise,
    where ``token_ids`` is a 1-D ``torch.long`` tensor.
    """

    def __init__(self, df, label_available=True):
        self.label_available = label_available
        self.tokens = df["tokens"].tolist()
        self.ids = df["ID"].tolist()
        # Labels are only looked up for labelled splits; the attribute is
        # simply absent otherwise.
        if self.label_available:
            self.labels = df["Label"].values

    def __len__(self):
        return len(self.tokens)

    def __getitem__(self, idx):
        unk_index = 1                       # 1 = <unk>
        encoded = torch.tensor(
            [word2idx_frozen.get(tok, unk_index) for tok in self.tokens[idx]],
            dtype=torch.long,
        )
        # Second element is the label for train/val data, the row ID otherwise.
        companion = self.labels[idx] if self.label_available else self.ids[idx]
        return encoded, companion

def collate_batch(batch):
    """Collate ``(token_ids, target)`` pairs into the flat layout EmbeddingBag uses.

    Args:
        batch: non-empty sequence of ``(token_ids, target)`` pairs, where
            ``token_ids`` is a 1-D integer tensor (or a list of ints).

    Returns:
        A ``(flat, offsets, targets)`` triple:
          * ``flat``    - 1-D ``torch.long`` tensor with all token ids of the
            batch concatenated in order.
          * ``offsets`` - 1-D ``torch.long`` tensor with the start position of
            every sample inside ``flat``.
          * ``targets`` - a ``torch.long`` tensor when the first target is a
            Python or NumPy integer, otherwise a plain list of the targets.

    Note:
        The tensor/list decision looks only at the type of the first target,
        so integer IDs are returned as a tensor exactly like labels are.
    """
    token_lists, targets = zip(*batch)
    # Concatenate the per-sample tensors directly instead of growing a Python
    # list of 0-d tensors and converting it back element by element.
    parts = [torch.as_tensor(tl, dtype=torch.long).reshape(-1) for tl in token_lists]
    lengths = torch.tensor([p.numel() for p in parts], dtype=torch.long)
    offsets = torch.cumsum(lengths, dim=0) - lengths    # exclusive prefix sum
    flat = torch.cat(parts)
    if isinstance(targets[0], (int, np.integer)):
        return flat, offsets, torch.tensor(targets, dtype=torch.long)
    # non-integer targets (e.g. string IDs) are passed through unchanged
    return flat, offsets, list(targets)

BATCH = 64

# One dataset per split; only the test split has no labels.
train_ds = TweetDataset(train_df)
val_ds   = TweetDataset(val_df)
test_ds  = TweetDataset(test_df, label_available=False)

# Options shared by all three loaders.
_loader_kwargs = dict(batch_size=BATCH, collate_fn=collate_batch)

# Only the training loader shuffles; validation and test keep file order.
train_loader = DataLoader(train_ds, shuffle=True, drop_last=False, **_loader_kwargs)
val_loader   = DataLoader(val_ds, shuffle=False, **_loader_kwargs)
test_loader  = DataLoader(test_ds, shuffle=False, **_loader_kwargs)

# ---------------------------------------------------------
# 5. Model ------------------------------------------------
class FFNN(nn.Module):
    """Feed-forward classifier over mean-pooled word embeddings.

    Pipeline: EmbeddingBag (mean) -> dropout -> Linear -> ReLU -> dropout ->
    Linear with 2 output logits.

    Args:
        embedding_weight: 2-D array or tensor of shape
            ``(num_embeddings, emb_dim)`` used to initialise the embedding
            table. It is copied and cast to float32, so the caller's array is
            never modified by training and float64 input works as well.
        hidden: width of the hidden layer.
        p_drop: dropout probability, applied after pooling and after the
            hidden layer.

    Raises:
        ValueError: if ``embedding_weight`` is not 2-dimensional.
    """

    def __init__(self, embedding_weight, hidden=128, p_drop=0.5):
        super().__init__()
        # Private copy in float32 so the table matches the dtype of the
        # Linear layers below.
        weight = torch.as_tensor(embedding_weight, dtype=torch.float32).detach().clone()
        if weight.dim() != 2:
            raise ValueError("embedding_weight must be 2-D (num_embeddings, emb_dim)")
        emb_dim = weight.shape[1]
        # Public constructor for pretrained tables; freeze=False keeps the
        # embeddings trainable.
        self.embed = nn.EmbeddingBag.from_pretrained(weight, freeze=False, mode='mean')
        self.dropout = nn.Dropout(p_drop)
        self.fc1 = nn.Linear(emb_dim, hidden)
        self.fc2 = nn.Linear(hidden, 2)

    def forward(self, tokens, offsets):
        """Return logits of shape (batch, 2).

        Args:
            tokens: 1-D long tensor with the token ids of all samples
                concatenated.
            offsets: 1-D long tensor with the start index of each sample in
                ``tokens``.
        """
        x = self.embed(tokens, offsets)       # (batch, emb_dim)
        x = self.dropout(x)
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        return self.fc2(x)

# Build the classifier from the prepared embedding matrix and move it to the
# selected device.
model = FFNN(emb_matrix, hidden=128, p_drop=0.5).to(device)
# Cross-entropy over the two output logits.
criterion = nn.CrossEntropyLoss()
# Adam over every model parameter (the embedding table included), with a
# small weight decay.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)

# ---------------------------------------------------------
# 6. Train ------------------------------------------------
def run_epoch(loader, train=True):
    """Run one full pass over ``loader`` with the module-level model.

    Args:
        loader: iterable yielding ``(tokens, offsets, targets)`` tensor
            triples.
        train: when true the model is put in training mode and the optimizer
            is stepped after every batch (gradients clipped to norm 5.0);
            otherwise the model is put in eval mode and no gradients are
            tracked.

    Returns:
        ``(loss, acc)``: the loss averaged per example and the fraction of
        examples whose arg-max prediction equals the target.
    """
    if train:
        model.train()
    else:
        model.eval()

    loss_sum = 0.0
    n_correct = 0
    n_seen = 0
    for batch in loader:
        tokens, offsets, targets = [t.to(device) for t in batch]
        with torch.set_grad_enabled(train):
            logits = model(tokens, offsets)
            batch_loss = criterion(logits, targets)
            if train:
                optimizer.zero_grad()
                batch_loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
                optimizer.step()
        n_batch = targets.size(0)
        # The criterion returns a batch mean; weight it by the batch size so
        # the final average is per example.
        loss_sum += batch_loss.item() * n_batch
        n_correct += (logits.argmax(1) == targets).sum().item()
        n_seen += n_batch

    acc = n_correct / n_seen
    loss = loss_sum / n_seen
    return loss, acc

NUM_EPOCHS = 10
patience = 2                 # epochs without improvement tolerated
patience_left = patience
best_val_acc = -1            # below any real accuracy, so epoch 1 always saves
train_hist = []
val_hist = []

for epoch in range(1, NUM_EPOCHS+1):
    t0 = time.time()
    tr_loss, tr_acc = run_epoch(train_loader, train=True)
    va_loss, va_acc = run_epoch(val_loader, train=False)
    train_hist.append(tr_acc)
    val_hist.append(va_acc)
    print(f"Epoch {epoch:02d}: train_acc={tr_acc:=.4f} val_acc={va_acc:=.4f} time={time.time()-t0:.1f}s")

    if not va_acc > best_val_acc:
        # No improvement: use up one unit of patience, stop when none is left.
        patience_left -= 1
        if patience_left == 0:
            print("Early stopping.")
            break
        continue

    # Strict improvement: checkpoint the weights and reset the patience.
    best_val_acc = va_acc
    torch.save(model.state_dict(), "best_model.pt")
    patience_left = patience
    print("  --> New best, model saved.")

# Restore the weights of the best validation epoch.
model.load_state_dict(torch.load("best_model.pt"))

# ---------------------------------------------------------
# 7. Evaluation (confusion, ROC, classification report) ---
model.eval()
all_preds, all_probs, all_targets = [], [], []

# Gather predictions, P(class 1) and ground truth over the validation set
# without tracking gradients.
with torch.no_grad():
    for tokens, offsets, targets in val_loader:
        logits = model(tokens.to(device), offsets.to(device))
        # column 1 of the softmax is the probability of class 1
        all_probs.extend(torch.softmax(logits, dim=1)[:, 1].cpu().numpy())
        all_preds.extend(logits.argmax(1).cpu().numpy())
        all_targets.extend(targets.numpy())

print("\nClassification report (validation):")
print(classification_report(all_targets, all_preds, target_names=["neg","pos"]))
print("Accuracy:", accuracy_score(all_targets, all_preds))

# Confusion matrix of the validation predictions (rows: actual, columns: predicted)
cm = confusion_matrix(all_targets, all_preds)
fig_cm, ax_cm = plt.subplots(figsize=(4,3))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=["neg","pos"], yticklabels=["neg","pos"],
            ax=ax_cm)
ax_cm.set_ylabel("Actual")
ax_cm.set_xlabel("Predicted")
ax_cm.set_title("Confusion Matrix - Validation")
fig_cm.tight_layout()
fig_cm.savefig("figures/conf_matrix.png", dpi=300)
plt.show()

# ROC curve on the validation set, scored with the predicted P(class 1)
fpr, tpr, _ = roc_curve(all_targets, all_probs)
roc_auc = auc(fpr, tpr)
fig_roc, ax_roc = plt.subplots()
ax_roc.plot(fpr, tpr, label=f"AUC={roc_auc:.2f}")
ax_roc.plot([0,1],[0,1],'k--')             # chance diagonal
ax_roc.set_xlabel("False Positive Rate")
ax_roc.set_ylabel("True Positive Rate")
ax_roc.set_title("ROC Curve - Validation")
ax_roc.legend()
fig_roc.tight_layout()
fig_roc.savefig("figures/ROC.png", dpi=300)
plt.show()

# Learning curve: accuracy per completed epoch for the train and validation passes
epochs = np.arange(1,len(train_hist)+1)
fig_lc, ax_lc = plt.subplots()
for history, series_name in ((train_hist, "train"), (val_hist, "val")):
    ax_lc.plot(epochs, history, 'o-', label=series_name)
ax_lc.set_xlabel("Epoch")
ax_lc.set_ylabel("Accuracy")
ax_lc.set_title("Learning Curve")
ax_lc.legend()
fig_lc.tight_layout()
fig_lc.savefig("figures/LC.png", dpi=300)
plt.show()

# ---------------------------------------------------------
# 8. Predict on test and create submission ----------------
model.eval()
test_ids, test_labels = [], []
with torch.no_grad():
    for tokens, offsets, ids in test_loader:
        logits = model(tokens.to(device), offsets.to(device))
        test_labels.extend(logits.argmax(1).cpu().numpy())
        # int() normalises every ID to a plain Python int before it is stored
        test_ids.extend(int(i) for i in ids)



# Write the submission FIRST: it is the main deliverable and must not be lost
# if one of the optional appendix figures below fails (e.g. wordcloud cannot
# be installed because the session has no internet access).
submission = pd.DataFrame({"ID": test_ids, "Label": test_labels})
submission.to_csv("submission.csv", index=False)
print("\nCreated 'submission.csv' with", submission.shape[0], "rows")


print("\nGenerating appendix figures …")

# ---------- A1. Word-cloud of the full training corpus ----------
try:
    from wordcloud import WordCloud
except ImportError:
    # Try to install wordcloud only if it is missing; when that is not
    # possible the figure is skipped instead of aborting the whole cell.
    import subprocess, sys
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "wordcloud"])
        from wordcloud import WordCloud
    except (subprocess.CalledProcessError, OSError, ImportError) as exc:
        WordCloud = None
        print(f"   (skipped wordcloud_all — wordcloud is unavailable: {exc})")

if WordCloud is not None:
    # The cloud is built from the raw "Text" column, not from the tokens.
    text_blob = " ".join(train_df["Text"].astype(str).tolist())
    wc = WordCloud(width=1600, height=800, background_color="white",
                   collocations=False, max_words=300).generate(text_blob)
    wc.to_file("figures/wordcloud_all.png")
    print("   ✓ figures/wordcloud_all.png")

# ---------- A2. Top-25 token frequency barplot ----------
token_counter = Counter(tok for toks in train_df["tokens"] for tok in toks)
top25 = token_counter.most_common(25)
words, freqs = zip(*top25)

plt.figure(figsize=(6,6))
# Reverse both sequences so the most frequent token ends up at the top.
plt.barh(range(len(words))[::-1], freqs[::-1])
plt.yticks(range(len(words))[::-1], words[::-1])
plt.xlabel("Frequency"); plt.title("Top-25 tokens (training set)")
plt.tight_layout()
plt.savefig("figures/token_freq_top25.png", dpi=300)
plt.close()
print("   ✓ figures/token_freq_top25.png")

# ---------- B. Hyper-parameter heatmap ----------

# Placeholder: fill with dicts holding "hidden", "dropout" and "val_acc" keys
# to get the heatmap; with the empty list the figure is skipped.
hp_trials = []

if hp_trials:
    hp_df = pd.DataFrame(hp_trials)
    pivot = hp_df.pivot_table(values="val_acc",
                              index="hidden", columns="dropout", aggfunc="max")
    plt.figure(figsize=(6,4))
    sns.heatmap(pivot, annot=True, fmt=".3f", cmap="YlGnBu")
    plt.title("Validation accuracy by hidden size × dropout")
    plt.tight_layout()
    plt.savefig("figures/hp_heatmap.png", dpi=300)
    plt.close()
    print("   ✓ figures/hp_heatmap.png")
else:
    print("   (skipped hp_heatmap — hp_trials list is empty)")

# ---------- C. Histogram of predicted probabilities ----------
# Uses the validation probabilities collected during evaluation (all_probs).
plt.figure(figsize=(6,4))
plt.hist(all_probs, bins=30, alpha=0.9, edgecolor="k")
plt.xlabel("Predicted P(positive)"); plt.ylabel("Tweet count")
plt.title("Probability distribution – validation set")
plt.tight_layout()
plt.savefig("figures/prob_dist_best.png", dpi=300)
plt.close()
print("   ✓ figures/prob_dist_best.png")

print("All appendix images saved to figures/")


In [None]:
# Debug helper: print the file names found in the GloVe input directory,
# i.e. the directory GLOVE_PATH in the first cell points into.
# NOTE(review): `glob` is imported here but not used in this cell.
import os, glob
print(os.listdir("/kaggle/input/glove-twitter"))
