In [1]:
import os
import random
from dataclasses import dataclass
from pathlib import Path

import numpy as np
import pandas as pd
from PIL import Image

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from torchvision import models, transforms
from torchvision.models import ResNet50_Weights

from transformers import DistilBertModel, DistilBertTokenizerFast

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

In [2]:
def seed_everything(seed: int = 42) -> None:
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


@dataclass
class Config:
    embedding_dim: int = 512
    num_classes: int = 2
    batch_size: int = 8
    lr: float = 1e-3
    epochs: int = 10
    max_length: int = 128
    subset_size: int = 100
    seed: int = 42
    num_workers: int = 0  # set >0 if your environment supports it
    pin_memory: bool = True


In [3]:
# Paths
notebook_dir = Path.cwd()
project_root = notebook_dir.parent if notebook_dir.name == "src" else notebook_dir

data_dir = project_root / "data"
original_dir = data_dir / "original"
processed_dir = data_dir / "processed"
img_dir = original_dir / "img"
models_dir = project_root / "models"
results_dir = project_root / "results"

processed_dir.mkdir(parents=True, exist_ok=True)
models_dir.mkdir(parents=True, exist_ok=True)
results_dir.mkdir(parents=True, exist_ok=True)

train_jsonl = original_dir / "train.jsonl"

if not img_dir.exists():
    raise FileNotFoundError(f"Image directory not found: {img_dir}")
if not train_jsonl.exists():
    raise FileNotFoundError(f"Train file not found: {train_jsonl}")

print(f"✓ Project root: {project_root}")
print(f"✓ Image directory: {img_dir}")
print(f"✓ Found {len(list(img_dir.glob('*.png')))} images")
print(f"✓ Train file: {train_jsonl}")

✓ Project root: /Users/shrutisivakumar/Library/CloudStorage/OneDrive-Personal/College Stuff/Sem 6/Projects/NLP/Facebook-Hateful-Memes-Challenge-2020
✓ Image directory: /Users/shrutisivakumar/Library/CloudStorage/OneDrive-Personal/College Stuff/Sem 6/Projects/NLP/Facebook-Hateful-Memes-Challenge-2020/data/original/img
✓ Found 10000 images
✓ Train file: /Users/shrutisivakumar/Library/CloudStorage/OneDrive-Personal/College Stuff/Sem 6/Projects/NLP/Facebook-Hateful-Memes-Challenge-2020/data/original/train.jsonl


In [4]:
# Dataset
class HatefulMemesDataset(Dataset):
    """
    Returns:
      - image tensor: [3, 224, 224]
      - raw text: str
      - label: int
    """
    def __init__(self, jsonl_path: str, img_dir: str, transform=None):
        self.df = pd.read_json(jsonl_path, lines=True)
        self.img_dir = Path(img_dir)
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx: int):
        row = self.df.iloc[idx]

        img_name = Path(str(row["img"]).replace("\\", "/")).name
        img_path = self.img_dir / img_name

        text = row["text"]
        label = int(row["label"])

        image = Image.open(img_path).convert("RGB")
        if self.transform:
            image = self.transform(image)

        return image, text, torch.tensor(label, dtype=torch.long)



def collate_fn(batch):
    """
    batch: list of (image_tensor, text_str, label_tensor)
    returns batched tensors + list of texts
    """
    images, texts, labels = zip(*batch)
    images = torch.stack(images, dim=0)
    labels = torch.stack(labels, dim=0)
    return images, list(texts), labels

In [5]:
# Model blocks
class ImageEncoder(nn.Module):
    def __init__(self, embedding_dim: int = 512, freeze_backbone: bool = True):
        super().__init__()
        weights = ResNet50_Weights.DEFAULT
        backbone = models.resnet50(weights=weights)
        self.backbone = nn.Sequential(*list(backbone.children())[:-1])  # remove fc
        self.projection = nn.Linear(2048, embedding_dim)

        if freeze_backbone:
            for p in self.backbone.parameters():
                p.requires_grad = False

    def forward(self, images: torch.Tensor) -> torch.Tensor:
        # images: [B,3,224,224]
        feats = self.backbone(images)           # [B,2048,1,1]
        feats = feats.flatten(1)                # [B,2048]
        emb = self.projection(feats)            # [B,512]
        return emb


class TextEncoder(nn.Module):
    def __init__(self, embedding_dim: int = 512, model_name: str = "distilbert-base-uncased", freeze_backbone: bool = True):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained(model_name)
        self.tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
        self.projection = nn.Linear(768, embedding_dim)

        if freeze_backbone:
            for p in self.bert.parameters():
                p.requires_grad = False

    def forward(self, texts, device: torch.device, max_length: int = 128) -> torch.Tensor:
        enc = self.tokenizer(
            texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length,
        )
        enc = {k: v.to(device) for k, v in enc.items()}

        out = self.bert(**enc)
        cls = out.last_hidden_state[:, 0, :]    # [B,768]
        emb = self.projection(cls)              # [B,512]
        return emb


class FusionClassifier(nn.Module):
    def __init__(self, embedding_dim: int = 512, num_classes: int = 2):
        super().__init__()
        self.classifier = nn.Sequential(
            nn.Linear(embedding_dim * 2, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes),
        )

    def forward(self, img_emb: torch.Tensor, txt_emb: torch.Tensor) -> torch.Tensor:
        fused = torch.cat([img_emb, txt_emb], dim=1)  # [B,1024]
        return self.classifier(fused)                 # [B,2]


class MultimodalHatefulMemes(nn.Module):
    def __init__(self, embedding_dim: int = 512, num_classes: int = 2):
        super().__init__()
        self.image_encoder = ImageEncoder(embedding_dim=embedding_dim, freeze_backbone=True)
        self.text_encoder = TextEncoder(embedding_dim=embedding_dim, freeze_backbone=True)
        self.fusion = FusionClassifier(embedding_dim=embedding_dim, num_classes=num_classes)

    def forward(self, images, texts, device, max_length: int = 128):
        img_emb = self.image_encoder(images)
        txt_emb = self.text_encoder(texts, device=device, max_length=max_length)
        logits = self.fusion(img_emb, txt_emb)
        return logits

In [6]:
# Data prep
def prepare_subset_data(train_jsonl_path: str, out_train: str, out_val: str, subset_size: int = 100, seed: int = 42):
    df = pd.read_json(train_jsonl_path, lines=True)

    subset, _ = train_test_split(
        df,
        train_size=subset_size,
        stratify=df["label"],
        random_state=seed
    )

    train_df, val_df = train_test_split(
        subset,
        test_size=0.2,
        stratify=subset["label"],
        random_state=seed
    )

    Path(out_train).parent.mkdir(parents=True, exist_ok=True)
    train_df.to_json(out_train, orient="records", lines=True)
    val_df.to_json(out_val, orient="records", lines=True)

    print(f"✓ Subset created: {subset_size} samples")
    print(f"  Train: {len(train_df)} | Val: {len(val_df)}")
    print("  Train distribution:\n", train_df["label"].value_counts().to_string())
    print("  Val distribution:\n", val_df["label"].value_counts().to_string())


def build_loaders(train_path: str, val_path: str, cfg: Config):
    img_tf = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])

    train_ds = HatefulMemesDataset(train_path, str(img_dir), transform=img_tf)
    val_ds = HatefulMemesDataset(val_path, str(img_dir), transform=img_tf)

    train_loader = DataLoader(
        train_ds,
        batch_size=cfg.batch_size,
        shuffle=True,
        num_workers=cfg.num_workers,
        pin_memory=cfg.pin_memory,
        collate_fn=collate_fn
    )
    val_loader = DataLoader(
        val_ds,
        batch_size=cfg.batch_size,
        shuffle=False,
        num_workers=cfg.num_workers,
        pin_memory=cfg.pin_memory,
        collate_fn=collate_fn
    )
    return train_loader, val_loader

In [7]:
# Train / Eval
def train_one_epoch(model, loader, optimizer, criterion, device, cfg: Config):
    model.train()
    total_loss = 0.0
    correct = 0
    total = 0

    for images, texts, labels in tqdm(loader, desc="Train", leave=False):
        images = images.to(device)
        labels = labels.to(device)

        logits = model(images, texts, device=device, max_length=cfg.max_length)
        loss = criterion(logits, labels)

        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        preds = logits.argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    return total_loss / max(1, len(loader)), correct / max(1, total)


@torch.no_grad()
def evaluate(model, loader, device, cfg: Config):
    model.eval()
    all_labels = []
    all_preds = []
    all_probs = []

    for images, texts, labels in tqdm(loader, desc="Eval", leave=False):
        images = images.to(device)

        logits = model(images, texts, device=device, max_length=cfg.max_length)
        probs = torch.softmax(logits, dim=1)[:, 1].cpu().numpy()
        preds = logits.argmax(dim=1).cpu().numpy()

        all_probs.extend(probs.tolist())
        all_preds.extend(preds.tolist())
        all_labels.extend(labels.numpy().tolist())

    acc = accuracy_score(all_labels, all_preds)
    auc = roc_auc_score(all_labels, all_probs) if len(set(all_labels)) > 1 else float("nan")
    return acc, auc, np.array(all_labels), np.array(all_preds), np.array(all_probs)


def save_confusion_matrix(y_true, y_pred, out_path: Path):
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(7, 5))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=["Not Hateful", "Hateful"],
        yticklabels=["Not Hateful", "Hateful"],
    )
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.savefig(out_path, dpi=150, bbox_inches="tight")
    plt.close()


In [8]:
# Inference helper
@torch.no_grad()
def predict_one(model, image_path: str, text: str, device, cfg: Config):
    img_tf = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])

    image = Image.open(image_path).convert("RGB")
    image = img_tf(image).unsqueeze(0).to(device)  # [1,3,224,224]

    logits = model(image, [text], device=device, max_length=cfg.max_length)
    probs = torch.softmax(logits, dim=1)[0]
    pred = int(torch.argmax(probs).item())
    conf = float(probs[pred].item())
    return pred, conf, probs.cpu().numpy()


In [9]:
# Main pipeline
def run_pipeline():
    cfg = Config()
    seed_everything(cfg.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    train_subset_path = processed_dir / "train_subset.jsonl"
    val_subset_path = processed_dir / "val_subset.jsonl"

    # Step 1: subset
    prepare_subset_data(
        train_jsonl_path=str(train_jsonl),
        out_train=str(train_subset_path),
        out_val=str(val_subset_path),
        subset_size=cfg.subset_size,
        seed=cfg.seed
    )

    # Step 2: loaders
    train_loader, val_loader = build_loaders(str(train_subset_path), str(val_subset_path), cfg)
    print(f"Train batches: {len(train_loader)} | Val batches: {len(val_loader)}")

    # Step 3: model
    model = MultimodalHatefulMemes(embedding_dim=cfg.embedding_dim, num_classes=cfg.num_classes).to(device)

    # Train only projections + fusion (backbones already frozen)
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.Adam(trainable_params, lr=cfg.lr)
    criterion = nn.CrossEntropyLoss()

    print(f"Trainable parameters: {sum(p.numel() for p in trainable_params):,}")
    print(f"LR: {cfg.lr} | Epochs: {cfg.epochs}")

    best_acc = -1.0
    best_path = models_dir / "best_model.pth"

    # Step 4: training loop
    for epoch in range(cfg.epochs):
        train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, criterion, device, cfg)
        val_acc, val_auc, y_true, y_pred, y_prob = evaluate(model, val_loader, device, cfg)

        print(f"\nEpoch {epoch+1}/{cfg.epochs}")
        print(f"  Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
        print(f"  Val   Acc : {val_acc:.4f} | Val AUC : {val_auc:.4f}")

        if val_acc > best_acc:
            best_acc = val_acc
            torch.save(
                {
                    "model_state": model.state_dict(),
                    "epoch": epoch,
                    "val_acc": val_acc,
                    "val_auc": val_auc,
                    "config": cfg.__dict__,
                },
                best_path
            )
            print("  ✓ Saved best checkpoint")

    # Step 5: final report from last eval (and best acc printed)
    print("\n" + "=" * 60)
    print("FINAL VALIDATION RESULTS (last epoch eval)")
    print("=" * 60)
    print(f"Best Val Accuracy: {best_acc:.4f}")
    print(f"Last Val Accuracy: {val_acc:.4f}")
    print(f"Last Val AUC     : {val_auc:.4f}\n")

    print(classification_report(y_true, y_pred, target_names=["Not Hateful", "Hateful"]))

    cm_path = results_dir / "confusion_matrix.png"
    save_confusion_matrix(y_true, y_pred, cm_path)
    print(f"✓ Confusion matrix saved to: {cm_path}")

    # Step 6: test one example (optional)
    example_img = str(img_dir / "02657.png")
    example_text = "oh look someone's returning a broken sandwich maker to walmart"

    # load best model for inference
    ckpt = torch.load(best_path, map_location=device)
    model.load_state_dict(ckpt["model_state"])
    model.eval()

    pred, conf, probs = predict_one(model, example_img, example_text, device, cfg)
    print("\nExample prediction:")
    print(f"  pred={pred} ({'Hateful' if pred==1 else 'Not Hateful'}) | conf={conf:.4f}")
    print(f"  probs: Not Hateful={probs[0]:.4f} | Hateful={probs[1]:.4f}")

In [10]:
if __name__ == "__main__":
    run_pipeline()

Using device: cpu
✓ Subset created: 100 samples
  Train: 80 | Val: 20
  Train distribution:
 label
0    51
1    29
  Val distribution:
 label
0    13
1     7
Train batches: 10 | Val batches: 3
Trainable parameters: 1,738,370
LR: 0.001 | Epochs: 10


                                                      


Epoch 1/10
  Train Loss: 0.6616 | Train Acc: 0.6250
  Val   Acc : 0.6500 | Val AUC : 0.6923
  ✓ Saved best checkpoint


                                                      


Epoch 2/10
  Train Loss: 0.6749 | Train Acc: 0.5625
  Val   Acc : 0.6500 | Val AUC : 0.7802


                                                      


Epoch 3/10
  Train Loss: 0.5479 | Train Acc: 0.7375
  Val   Acc : 0.7500 | Val AUC : 0.8571
  ✓ Saved best checkpoint


                                                      


Epoch 4/10
  Train Loss: 0.4170 | Train Acc: 0.8250
  Val   Acc : 0.8000 | Val AUC : 0.8132
  ✓ Saved best checkpoint


                                                      


Epoch 5/10
  Train Loss: 0.3097 | Train Acc: 0.8750
  Val   Acc : 0.6500 | Val AUC : 0.7033


                                                      


Epoch 6/10
  Train Loss: 0.3061 | Train Acc: 0.8875
  Val   Acc : 0.7000 | Val AUC : 0.8132


                                                      


Epoch 7/10
  Train Loss: 0.1503 | Train Acc: 0.9500
  Val   Acc : 0.7000 | Val AUC : 0.7473


                                                      


Epoch 8/10
  Train Loss: 0.0726 | Train Acc: 0.9750
  Val   Acc : 0.7000 | Val AUC : 0.7033


                                                      


Epoch 9/10
  Train Loss: 0.1456 | Train Acc: 0.9625
  Val   Acc : 0.7000 | Val AUC : 0.8242


                                                      


Epoch 10/10
  Train Loss: 0.0947 | Train Acc: 0.9500
  Val   Acc : 0.6500 | Val AUC : 0.8462

FINAL VALIDATION RESULTS (last epoch eval)
Best Val Accuracy: 0.8000
Last Val Accuracy: 0.6500
Last Val AUC     : 0.8462

              precision    recall  f1-score   support

 Not Hateful       0.69      0.85      0.76        13
     Hateful       0.50      0.29      0.36         7

    accuracy                           0.65        20
   macro avg       0.59      0.57      0.56        20
weighted avg       0.62      0.65      0.62        20

✓ Confusion matrix saved to: /Users/shrutisivakumar/Library/CloudStorage/OneDrive-Personal/College Stuff/Sem 6/Projects/NLP/Facebook-Hateful-Memes-Challenge-2020/results/confusion_matrix.png

Example prediction:
  pred=1 (Hateful) | conf=0.7447
  probs: Not Hateful=0.2553 | Hateful=0.7447
