## DOWNLOADING DATASET

In [None]:
import kagglehub

# Fetch the latest published version of the dataset through kagglehub.
# `path` holds whatever location kagglehub reports for the downloaded files
# and is reused by the copy cell below.
path = kagglehub.dataset_download(
    "chauri/facebook-hateful-memes"
)
print(f"Path to dataset files: {path}")

In [None]:
import shutil
import os

# Copy the downloaded dataset into the working directory.
# The source is the location kagglehub actually reported (`path`) rather than a
# hard-coded cache path, so this cell keeps working when the dataset version changes.
source_dir = path
destination_dir = "/content"
target_dir = os.path.join(destination_dir, "facebook-hateful-memes")

if os.path.exists(source_dir):
    try:
        # dirs_exist_ok=True merges into an existing target, so re-running the cell is safe.
        shutil.copytree(source_dir, target_dir, dirs_exist_ok=True)
        print(f"Successfully copied files from {source_dir} to {target_dir}")
    except OSError as e:
        print(f"Error copying files: {e}")
else:
    print(f"Source directory '{source_dir}' does not exist.")


#prompt: Write a Python script to copy a directory's contents to a destination if the source exists merging files if needed.

## LOADING DATASET

In [None]:
import pandas as pd

def load_jsonl(path):
    """Read a JSON Lines file (one JSON object per line) into a DataFrame.

    Args:
        path: Location of the ``.jsonl`` file, handed straight to ``pandas.read_json``.

    Returns:
        A ``pandas.DataFrame`` with one row per line of the file.
    """
    frame = pd.read_json(path, lines=True)
    return frame

# Load dataframes: one per annotation file, in the same order as the names on the left.
train_df, dev_seen_df, dev_unseen_df, test_seen_df, test_unseen_df = map(
    load_jsonl,
    (
        '/content/facebook-hateful-memes/hateful_memes/train.jsonl',
        '/content/facebook-hateful-memes/hateful_memes/dev_seen.jsonl',
        '/content/facebook-hateful-memes/hateful_memes/dev_unseen.jsonl',
        '/content/facebook-hateful-memes/hateful_memes/test_seen.jsonl',
        '/content/facebook-hateful-memes/hateful_memes/test_unseen.jsonl',
    ),
)

# Sanity check: show the first rows of the training split
print(train_df.head())


## IMPORTING LIBRARIES

In [None]:
import os
import torch
import torch.nn as nn
import torchvision.models as models
from transformers import BertModel, BertTokenizer, DistilBertTokenizer, DistilBertModel
from PIL import Image
from torch.utils.data import Dataset, DataLoader, random_split, TensorDataset
from torchvision import transforms
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# RESNET50 + BERT

## DEFINING DATASET

In [None]:
# 1) Load data & set up transforms
# Training annotations are stored as JSON Lines.
train_df = pd.read_json('/content/facebook-hateful-memes/hateful_memes/train.jsonl', lines=True)
img_dir = '/content/facebook-hateful-memes/hateful_memes/img'
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Image pipeline: resize to 224x224, convert to a tensor, then normalise per channel.
data_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

# 2) Dataset
class HatefulMemesDataset(Dataset):
    """Dataset yielding ``(image, input_ids, attention_mask, label)`` per row.

    Args:
        df: DataFrame with 'img', 'text' and 'label' columns. Its index is reset
            so that positional lookups match the indices passed to ``__getitem__``.
        img_dir: Directory containing the image files.
        tokenizer: Callable returning a mapping with 'input_ids' and
            'attention_mask' tensors (called with ``return_tensors='pt'``).
        transform: Optional callable applied to the loaded RGB image.
    """

    def __init__(self, df, img_dir, tokenizer, transform=None):
        self.df, self.img_dir = df.reset_index(drop=True), img_dir
        self.tokenizer, self.transform = tokenizer, transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        # image: only the file name from the 'img' column is used, joined onto img_dir
        path = os.path.join(self.img_dir, os.path.basename(row['img']))
        # The context manager closes the file handle once the RGB copy exists.
        with Image.open(path) as raw:
            img = raw.convert('RGB')
        if self.transform:
            img = self.transform(img)
        # text: pad / truncate every caption to 128 tokens
        toks = self.tokenizer(
            row['text'], padding='max_length',
            truncation=True, max_length=128, return_tensors='pt'
        )
        # Explicit long dtype (as in the other dataset classes of this notebook) so the
        # label is a valid class index for CrossEntropyLoss even if the column is float.
        label = torch.tensor(row['label'], dtype=torch.long)
        return img, toks['input_ids'].squeeze(0), toks['attention_mask'].squeeze(0), label


# prompt:Write a Python script to apply image transformations and tokenize text using BERT.
# Create a custom dataset class that returns image, tokenized text, attention mask, and label for each entry.

## CREATING DATASET FOR FEATURE EXTRACTION

In [None]:
# 3) Feature extractor
class FeatureExtractor1(nn.Module):
    """Pairs a ResNet-50 image backbone with a BERT text encoder.

    The ResNet classification head (``fc``) is replaced by ``nn.Identity`` so
    the image branch returns the backbone output instead of class scores.
    """

    def __init__(self):
        super().__init__()
        backbone = models.resnet50(pretrained=True)
        backbone.fc = nn.Identity()
        self.image_model = backbone
        self.text_model = BertModel.from_pretrained('bert-base-uncased')

    def forward(self, img, ids, mask):
        """Return ``(image_features, text_features)`` for one batch.

        The text features are the ``pooler_output`` of the text model.
        """
        image_features = self.image_model(img)
        text_output = self.text_model(input_ids=ids, attention_mask=mask)
        return image_features, text_output.pooler_output

# prompt: Write a Python class that defines a feature extractor using ResNet50 for images and BERT for text.
# The forward method should return image and text features.

In [None]:
@torch.no_grad()
def extract_embeddings1(extractor, loader, device):
    """Run `extractor` over every batch of `loader` and collect the results on the CPU.

    Args:
        extractor: Module called as ``extractor(img, ids, mask)`` and returning
            ``(image_features, text_features)``. It is moved to `device` and put
            in eval mode.
        loader: Iterable of ``(img, ids, mask, label)`` batches.
        device: Device the inputs and the extractor are moved to.

    Returns:
        Tuple ``(image_features, text_features, labels)``, each concatenated
        over all batches.
    """
    extractor.to(device).eval()
    image_chunks, text_chunks, label_chunks = [], [], []
    for batch in loader:
        img, ids, mask, y = batch
        image_out, text_out = extractor(img.to(device), ids.to(device), mask.to(device))
        image_chunks.append(image_out.cpu())
        text_chunks.append(text_out.cpu())
        label_chunks.append(y)  # labels are collected as delivered by the loader
    return torch.cat(image_chunks), torch.cat(text_chunks), torch.cat(label_chunks)

# 4) Split, extract & SMOTE
full_ds = HatefulMemesDataset(train_df, img_dir, tokenizer, data_transforms)

# 80/20 train/validation split; the seeded generator makes the split repeatable.
train_ds, val_ds = random_split(
    full_ds,
    [int(0.8 * len(full_ds)), len(full_ds) - int(0.8 * len(full_ds))],
    generator=torch.Generator().manual_seed(42),
)

# Neither loader shuffles: both are only used to run the feature extractor.
train_loader = DataLoader(train_ds, batch_size=32, shuffle=False)
val_loader = DataLoader(val_ds, batch_size=32, shuffle=False)

# prompt1: Write a function to extract embeddings from a dataset using a feature extractor.
# The function should run in evaluation mode and return concatenated image text features and labels.

# prompt2: Write a script to split a dataset into training and validation sets (80/20 ratio) and create corresponding data loaders for each.


## EMBEDDINGS AND SMOTE

In [None]:
# Use the GPU when one is available; the same `device` is reused by later cells.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
extractor1 = FeatureExtractor1()

# Run the extractor over the training split once (no gradients are tracked there).
img_feats, txt_feats, labels = extract_embeddings1(extractor1, train_loader, device)
# scale text features so the text branch carries twice the magnitude of the raw output
txt_weight = 2.0
txt_feats  = txt_feats * txt_weight

# SMOTE operates on NumPy arrays: one row per sample, image features first, then text.
X = torch.cat([img_feats, txt_feats], dim=1).numpy()
y = labels.numpy()

# Oversample on the extracted training features only; the validation split is untouched.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)
X_t = torch.tensor(X_res, dtype=torch.float32)
y_t = torch.tensor(y_res, dtype=torch.long)
# Loader over the resampled feature vectors; this is what the classifier trains on.
classifier_loader = DataLoader(TensorDataset(X_t, y_t), batch_size=32, shuffle=True)

# prompt: "Write a function to extract image and text features from a dataset, scale text features
# apply SMOTE for class balancing and create a data loader for the resampled dataset."


## MODEL TRAINING

In [None]:
# 5) Weighted two-branch classifier
# Feature widths of the two modalities, read from the extracted embeddings.
img_dim = img_feats.shape[1]
txt_dim = txt_feats.shape[1]
class WeightedMultiModalClassifier(nn.Module):
    """Two-branch classifier over a concatenated ``[image | text]`` feature vector.

    Each modality is projected to `hidden` units with its own Linear+ReLU branch;
    the two projections are summed and passed through dropout and a 2-way linear head.

    Args:
        img_dim: Number of leading columns of the input that hold image features.
        txt_dim: Number of trailing columns that hold text features.
        hidden: Width of each branch's projection.
    """

    def __init__(self, img_dim, txt_dim, hidden=256):
        super().__init__()
        # Keep the widths on the instance so forward() splits according to the
        # constructor arguments instead of whatever module-level globals exist.
        self.img_dim, self.txt_dim = img_dim, txt_dim
        self.img_fc = nn.Sequential(nn.Linear(img_dim, hidden), nn.ReLU())
        self.txt_fc = nn.Sequential(nn.Linear(txt_dim, hidden), nn.ReLU())
        self.classifier = nn.Sequential(nn.Dropout(0.4), nn.Linear(hidden, 2))

    def forward(self, x):
        """Return logits of shape ``(batch, 2)`` for `x` of width ``img_dim + txt_dim``."""
        img_part, txt_part = torch.split(x, [self.img_dim, self.txt_dim], dim=1)
        h_img = self.img_fc(img_part)
        h_txt = self.txt_fc(txt_part)  # any text weighting is applied by the caller
        return self.classifier(h_img + h_txt)

# Build the fusion classifier with the measured feature widths and move it to the compute device.
resnet50_bert = WeightedMultiModalClassifier(img_dim=img_dim, txt_dim=txt_dim).to(device)

# 6) Training loop
def train_model(model, train_loader, val_loader, extractor, epochs=10, lr=1e-4, patience=3,
                text_scale=None, device=None):
    """Train `model` on pre-extracted features, early-stopping on validation loss.

    Args:
        model: Classifier taking one concatenated ``[image | text]`` feature tensor.
        train_loader: Iterable of ``(features, labels)`` batches.
        val_loader: Iterable of raw ``(img, ids, mask, labels)`` batches; they are
            converted to features with `extractor`.
        extractor: Feature extractor returning ``(image_feats, text_feats)``. It is
            only used for inference here: the optimizer holds `model`'s parameters only.
        epochs: Maximum number of epochs.
        lr: AdamW learning rate.
        patience: Number of consecutive epochs without validation-loss improvement
            after which training stops.
        text_scale: Multiplier applied to the validation text features. When omitted,
            the notebook-level ``txt_weight`` is used, as before.
        device: Device for the batches. When omitted, the device of `model`'s first
            parameter is used (CPU if the model has no parameters).

    Returns:
        None. Per-epoch metrics are printed.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")
    if text_scale is None:
        text_scale = txt_weight

    opt = torch.optim.AdamW(model.parameters(), lr=lr)
    crit = nn.CrossEntropyLoss()
    best_loss, no_imp = float('inf'), 0

    # The extractor is never updated in this function, so the validation features
    # are identical in every epoch: compute them once instead of once per epoch.
    extractor.eval()
    val_batches = []
    with torch.no_grad():
        for img, ids, mask, y in val_loader:
            i_f, t_f = extractor(img.to(device), ids.to(device), mask.to(device))
            val_batches.append((torch.cat([i_f, t_f * text_scale], dim=1), y.to(device)))

    for epoch in range(1, epochs+1):
        # train
        model.train()
        tot_loss, correct, tot = 0, 0, 0
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            opt.zero_grad()
            out = model(x)
            loss = crit(out, y)
            loss.backward()
            opt.step()
            # Weight the batch-mean loss by batch size so the epoch value is a per-sample mean.
            tot_loss += loss.item()*y.size(0)
            correct += (out.argmax(1)==y).sum().item()
            tot += y.size(0)
        train_loss, train_acc = tot_loss/tot, correct/tot

        # validate on the cached features
        model.eval()
        val_loss, correct, tot = 0, 0, 0
        with torch.no_grad():
            for x_val, y in val_batches:
                out = model(x_val)
                loss = crit(out, y)
                val_loss += loss.item()*y.size(0)
                correct += (out.argmax(1)==y).sum().item()
                tot += y.size(0)
        val_loss, val_acc = val_loss/tot, correct/tot

        print(f"Epoch {epoch} → train: {train_loss:.4f}/{train_acc:.4f}, val: {val_loss:.4f}/{val_acc:.4f}")
        if val_loss < best_loss:
            best_loss, no_imp = val_loss, 0
        else:
            no_imp += 1
            if no_imp >= patience:
                print("Early stopping.")
                break

# 7) Train & evaluate
# Train on the SMOTE-resampled features; validation batches go through extractor1.
train_model(
    resnet50_bert,
    classifier_loader,
    val_loader,
    extractor1,
    epochs=10,
    lr=1e-4,
    patience=3,
)


# prompt1: Write a PyTorch model that combines image and text features using separate branches, followed by a classifier.
# Implement a weighted fusion of both branches

# prompt2:Write a training loop for the model with early stopping based on validation loss. Include loss calculation, accuracy evaluation, and gradient updates.

# prompt3: Train the model using a specified dataset and evaluate performance over multiple epochs, incorporating early stopping for improved efficiency.

## MODEL EVALUATION

In [None]:
# Persist the fusion classifier's weights only; the feature extractor is not part of this file.
torch.save(resnet50_bert.state_dict(), f='resnet50_bert_model.pth')

In [None]:
# 8) Final metrics on the test_unseen annotations
# (the file loaded here is test_unseen.jsonl, not the validation split used during training)
unseen_df = pd.read_json(
    '/content/facebook-hateful-memes/hateful_memes/test_unseen.jsonl',
    lines=True
)
eval_dataset = HatefulMemesDataset(
    unseen_df, img_dir, tokenizer, transform=data_transforms
)
eval_loader = DataLoader(
    eval_dataset, batch_size=32, shuffle=False, num_workers=2
)
# Extract features for the whole evaluation set, then apply the same text scaling as in training.
img_v, txt_v, lbl_v = extract_embeddings1(extractor1, eval_loader, device)
txt_v = txt_v * txt_weight
Xv = torch.cat([img_v, txt_v], dim=1).to(device)
y_true = lbl_v.numpy()
# All evaluation features are classified in one forward pass.
# NOTE(review): resnet50_bert is not switched to eval mode in this cell; it relies on
# train_model() having left it in eval mode — confirm.
with torch.no_grad():
    logits = resnet50_bert(Xv)
    probs  = torch.softmax(logits, dim=1)[:,1].cpu().numpy()  # probability of class 1
    preds  = logits.argmax(1).cpu().numpy()

bert1_acc = accuracy_score(y_true, preds)
bert1_roc = roc_auc_score(y_true, probs)

print("Eval Acc:", bert1_acc)
print("Eval AUROC:", bert1_roc)
print("Confusion Matrix:\n", confusion_matrix(y_true, preds))
print("Classification Report:\n", classification_report(y_true, preds))

# EFFICIENTNET + DISTILBERT

In [None]:
!pip install efficientnet_pytorch

## DEFINING DATASET

In [None]:
from efficientnet_pytorch import EfficientNet

# --- Hyperparameter Config ---
# Read by the loaders, train() and evaluate() of the EfficientNet + DistilBERT pipeline.
config = {
    "batch_size": 32,
    "lr": 2e-5,                # Base learning rate
    "epochs": 50,              # Upper bound on epochs; early stopping may end training sooner
    "patience": 3,             # Epochs without validation-loss improvement before stopping
    "weight_decay": 1e-4,
    "max_grad_norm": 1.0,      # Gradient-norm clipping threshold
    "img_weight": 0.1,         # Multiplier applied to image features
    "txt_weight": 1.5          # Multiplier applied to text features
}

# --- Setup ---
img_dir = '/content/facebook-hateful-memes/hateful_memes/img'
df = train_df  # Must contain 'img', 'text', 'label'
# From here on `tokenizer` is the DistilBERT tokenizer (it replaces the BERT one defined earlier).
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# Same image pipeline as the ResNet50 + BERT section: resize, to-tensor, per-channel normalisation.
data_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# --- Dataset ---
class HatefulMemesDataset(Dataset):
    """Dataset returning ``(image, input_ids, attention_mask, label)`` for one row.

    Args:
        df: DataFrame with 'img', 'text' and 'label' columns (its index is reset).
        img_dir: Directory containing the image files.
        tokenizer: Callable returning 'input_ids' and 'attention_mask' tensors.
        transform: Optional callable applied to the loaded RGB image.
    """

    def __init__(self, df, img_dir, tokenizer, transform=None):
        self.df = df.reset_index(drop=True)
        self.img_dir, self.tokenizer, self.transform = img_dir, tokenizer, transform

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        record = self.df.iloc[idx]
        # Only the file name of the 'img' entry is used; it is resolved against img_dir.
        file_name = os.path.basename(record['img'])
        image = Image.open(os.path.join(self.img_dir, file_name)).convert('RGB')
        if self.transform:
            image = self.transform(image)
        # Captions are padded / truncated to 128 tokens.
        encoded = self.tokenizer(
            record['text'],
            padding='max_length',
            truncation=True,
            max_length=128,
            return_tensors='pt',
        )
        return (
            image,
            encoded['input_ids'].squeeze(0),
            encoded['attention_mask'].squeeze(0),
            torch.tensor(record['label'], dtype=torch.long),
        )

#prompt1: "Write a dataset class that loads images and tokenizes text using DistilBERT for the Hateful Memes dataset, applying transformations to images."
#prompt2: "Configure hyperparameters for training, including batch size, learning rate, epochs, early stopping patience, and weights for image and text features."

## CREATING DATASET FOR FEATURE EXTRACTION

In [None]:
# --- Model ---
class MultiModalExtractor(nn.Module):
    """Feature extractor pairing EfficientNet-b0 (images) with DistilBERT (text)."""

    def __init__(self):
        super().__init__()
        self.image_model = EfficientNet.from_pretrained('efficientnet-b0')
        self.text_model = DistilBertModel.from_pretrained('distilbert-base-uncased')

    def forward(self, images, input_ids, attention_mask):
        """Return ``(image_features, text_features)``, each of shape ``(batch, dim)``.

        Image features are the globally average-pooled output of
        ``image_model.extract_features``; text features are the hidden state of
        the first token of ``last_hidden_state``.
        """
        feature_map = self.image_model.extract_features(images)
        # flatten(1) keeps the batch dimension even for a batch of one; a bare
        # squeeze() would collapse (1, C, 1, 1) to (C,) and break the later concat.
        img_feat = nn.functional.adaptive_avg_pool2d(feature_map, (1, 1)).flatten(1)
        txt_feat = self.text_model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]
        return img_feat, txt_feat

class MultiModalClassifier(nn.Module):
    """MLP head over concatenated image and text features.

    Args:
        image_dim: Width of the image feature vector.
        text_dim: Width of the text feature vector.
    """

    def __init__(self, image_dim=1280, text_dim=768):
        super().__init__()
        fused_dim = image_dim + text_dim
        layers = [
            nn.Linear(fused_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 2),
        ]
        self.classifier = nn.Sequential(*layers)

    def forward(self, img_feat, txt_feat):
        """Concatenate both feature tensors along dim 1 and return 2-class logits."""
        fused = torch.cat([img_feat, txt_feat], dim=1)
        return self.classifier(fused)

#prompt1:"Write a multi-modal feature extractor model using EfficientNet for images and DistilBERT for text. The forward method should return image and text features."
#prompt2: "Write a multi-modal classifier that combines image and text features and classifies the input into two categories using a fully connected network."

In [None]:
# --- Data Preparation ---
full_ds = HatefulMemesDataset(df, img_dir, tokenizer, data_transforms)
train_len = int(0.8 * len(full_ds))
val_len = len(full_ds) - train_len
# Seed the split (same seed as the ResNet50 + BERT section) so the train/validation
# membership is identical on every run instead of changing with the global RNG state.
train_ds, val_ds = random_split(
    full_ds,
    [train_len, val_len],
    generator=torch.Generator().manual_seed(42),
)
train_loader = DataLoader(train_ds, batch_size=config["batch_size"], shuffle=True)
val_loader = DataLoader(val_ds, batch_size=config["batch_size"], shuffle=False)

## EMBEDDINGS AND SMOTE

In [None]:
# --- Feature Extraction for SMOTE ---
@torch.no_grad()
def extract_embeddings2(extractor, loader, img_weight=0.1, txt_weight=1.5):
    """Extract weighted image/text features for every batch of `loader`.

    Args:
        extractor: Module called as ``extractor(imgs, ids, masks)`` and returning
            ``(image_features, text_features)``. It is put in eval mode; it must
            already live on the device selected below.
        loader: Iterable of ``(imgs, ids, masks, labels)`` batches.
        img_weight: Multiplier for the image features (default 0.1, the value
            previously hard-coded here).
        txt_weight: Multiplier for the text features (default 1.5, likewise).

    Returns:
        Tuple ``(image_features, text_features, labels)`` of CPU tensors,
        concatenated over all batches.
    """
    extractor.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    all_img, all_txt, all_lbl = [], [], []
    for imgs, ids, masks, labels in loader:
        imgs, ids, masks = imgs.to(device), ids.to(device), masks.to(device)
        img_feat, txt_feat = extractor(imgs, ids, masks)
        # Scale out of place so tensors returned by the extractor are never mutated.
        all_img.append((img_feat * img_weight).cpu())
        all_txt.append((txt_feat * txt_weight).cpu())
        all_lbl.append(labels)
    return torch.cat(all_img), torch.cat(all_txt), torch.cat(all_lbl)

#prompt: "Write a function to extract image and text embeddings from a dataset using a multi-modal feature extractor
#Apply weighted scaling to image and text features, and return concatenated embeddings for image, text, and labels.

In [None]:
# --- SMOTE ---
# Both modules are placed on the GPU when one is available.
extractor2 = MultiModalExtractor().to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
# NOTE(review): despite its name, this classifier is fed features from extractor2
# (EfficientNet-b0 + DistilBERT), not from a ResNet-18.
resnet18_disbert = MultiModalClassifier().to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
# Weighted features of the training split, extracted without gradients.
img_feat, txt_feat, labels = extract_embeddings2(extractor2, train_loader)
# One row per sample: image features first, then text features.
X = torch.cat([img_feat, txt_feat], dim=1).numpy()
y = labels.numpy()
# Oversample on the training features only; validation data is not resampled.
X_resampled, y_resampled = SMOTE(random_state=42).fit_resample(X, y)
X_tensor = torch.tensor(X_resampled, dtype=torch.float32)
y_tensor = torch.tensor(y_resampled, dtype=torch.long)
smote_loader = DataLoader(TensorDataset(X_tensor, y_tensor), batch_size=config["batch_size"], shuffle=True)

#prompt: "Write a function to extract image and text embeddings using a multi-modal extractor
#apply SMOTE to handle class imbalance, and create a data loader for the resampled dataset."


## MODEL TRAINING

In [None]:
# --- Training Function ---
def train(model, extractor, smote_loader, val_loader, config):
    """Train `model` on SMOTE-resampled features with LR scheduling and early stopping.

    Args:
        model: Classifier called as ``model(img_feat, txt_feat)``.
        extractor: Feature extractor returning ``(image_feats, text_feats)``; used
            for inference on the validation batches only.
        smote_loader: Iterable of ``(features, labels)`` batches in which each
            feature row is ``[image features | text features]``.
        val_loader: Iterable of raw ``(imgs, ids, masks, labels)`` batches.
        config: Mapping with the keys "lr", "weight_decay", "epochs", "patience",
            "max_grad_norm", "img_weight" and "txt_weight".

    Returns:
        None. Per-epoch metrics are printed.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    criterion = nn.CrossEntropyLoss()

    # Only the classifier is optimised. Training runs on pre-computed features, so no
    # gradient ever reaches the extractor; listing its parameters in the optimizer
    # (as was done before) had no effect.
    optimizer = torch.optim.AdamW(
        model.parameters(), lr=config["lr"], weight_decay=config["weight_decay"]
    )

    # The `verbose` argument of ReduceLROnPlateau is deprecated, so learning-rate
    # reductions are reported explicitly after scheduler.step() below.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=2
    )

    # The extractor does not change during training, so the weighted validation
    # features are computed once instead of once per epoch.
    extractor.eval()
    val_batches = []
    with torch.no_grad():
        for imgs, ids, masks, labels in val_loader:
            imgs, ids, masks = imgs.to(device), ids.to(device), masks.to(device)
            img_feat, txt_feat = extractor(imgs, ids, masks)
            val_batches.append((
                img_feat * config["img_weight"],
                txt_feat * config["txt_weight"],
                labels.to(device),
            ))

    # Column at which a training row switches from image to text features;
    # 1280 (the previously hard-coded width) is the fallback without validation data.
    img_width = val_batches[0][0].shape[1] if val_batches else 1280

    best_loss = float('inf')
    wait = 0

    for epoch in range(1, config["epochs"] + 1):
        model.train()
        train_loss, train_correct, train_total = 0, 0, 0

        for x, y in smote_loader:
            x, y = x.to(device), y.to(device)
            outputs = model(x[:, :img_width], x[:, img_width:])
            loss = criterion(outputs, y)
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=config["max_grad_norm"])
            optimizer.step()

            # Weight the batch-mean loss by batch size to get a per-sample epoch mean.
            train_loss += loss.item() * y.size(0)
            train_correct += (outputs.argmax(1) == y).sum().item()
            train_total += y.size(0)

        train_loss /= train_total
        train_acc = train_correct / train_total

        # --- Validation (on the cached features) ---
        model.eval()
        val_loss, val_correct, val_total = 0, 0, 0
        with torch.no_grad():
            for img_feat, txt_feat, labels in val_batches:
                outputs = model(img_feat, txt_feat)
                loss = criterion(outputs, labels)

                val_loss += loss.item() * labels.size(0)
                val_correct += (outputs.argmax(1) == labels).sum().item()
                val_total += labels.size(0)

        val_loss /= val_total
        val_acc = val_correct / val_total

        previous_lr = optimizer.param_groups[0]["lr"]
        scheduler.step(val_loss)
        current_lr = optimizer.param_groups[0]["lr"]
        if current_lr < previous_lr:
            print(f"Reducing learning rate to {current_lr:.4e}.")

        print(f"Epoch {epoch}/{config['epochs']} | "
              f"Train Loss: {train_loss:.4f}, Acc: {train_acc:.4f} | "
              f"Val Loss: {val_loss:.4f}, Acc: {val_acc:.4f}")

        if val_loss < best_loss:
            best_loss = val_loss
            wait = 0
        else:
            wait += 1
            if wait >= config["patience"]:
                print("Early stopping triggered.")
                break

# --- Train Model ---
# Fit the classifier on the SMOTE-resampled features, validating on val_loader.
train(
    model=resnet18_disbert,
    extractor=extractor2,
    smote_loader=smote_loader,
    val_loader=val_loader,
    config=config,
)

#prompt1: "Write a training function for a multi-modal model with SMOTE for class balancing, AdamW optimizer, gradient clipping, and early stopping."
#prompt2:"Implement learning rate scheduling with ReduceLROnPlateau and track performance during training and validation."


In [None]:
# Persist the classifier head's weights only; extractor2 is not part of this file.
torch.save(resnet18_disbert.state_dict(), f='resnet18_disbert_model.pth')

## MODEL EVALUATION

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
# --- Final evaluation of a classifier/extractor pair on a labelled loader ---
@torch.no_grad()
def evaluate(model, extractor, loader, config):
    """Print accuracy, AUROC, confusion matrix and classification report.

    Args:
        model: Classifier called as ``model(img_feat, txt_feat)``.
        extractor: Feature extractor returning ``(image_feats, text_feats)``.
        loader: Iterable of ``(img, ids, mask, label)`` batches.
        config: Mapping providing "img_weight" and "txt_weight", the multipliers
            applied to the extracted features before classification.

    Returns:
        None. All metrics are printed.
    """
    # Resolve the device locally, as train() and extract_embeddings2() do, instead of
    # depending on a `device` global defined in another section of the notebook.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    extractor.eval()
    all_preds, all_labels, all_probs = [], [], []

    for img, ids, mask, lbl in loader:
        img, ids, mask = img.to(device), ids.to(device), mask.to(device)
        img_f, txt_f = extractor(img, ids, mask)
        # Out-of-place scaling: the extractor's output tensors are left untouched.
        out = model(img_f * config["img_weight"], txt_f * config["txt_weight"])

        probs = torch.softmax(out, dim=1)[:, 1].cpu().tolist()    # pos-class probability
        preds = out.argmax(1).cpu().tolist()

        all_probs.extend(probs)
        all_preds.extend(preds)
        all_labels.extend(lbl.tolist())

    acc   = accuracy_score(all_labels, all_preds)
    auroc = roc_auc_score(all_labels, all_probs)
    cm    = confusion_matrix(all_labels, all_preds)
    report = classification_report(all_labels, all_preds)

    print(f"Accuracy: {acc:.4f}")
    print(f"AUROC:    {auroc:.4f}")
    print("Confusion Matrix:")
    print(cm)
    print("\nClassification Report:")
    print(report)

# usage

# Evaluate on the test_unseen annotations, tokenised and transformed with the current
# `tokenizer` and `data_transforms`.
unseen_df = pd.read_json('/content/facebook-hateful-memes/hateful_memes/test_unseen.jsonl', lines=True)
eval_dataset = HatefulMemesDataset(unseen_df, img_dir, tokenizer, transform=data_transforms)
eval_loader = DataLoader(eval_dataset, batch_size=32, shuffle=False, num_workers=2)
evaluate(resnet18_disbert, extractor2, eval_loader, config)


# CLIP

In [None]:
# Install dependencies
!pip install git+https://github.com/openai/CLIP.git
!pip install imbalanced-learn

import clip
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau

## DEFINING DATASET

In [None]:
# 1) Settings & Hyperparameters
# NOTE: from here on `device` is a plain string rather than a torch.device object.
device       = "cuda" if torch.cuda.is_available() else "cpu"
IMG_DIR      = '/content/facebook-hateful-memes/hateful_memes/img'
DF           = train_df                                  # your dataframe with 'img','text','label'
BATCH_SIZE   = 64
ALPHA_IMG    = 0.5   # scale image embeddings down
ALPHA_TXT    = 1.5   # scale text embeddings up
LR           = 2e-5
WEIGHT_DECAY = 1e-4
DROPOUT      = 0.5
EPOCHS       = 100   # upper bound; early stopping uses PATIENCE
PATIENCE     = 5     # epochs without validation-loss improvement before stopping
NUM_CLASSES  = 2

# 2) Load CLIP & Preprocessing
clip_model, preprocess = clip.load("ViT-B/32", device=device)
# eval() only switches the model to inference mode; it does not disable gradients.
# CLIP is effectively frozen because it is only ever called under torch.no_grad()
# and its parameters are never handed to an optimizer.
clip_model.eval()

class CLIPDataset(Dataset):
    """Dataset returning ``(image, text_tokens, label)`` prepared for CLIP.

    Args:
        df: DataFrame with 'img', 'text' and 'label' columns (its index is reset).
        img_dir: Directory containing the image files.
        transform: Callable applied to the loaded RGB image (e.g. CLIP's preprocess).
    """

    def __init__(self, df, img_dir, transform):
        self.df        = df.reset_index(drop=True)
        self.img_dir   = img_dir
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        # strip any directory and build correct path
        filename = os.path.basename(row['img'])
        img_path = os.path.join(self.img_dir, filename)

        # The context manager closes the file handle once the RGB copy exists.
        with Image.open(img_path) as raw:
            image = raw.convert('RGB')
        image = self.transform(image)

        # Cutting at 300 characters does not bound the token count, and
        # clip.tokenize raises when a text exceeds the context length unless
        # truncate=True is passed.
        text     = row['text'][:300]
        text_tok = clip.tokenize([text], truncate=True)[0]

        label = torch.tensor(row['label'], dtype=torch.long)
        return image, text_tok, label

#prompt1:"Write a script to configure hyperparameters and device settings for training, including image scaling factors, learning rate, and dropout rate."
#prompt2:"Create a custom CLIP dataset class that loads images and tokenizes text, applying the necessary transformations for both."

## CREATING DATASET

In [None]:
# Build full DataLoader for embedding precompute
# shuffle=False keeps the features in the same order as the rows of DF.
full_ds     = CLIPDataset(DF, IMG_DIR, preprocess)
full_loader = DataLoader(full_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

# 3) Precompute & weight embeddings
all_feats = []
all_lbls  = []

with torch.no_grad():
    for images, texts, labels in full_loader:
        images, texts = images.to(device), texts.to(device)
        # Cast the encoder outputs to float32 before further arithmetic.
        img_f = clip_model.encode_image(images).float()
        txt_f = clip_model.encode_text(texts).float()

        # L2-normalise each embedding; the epsilon guards against division by zero.
        img_f = img_f / (img_f.norm(dim=-1, keepdim=True) + 1e-6)
        txt_f = txt_f / (txt_f.norm(dim=-1, keepdim=True) + 1e-6)

        # Weight each modality, then concatenate image features followed by text features.
        feat = torch.cat((ALPHA_IMG * img_f, ALPHA_TXT * txt_f), dim=1)
        all_feats.append(feat.cpu())
        all_lbls.append(labels)

# From here on both names hold NumPy arrays (previously lists of per-batch tensors).
all_feats = torch.cat(all_feats, dim=0).numpy()
all_lbls  = torch.cat(all_lbls, dim=0).numpy()

#prompt: "Write a script to create a DataLoader for the CLIP dataset, precompute image and text embeddings, normalize them, scale with specified factors, and store the concatenated features and labels."

## FEATURE EXTRACTION + SMOTE

In [None]:
# 4) Train/val split + SMOTE on train
# Stratified 80/20 split of the precomputed embeddings.
X_train, X_val, y_train, y_val = train_test_split(
    all_feats, all_lbls, test_size=0.2, random_state=42, stratify=all_lbls
)

# Oversample the training portion only; validation data keeps its original distribution.
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

train_ds = TensorDataset(
    torch.from_numpy(X_train_res).float(),
    torch.from_numpy(y_train_res).long(),
)
val_ds = TensorDataset(
    torch.from_numpy(X_val).float(),
    torch.from_numpy(y_val).long(),
)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)

# Prompt 1: Split the dataset into training and validation sets, ensuring stratified sampling for class distribution.
# Prompt 2: Apply SMOTE to the training set to handle class imbalance and create DataLoader objects for both the training and validation sets.



## MODEL TRAINING

In [None]:
# 5) Classifier on CLIP embeddings
class EmbeddingClassifier(nn.Module):
    """One-hidden-layer MLP: Linear -> ReLU -> Dropout -> Linear.

    Args:
        input_dim: Width of the input feature vector.
        hidden_dim: Width of the hidden layer.
        dropout: Dropout probability applied after the activation.
        num_classes: Number of output logits.
    """

    def __init__(self, input_dim, hidden_dim=512, dropout=0.5, num_classes=2):
        super().__init__()
        hidden_layer = nn.Linear(input_dim, hidden_dim)
        output_layer = nn.Linear(hidden_dim, num_classes)
        self.fc = nn.Sequential(hidden_layer, nn.ReLU(), nn.Dropout(dropout), output_layer)

    def forward(self, x):
        """Return the logits for a batch of feature vectors."""
        logits = self.fc(x)
        return logits

# The classifier input is the image embedding concatenated with the text embedding,
# hence the 512*2 input width.
clip_model1 = EmbeddingClassifier(512 * 2, 512, DROPOUT, NUM_CLASSES).to(device)

# 6) Training loop
# Trains clip_model1 on the precomputed embeddings. The learning rate is halved after
# 2 epochs without validation-loss improvement; training stops after PATIENCE such epochs.
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(clip_model1.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
scheduler = ReduceLROnPlateau(optimizer, mode="min", factor=0.5, patience=2)

best_loss, no_improve = float('inf'), 0

for epoch in range(1, EPOCHS+1):
    # --- training pass ---
    clip_model1.train()
    train_loss, train_corr, train_total = 0, 0, 0
    for feats, labels in train_loader:
        feats, labels = feats.to(device), labels.to(device)
        optimizer.zero_grad()
        logits = clip_model1(feats)
        loss   = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        # Weight the batch-mean loss by batch size to get a per-sample epoch mean.
        train_loss += loss.item() * labels.size(0)
        preds = logits.argmax(dim=1)
        train_corr += (preds == labels).sum().item()
        train_total += labels.size(0)

    # --- validation pass (no gradients, dropout disabled) ---
    clip_model1.eval()
    val_loss, val_corr, val_total = 0, 0, 0
    with torch.no_grad():
        for feats, labels in val_loader:
            feats, labels = feats.to(device), labels.to(device)
            logits = clip_model1(feats)
            loss   = criterion(logits, labels)
            val_loss += loss.item() * labels.size(0)
            preds = logits.argmax(dim=1)
            val_corr += (preds == labels).sum().item()
            val_total += labels.size(0)

    avg_train_loss = train_loss / train_total
    train_acc      = train_corr  / train_total
    avg_val_loss   = val_loss    / val_total
    val_acc        = val_corr    / val_total
    scheduler.step(avg_val_loss)

    print(f"[Epoch {epoch}/{EPOCHS}] "
          f"Train Loss: {avg_train_loss:.4f}, Acc: {train_acc:.4f} | "
          f"Val Loss: {avg_val_loss:.4f}, Acc: {val_acc:.4f}")

    # Checkpoint whenever the validation loss improves; otherwise count towards early stopping.
    if avg_val_loss < best_loss:
        best_loss, no_improve = avg_val_loss, 0
        torch.save(clip_model1.state_dict(), "best_classifier.pth")
    else:
        no_improve += 1
        if no_improve >= PATIENCE:
            print("Early stopping.")
            break

# 7) Load best model
# Restore the checkpoint with the lowest validation loss written by the training loop.
clip_model1.load_state_dict(torch.load("best_classifier.pth", map_location=device))
clip_model1.eval()
# The message now names the file that is actually written ("best_classifier.pth").
print("Best model saved to best_classifier.pth")

# Prompt 1: Write a classifier model using fully connected layers to classify CLIP embeddings with dropout and ReLU activations.
# Prompt 2: Implement a training loop with loss calculation, optimizer, scheduler, and early stopping. Evaluate the model on both training and validation sets.


## MODEL EVALUATION

In [None]:
# 1) Load your test sets
# Both test annotation files are concatenated; rows without a label are dropped
# because the metrics below need ground truth.
seen_df   = pd.read_json('/content/facebook-hateful-memes/hateful_memes/test_seen.jsonl', lines=True)
unseen_df = pd.read_json('/content/facebook-hateful-memes/hateful_memes/test_unseen.jsonl', lines=True)
combined  = pd.concat([seen_df, unseen_df], ignore_index=True)
eval_df   = combined.dropna(subset=['label'])

# 2) DataLoader
# shuffle=False keeps predictions aligned with the row order of eval_df.
eval_loader = DataLoader(
    CLIPDataset(eval_df, IMG_DIR, preprocess),
    batch_size=32, shuffle=False, num_workers=2
)

# 3) Reload your best embedding classifier
emb_model = EmbeddingClassifier(
    input_dim=512*2,
    hidden_dim=512,
    dropout=DROPOUT,
    num_classes=NUM_CLASSES
).to(device)
# The training loop writes its checkpoint to "best_classifier.pth"; the previous
# "best_classifier.pt" is never created, so loading it fails with FileNotFoundError.
emb_model.load_state_dict(torch.load("best_classifier.pth", map_location=device))
emb_model.eval()

# 4) Evaluation loop: compute weighted embeddings → classifier → collect preds & probs
all_preds, all_labels, all_probs = [], [], []

with torch.no_grad():
    for images, texts, labels in eval_loader:
        images, texts = images.to(device), texts.to(device)
        # CLIP emb + normalize (same L2 normalisation as in the embedding precompute step)
        img_f = clip_model.encode_image(images).float()
        txt_f = clip_model.encode_text(texts).float()
        img_f = img_f / (img_f.norm(dim=-1, keepdim=True) + 1e-6)
        txt_f = txt_f / (txt_f.norm(dim=-1, keepdim=True) + 1e-6)
        # apply α weights & concat
        feats = torch.cat((ALPHA_IMG * img_f, ALPHA_TXT * txt_f), dim=1)
        # classify
        logits = emb_model(feats)
        probs  = torch.softmax(logits, dim=1)[:, 1].cpu().numpy()   # positive-class scores
        preds  = logits.argmax(dim=1).cpu().numpy()

        all_probs.extend(probs)
        all_preds.extend(preds)
        all_labels.extend(labels.numpy())

# 5) Report
# Accuracy and AUROC over the labelled rows of test_seen + test_unseen combined.
acc   = accuracy_score(all_labels, all_preds)
auroc = roc_auc_score(all_labels, all_probs)

print(f"Test Accuracy: {acc:.4f}")
print(f"Test AUROC:    {auroc:.4f}\n")
print(classification_report(
    all_labels,
    all_preds,
    target_names=[f"class_{i}" for i in range(NUM_CLASSES)]
))

# MODEL COMPARISON

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Prepare data
# The evaluation cells above only print their results, so the per-model metric
# dictionaries are assembled here from the objects those cells leave behind.
# (Previously this cell subscripted the classifier module and the `clip` package
# and referenced an undefined name, so it could not run.)
def _metric_summary(y_true, y_pred, y_prob):
    """Bundle accuracy, AUROC and the macro / weighted report rows into one dict."""
    report = classification_report(y_true, y_pred, output_dict=True)
    return {
        'acc': accuracy_score(y_true, y_pred),
        'auroc': roc_auc_score(y_true, y_prob),
        'avg': report['macro avg'],
        'weighted': report['weighted avg'],
    }


with torch.no_grad():
    # ResNet50+BERT: re-score the cached test_unseen features (Xv) against their labels (lbl_v).
    resnet_logits = resnet50_bert.eval()(Xv)
    resnet50_bert_metrics = _metric_summary(
        lbl_v.numpy(),
        resnet_logits.argmax(1).cpu().numpy(),
        torch.softmax(resnet_logits, dim=1)[:, 1].cpu().numpy(),
    )

    # EfficientNet+DistilBERT: evaluate() returns nothing, so the predictions on
    # test_unseen are recomputed here with the same feature weights.
    effnet_loader = DataLoader(
        HatefulMemesDataset(
            pd.read_json('/content/facebook-hateful-memes/hateful_memes/test_unseen.jsonl', lines=True),
            img_dir, tokenizer, transform=data_transforms,
        ),
        batch_size=32, shuffle=False, num_workers=2,
    )
    resnet18_disbert.eval()
    extractor2.eval()
    eff_true, eff_pred, eff_prob = [], [], []
    for img, ids, mask, lbl in effnet_loader:
        img_f, txt_f = extractor2(img.to(device), ids.to(device), mask.to(device))
        out = resnet18_disbert(img_f * config["img_weight"], txt_f * config["txt_weight"])
        eff_prob.extend(torch.softmax(out, dim=1)[:, 1].cpu().tolist())
        eff_pred.extend(out.argmax(1).cpu().tolist())
        eff_true.extend(lbl.tolist())
effnet_distilbert_metrics = _metric_summary(eff_true, eff_pred, eff_prob)

# CLIP: reuse the labels / predictions / scores collected by the CLIP evaluation loop.
# NOTE(review): those cover test_seen + test_unseen, while the other two models are
# scored on test_unseen only, so the bars are not strictly like-for-like.
clip_metrics = _metric_summary(all_labels, all_preds, all_probs)

# `model_names` instead of `models`, so the torchvision `models` import is not shadowed.
model_names = ['ResNet50+BERT', 'EffNet+BERT', 'CLIP']
results = [resnet50_bert_metrics, effnet_distilbert_metrics, clip_metrics]
accs = [m['acc'] for m in results]
aurocs = [m['auroc'] for m in results]
avg_f1 = [m['avg']['f1-score'] for m in results]
wt_f1 = [m['weighted']['f1-score'] for m in results]

# One bar chart per metric: accuracy, AUROC, average F1 and weighted F1.
for title, ylabel, values in (
    ('Model Accuracy', 'Accuracy', accs),
    ('Model AUROC', 'AUROC', aurocs),
    ('Average F1-Score by Model', 'F1-Score', avg_f1),
    ('Weighted F1-Score by Model', 'F1-Score', wt_f1),
):
    plt.figure()
    plt.bar(model_names, values)
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('Model')
    plt.show()
