# **Mount Google Drive**



In [None]:
from google.colab import drive
import sys

# Mount Google Drive at /content/drive; CFG_B.DRIVE_PATH points below this mount.
drive.mount('/content/drive')

# **Import Necessary Packages**

In [None]:
!pip install -q transformers datasets scikit-learn seaborn

import os
import pandas as pd
import numpy as np
from PIL import Image
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import CLIPProcessor, CLIPTokenizer, CLIPModel, get_scheduler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from tqdm.notebook import tqdm
import glob
import gc
import matplotlib.pyplot as plt
import seaborn as sns


# **Class for Configuring Files and Hyperparameters**

In [None]:
from sklearn.utils import resample

class CFG_B:
    """Paths and hyperparameters for Subtask B, exposed as class attributes.

    ``num_classes`` is not declared here; the data-loading code attaches it
    to this class once the label mapping is known.
    """

    # Project root on the mounted Google Drive.
    DRIVE_PATH = "/content/drive/MyDrive/case-2025"

    # --- Training split: images live in per-class folders under train_dir ---
    train_dir = os.path.join(DRIVE_PATH, "SubTaskB/Train")
    train_text_path = os.path.join(DRIVE_PATH, "SubTaskB/Train/STask_B_train.csv")

    # --- Validation split: images plus separate text and label CSV files ---
    val_image_dir = os.path.join(DRIVE_PATH, "SubTaskB/Eval/STask_B_val_img")
    val_text_path = os.path.join(DRIVE_PATH, "SubTaskB/Eval/STask-B(index,text)val.csv")
    val_labels_path = os.path.join(DRIVE_PATH, "SubTaskB/Eval/STask-B(index,label)val.csv")

    # --- Test split: images plus a text CSV (no labels) ---
    test_image_dir = os.path.join(DRIVE_PATH, "SubTaskB/Test/STask_B_test_img")
    test_csv_path = os.path.join(DRIVE_PATH, "SubTaskB/Test/STask-B(index,text)test.csv")

    # Where checkpoints and the submission file are written.
    output_dir = os.path.join(DRIVE_PATH, "output_subtask_b_resampled")

    # --- Model and input sizes ---
    model_name = "openai/clip-vit-large-patch14"
    image_size = 224
    max_token_len = 77

    # --- Optimisation: the backbone gets a smaller learning rate than the head ---
    learning_rate_base = 1e-6
    learning_rate_head = 1e-5
    epochs = 8

    # --- Data loading ---
    batch_size = 16
    num_workers = 2

    # Use the GPU when one is visible to torch, otherwise fall back to CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"

# Create the output folder up front so later saves cannot fail on a missing directory.
os.makedirs(CFG_B.output_dir, exist_ok=True)

try:
    # Training labels come from the folder layout: every PNG sits in a
    # directory named after its class. Files in any other folder are ignored.
    class_folders = ['Undirected', 'Individual', 'Community', 'Organization']
    train_image_paths = glob.glob(os.path.join(CFG_B.train_dir, '**/*.png'), recursive=True)
    folder_and_path = [
        (os.path.basename(os.path.dirname(png_path)), png_path)
        for png_path in train_image_paths
    ]
    train_data = [
        {'index': os.path.basename(png_path), 'label_text': folder_name}
        for folder_name, png_path in folder_and_path
        if folder_name in class_folders
    ]
    ground_truth_labels_df = pd.DataFrame(train_data)

    # Attach the text of each training sample by joining on the image file name.
    text_data_df = pd.read_csv(CFG_B.train_text_path, usecols=['index', 'text'])
    train_df_b = ground_truth_labels_df.merge(text_data_df, on="index")

    # Validation text and labels are stored in two CSV files sharing the 'index' column.
    val_labels_df = pd.read_csv(CFG_B.val_labels_path)
    val_text_df = pd.read_csv(CFG_B.val_text_path)
    val_df_b = val_text_df.merge(val_labels_df, on="index")
    print("Data loaded successfully.")

    # Fixed class-name -> id mapping, applied to the training labels. The
    # validation CSV already holds numeric labels, so its column is only
    # renamed and cast to int.
    official_target_map = {'Undirected': 0, 'Individual': 1, 'Community': 2, 'Organization': 3}
    train_df_b['label_encoded'] = train_df_b['label_text'].map(official_target_map)
    val_df_b = val_df_b.rename(columns={'label': 'label_encoded'})
    val_df_b['label_encoded'] = val_df_b['label_encoded'].astype(int)

    # Published on the config class so the model cells can read it.
    CFG_B.num_classes = len(official_target_map)

except Exception as err:
    # Any failure above stops the run after reporting the message.
    print(f"\n ERROR during data processing: {err}")
    sys.exit()

# OVER-SAMPLING THE TRAINING DATA

In [None]:
# The most frequent class defines the size every other class is grown to.
label_counts = train_df_b['label_text'].value_counts()
majority_class_name = label_counts.idxmax()
majority_class_size = int(label_counts.loc[majority_class_name])
print(f"Majority class '{majority_class_name}' has {majority_class_size} samples.")


resampled_dfs = []
# Visit the classes in order of first appearance in the training frame.
for class_name in train_df_b['label_text'].unique():
    class_rows = train_df_b.loc[train_df_b['label_text'] == class_name]
    if class_name != majority_class_name:
        # Minority class: draw with replacement until it matches the
        # majority size; the fixed seed keeps the draw reproducible.
        class_rows = resample(
            class_rows,
            replace=True,
            n_samples=majority_class_size,
            random_state=42,
        )
    # The majority class is appended unchanged.
    resampled_dfs.append(class_rows)

# Stack the per-class frames into the balanced training set.
train_df_b_resampled = pd.concat(resampled_dfs)
print(f"Over-sampling complete. New training set size: {len(train_df_b_resampled)}")

# **Data Visualization**

In [None]:
# Side-by-side class counts before and after over-sampling.
print("\n--> Visualizing data characteristics...")
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# (axis, data frame, colour palette, panel title) for each of the two panels.
distribution_panels = (
    (ax1, train_df_b, 'viridis', 'Original Training Set Distribution'),
    (ax2, train_df_b_resampled, 'magma', 'Resampled Training Set Distribution'),
)
for panel_axis, panel_df, palette_name, panel_title in distribution_panels:
    sns.countplot(x='label_text', data=panel_df, ax=panel_axis, palette=palette_name, order=class_folders)
    panel_axis.set_title(panel_title)
    panel_axis.set_xlabel('Target Class')
    panel_axis.set_ylabel('Count')
    panel_axis.tick_params(axis='x', rotation=15)

plt.suptitle('Data Distribution Before and After Over-sampling', fontsize=16)
plt.show()


# **Dataset and DataLoaders**

In [None]:
# Load the CLIP processor for the checkpoint named in CFG_B.model_name.
# The dataset class below uses it for both text tokenization and image preprocessing.
processor = CLIPProcessor.from_pretrained(CFG_B.model_name)

# Define the Self-Contained Dataset Class
class HateSpeechDataset(Dataset):
    """
    Custom PyTorch Dataset for Subtask B.

    Each item pairs the tokenized text of one row of ``df`` with the
    preprocessed image whose file name is stored in the row's ``index``
    column. The class is self-contained and receives all its dependencies
    directly.

    Args:
        df: DataFrame with an ``index`` column (image file name), an optional
            ``text`` column and, unless ``is_test`` is set, a
            ``label_encoded`` column.
        processor: Callable used for both text (``text=...``) and images
            (``images=...``); it must return ``input_ids``/``attention_mask``
            and ``pixel_values`` respectively.
        image_dir: Directory searched recursively for image files.
        image_size: Side length of the black placeholder used when a row's
            image file is not found.
        max_token_len: Length the text is truncated/padded to.
        is_test: When True, items carry no ``label`` entry.
    """

    def __init__(self, df, processor, image_dir, image_size, max_token_len, is_test=False):
        self.df = df
        self.processor = processor
        self.image_dir = image_dir
        self.image_size = image_size
        self.max_token_len = max_token_len
        self.is_test = is_test

        # Index every file under image_dir by its base name for O(1) lookup.
        search_pattern = os.path.join(image_dir, '**/*.*')
        self.image_path_map = {
            os.path.basename(p): p
            for p in glob.glob(search_pattern, recursive=True)
        }
        print(f"   - Dataset initialized with {len(self.df)} samples. Found {len(self.image_path_map)} images.")

        # Rows without an image are served with a black placeholder in
        # __getitem__; report how many so the substitution is not silent.
        missing_images = sum(1 for name in self.df['index'] if name not in self.image_path_map)
        if missing_images:
            print(f"   - WARNING: {missing_images} samples have no matching image file; a black placeholder will be used.")

    @staticmethod
    def _clean_text(value):
        """Return ``value`` as a string, mapping missing text (None/NaN) to ''.

        Without this, an empty CSV cell (read by pandas as NaN) would be
        stringified to the literal word "nan" and tokenized as real text.
        """
        if value is None:
            return ''
        # NaN is the only float that is not equal to itself.
        if isinstance(value, float) and value != value:
            return ''
        return str(value)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return a dict with ``input_ids``, ``attention_mask``, ``pixel_values``
        and, unless ``is_test`` is set, ``label`` for row ``idx``."""
        row = self.df.iloc[idx]
        text = self._clean_text(row.get('text', ''))
        tokenized = self.processor(
            text=text,
            truncation=True,
            max_length=self.max_token_len,
            padding="max_length",
            return_tensors="pt"
        )
        # The processor returns a batch of one; drop the batch dimension.
        input_ids = tokenized['input_ids'].squeeze(0)
        attention_mask = tokenized['attention_mask'].squeeze(0)

        image_name = row['index']
        image_path = self.image_path_map.get(image_name)

        if image_path:
            # The context manager closes the underlying file as soon as the
            # pixel data has been converted, instead of leaving it to the GC.
            with Image.open(image_path) as source_image:
                image = source_image.convert("RGB")
        else:
            # No file for this row: fall back to a black placeholder image.
            image = Image.new('RGB', (self.image_size, self.image_size), 'black')

        processed_image = self.processor(images=image, return_tensors="pt")
        pixel_values = processed_image['pixel_values'].squeeze(0)

        item = {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'pixel_values': pixel_values
        }
        if not self.is_test:
            item['label'] = torch.tensor(row['label_encoded'], dtype=torch.long)

        return item

# Datasets and DataLoaders for Subtask B.
# The test CSV carries no labels, hence is_test=True on its dataset below.
test_df_b = pd.read_csv(CFG_B.test_csv_path)

train_dataset_b = HateSpeechDataset(
    df=train_df_b_resampled,
    processor=processor,
    image_dir=CFG_B.train_dir,
    image_size=CFG_B.image_size,
    max_token_len=CFG_B.max_token_len,
)
val_dataset_b = HateSpeechDataset(
    df=val_df_b,
    processor=processor,
    image_dir=CFG_B.val_image_dir,
    image_size=CFG_B.image_size,
    max_token_len=CFG_B.max_token_len,
)
test_dataset_b = HateSpeechDataset(
    df=test_df_b,
    processor=processor,
    image_dir=CFG_B.test_image_dir,
    image_size=CFG_B.image_size,
    max_token_len=CFG_B.max_token_len,
    is_test=True,
)

# Settings common to all loaders; only the training loader shuffles.
_loader_settings = {'batch_size': CFG_B.batch_size, 'num_workers': CFG_B.num_workers}
train_loader_b = DataLoader(train_dataset_b, shuffle=True, **_loader_settings)
val_loader_b = DataLoader(val_dataset_b, shuffle=False, **_loader_settings)
test_loader_b = DataLoader(test_dataset_b, shuffle=False, **_loader_settings)

print("DataLoaders created. Training loader is now using the resampled, balanced data.")
# Captions are left-padded to a common width so the batch counts line up.
for caption, loader in (
    ("Training batches:", train_loader_b),
    ("Validation batches:", val_loader_b),
    ("Test batches:", test_loader_b),
):
    print(f"{caption:<19} {len(loader)}")

# Clean up to free memory
gc.collect()

# **Defining the Model Architecture**

In [None]:
class CLIPForHateSpeech(nn.Module):
    """CLIP backbone followed by an MLP head over the joined image/text embeddings."""

    def __init__(self, model_name, num_classes):
        """Load the pretrained CLIP model and build the classification head.

        Args:
            model_name: Checkpoint identifier passed to ``CLIPModel.from_pretrained``.
            num_classes: Number of output logits.
        """
        super().__init__()
        self.clip = CLIPModel.from_pretrained(model_name)
        embed_dim = self.clip.projection_dim
        # This is our original, more stable classifier architecture.
        # Input is twice the projection size because image and text
        # embeddings are concatenated in forward().
        head_layers = [
            nn.Linear(2 * embed_dim, embed_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(embed_dim, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask, pixel_values):
        """Return class logits for a batch of tokenized text and preprocessed images."""
        clip_out = self.clip(
            pixel_values=pixel_values,
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        # Image embedding first, then text embedding, joined along the feature axis.
        fused = torch.cat([clip_out.image_embeds, clip_out.text_embeds], dim=1)
        return self.classifier(fused)

#Instantiate the Model

In [None]:
# Build the classifier, then move it to the configured device.
model_b = CLIPForHateSpeech(CFG_B.model_name, CFG_B.num_classes)
model_b = model_b.to(CFG_B.device)
print(f"Model instantiated with {CFG_B.num_classes} output classes.")

#Defining the Training Engine with STANDARD Loss

In [None]:
# Plain cross-entropy over the class logits, with no class weights.
criterion = nn.CrossEntropyLoss()

# Two parameter groups so the pretrained backbone and the freshly initialised
# head are updated with different learning rates.
backbone_group = {'params': model_b.clip.parameters(), 'lr': CFG_B.learning_rate_base}
head_group = {'params': model_b.classifier.parameters(), 'lr': CFG_B.learning_rate_head}
optimizer = AdamW([backbone_group, head_group])

# The scheduler is stepped once per batch, so the schedule length is
# epochs x batches; the first 10% of those steps are warm-up.
num_training_steps = CFG_B.epochs * len(train_loader_b)
warmup_steps = int(0.1 * num_training_steps)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=num_training_steps,
)
print("Optimizer and LR Scheduler are ready.")

# Helper Functions

In [None]:
def train_one_epoch(model, loader, optimizer, criterion, scheduler, device):
    """Run one training pass over ``loader`` and return the mean per-batch loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask, pixel_values)``.
        loader: Iterable of batches; each batch is a dict with the keys
            ``pixel_values``, ``input_ids``, ``attention_mask`` and ``label``.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(outputs, labels)``.
        scheduler: Learning-rate scheduler stepped once per batch, after the optimizer.
        device: Device the batch tensors are moved to.

    Returns:
        Sum of the batch losses divided by ``len(loader)``.

    Raises:
        ZeroDivisionError: If ``loader`` has no batches.
    """
    model.train()
    # Start from clean gradients: anything left behind by an interrupted run
    # or an earlier backward pass would otherwise be accumulated into the
    # first optimizer step of this epoch.
    optimizer.zero_grad()
    total_loss = 0
    progress_bar = tqdm(loader, desc="Training")
    for batch in progress_bar:
        pixel_values = batch['pixel_values'].to(device)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask, pixel_values)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        # Clear gradients right after the update so none stay allocated
        # once the epoch (and the following validation pass) is running.
        optimizer.zero_grad()

        # Read the scalar once; each .item() call copies it back to the host.
        batch_loss = loss.item()
        total_loss += batch_loss
        progress_bar.set_postfix(loss=f"{batch_loss:.4f}")
    return total_loss / len(loader)

def validate_one_epoch(model, loader, criterion, device):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    Args:
        model: Module called as ``model(input_ids, attention_mask, pixel_values)``.
        loader: Iterable of batch dicts with the keys ``pixel_values``,
            ``input_ids``, ``attention_mask`` and ``label``.
        criterion: Loss called as ``criterion(outputs, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(avg_loss, accuracy, precision, recall, f1)``. The loss is the
        mean over batches; precision, recall and F1 are macro-averaged with
        undefined values counted as 0.
    """
    model.eval()
    running_loss = 0
    predicted, expected = [], []
    tensor_keys = ('pixel_values', 'input_ids', 'attention_mask', 'label')
    with torch.no_grad():
        for batch in tqdm(loader, desc="Validating"):
            pixel_values, input_ids, attention_mask, labels = (
                batch[key].to(device) for key in tensor_keys
            )
            logits = model(input_ids, attention_mask, pixel_values)
            running_loss += criterion(logits, labels).item()
            # Predicted class = index of the largest logit in each row.
            predicted.extend(logits.argmax(dim=1).cpu().numpy())
            expected.extend(labels.cpu().numpy())

    avg_loss = running_loss / len(loader)
    accuracy = accuracy_score(expected, predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(
        expected, predicted, average='macro', zero_division=0
    )
    return avg_loss, accuracy, precision, recall, f1

# **Main Training Loop**

In [None]:
# One list per tracked metric; each epoch appends a single value to every list.
history_b = {
    metric_name: []
    for metric_name in (
        'train_loss', 'val_loss', 'val_accuracy',
        'val_precision', 'val_recall', 'val_f1',
    )
}

best_val_f1 = 0.0
model_path_b = os.path.join(CFG_B.output_dir, 'best_model_subtask_b.pth')

print("--- Starting Subtask B Fine-Tuning ---")

for epoch in range(CFG_B.epochs):
    print(f"\n===== Epoch {epoch + 1}/{CFG_B.epochs} =====")

    train_loss = train_one_epoch(model_b, train_loader_b, optimizer, criterion, lr_scheduler, CFG_B.device)
    val_loss, val_acc, val_prec, val_rec, val_f1 = validate_one_epoch(model_b, val_loader_b, criterion, CFG_B.device)

    # Record this epoch's numbers under their history keys.
    epoch_results = {
        'train_loss': train_loss,
        'val_loss': val_loss,
        'val_accuracy': val_acc,
        'val_precision': val_prec,
        'val_recall': val_rec,
        'val_f1': val_f1,
    }
    for metric_name, metric_value in epoch_results.items():
        history_b[metric_name].append(metric_value)

    print("\n".join([
        f"Epoch {epoch + 1} Summary:",
        f"Train Loss: {train_loss:.4f}",
        f"Val Loss:   {val_loss:.4f}",
        " - Validation Metrics -",
        f"Accuracy:  {val_acc:.4f}",
        f"Precision (Macro): {val_prec:.4f}",
        f"Recall (Macro):    {val_rec:.4f}",
        f"F1-Score (Macro):  {val_f1:.4f}",
    ]))

    # Checkpoint only on a strict improvement of the macro F1-score.
    improved = val_f1 > best_val_f1
    if not improved:
        print("  - F1-score did not improve.")
        continue

    best_val_f1 = val_f1
    print(f"New best F1-score! Saving model to {model_path_b}")
    torch.save(model_b.state_dict(), model_path_b)

print("\n Training Finished")
print(f"Best validation F1-Score for Subtask B achieved: {best_val_f1:.4f}")

# Plotting the Training History

In [None]:
# Loss curves on the left axis, validation macro-F1 on a second y-axis.
fig, ax1 = plt.subplots(figsize=(12, 5))

# Left axis: training and validation loss per epoch.
loss_curves = (
    ('train_loss', 'r-o', {'label': 'Train Loss'}),
    ('val_loss', 'orange', {'marker': 'o', 'label': 'Validation Loss'}),
)
for history_key, line_format, line_options in loss_curves:
    ax1.plot(history_b[history_key], line_format, **line_options)
ax1.set(xlabel='Epoch', ylabel='Loss')
ax1.legend(loc='upper left')

# Right axis shares the x-axis and carries the F1 curve on its own scale.
ax2 = ax1.twinx()
ax2.plot(history_b['val_f1'], 'b-x', label='Validation F1-Score (Macro)')
ax2.set(ylabel='F1-Score (Macro)')
ax2.legend(loc='upper right')

plt.title('Subtask B: Training and Validation History')
plt.show()

# **Final Prediction on Test Set**

In [None]:
import json

class CLIPForHateSpeech(nn.Module):
    """CLIP-based classifier, re-declared for the prediction cell.

    The layer names and shapes repeat the definition used for training
    earlier in this file, so a state dict saved from that model can be
    loaded into this one.
    """

    def __init__(self, model_name, num_classes):
        """Load the pretrained CLIP model and build the classification head."""
        super().__init__()
        self.clip = CLIPModel.from_pretrained(model_name)
        width = self.clip.projection_dim
        # The head consumes the concatenated image and text embeddings, hence 2 * width inputs.
        self.classifier = nn.Sequential(
            nn.Linear(in_features=2 * width, out_features=width),
            nn.ReLU(),
            nn.Dropout(p=0.3),
            nn.Linear(in_features=width, out_features=num_classes),
        )

    def forward(self, input_ids, attention_mask, pixel_values):
        """Return class logits for a batch of tokenized text and preprocessed images."""
        clip_inputs = {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'pixel_values': pixel_values,
        }
        embeddings = self.clip(**clip_inputs)
        # Image embedding first, then text embedding, along the feature axis.
        joint = torch.cat((embeddings.image_embeds, embeddings.text_embeds), dim=1)
        return self.classifier(joint)

# Define the Prediction Function
def predict_subtask_b(model_path, test_loader, device):
    """Load the saved classifier weights and predict a class id for every test sample.

    The model architecture is rebuilt from ``CFG_B.model_name`` and
    ``CFG_B.num_classes`` before the weights are loaded.

    Args:
        model_path: Path of the state dict written by ``torch.save``.
        test_loader: Iterable of batch dicts with the keys ``pixel_values``,
            ``input_ids`` and ``attention_mask`` (no labels).
        device: Device the model and the batch tensors are placed on.

    Returns:
        List with one predicted class index per sample, in loader order.
    """
    print("--> Instantiating model architecture for prediction...")
    model = CLIPForHateSpeech(
        model_name=CFG_B.model_name,
        num_classes=CFG_B.num_classes
    ).to(device)

    # Load the saved weights from your best model.
    # map_location remaps the stored tensors onto `device`, so a checkpoint
    # written on a GPU can also be loaded on a CPU-only runtime.
    print(f"--> Loading best model weights from: {model_path}")
    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict)
    model.eval()

    all_preds = []

    with torch.no_grad():
        progress_bar = tqdm(test_loader, desc="Predicting on Test Set")
        for batch in progress_bar:
            pixel_values = batch['pixel_values'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask, pixel_values)
            # Predicted class = index of the largest logit in each row.
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())

    return all_preds

# Predict with the best checkpoint and write the submission file.
model_path = os.path.join(CFG_B.output_dir, 'best_model_subtask_b.pth')
predictions = predict_subtask_b(model_path, test_loader_b, CFG_B.device)

indices = test_df_b['index'].tolist()
submission_path = os.path.join(CFG_B.output_dir, 'submission.json')

# One JSON object per line, pairing each test index with its predicted class id.
# The generator is consumed while the file is open, so lines are produced and
# written in test-set order.
with open(submission_path, 'w') as f:
    f.writelines(
        json.dumps({"index": sample_index, "prediction": int(predicted_class)}) + '\n'
        for sample_index, predicted_class in zip(indices, predictions)
    )

print(f"\n Submission file for Subtask B created successfully at: {submission_path}")

# The first few lines of the created file
print("\n Submission File Preview: ")
!head -n 5 "{submission_path}"