# **Mount Google Drive**


In [None]:
from google.colab import drive
import sys
# Mount Google Drive at /content/drive; CFG.DRIVE_PATH below points inside this mount.
# `sys` is imported here because the data-loading cell calls sys.exit() on a missing file.
drive.mount('/content/drive')

In [None]:
!pip install -q transformers datasets scikit-learn seaborn

# **Import Necessary Packages**

In [None]:
import os
import pandas as pd
import numpy as np
from PIL import Image
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import CLIPProcessor, CLIPTokenizer, CLIPModel, get_scheduler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from tqdm.notebook import tqdm
import glob
import gc
import matplotlib.pyplot as plt
import seaborn as sns

# **Configuration Class for Files and Hyperparameters**

In [None]:
class CFG:
    """Central configuration: dataset locations, model choice and hyperparameters.

    Everything is a class attribute, so values are read as ``CFG.<name>``
    without instantiating the class.
    """

    # Root folder of the project inside the mounted Google Drive.
    DRIVE_PATH = "/content/drive/MyDrive/case-2025"

    # Dataset files and image folders for the train / validation / test splits.
    train_csv_path = os.path.join(DRIVE_PATH, "SubTaskA/Train/ocr_train_text_label.csv")
    train_image_dir = os.path.join(DRIVE_PATH, "SubTaskA/Train")
    val_image_dir = os.path.join(DRIVE_PATH, "SubTaskA/Eval/STask_A_val_img")
    val_labels_path = os.path.join(DRIVE_PATH, "SubTaskA/Eval/(index,label)val.csv")
    val_text_path = os.path.join(DRIVE_PATH, "SubTaskA/Eval/(index,text)val.csv")
    test_image_dir = os.path.join(DRIVE_PATH, "SubTaskA/Test/STask_A_test_img")
    test_csv_path = os.path.join(DRIVE_PATH, "SubTaskA/Test/(index,text)test.csv")
    # Where checkpoints and the submission file are written.
    output_dir = os.path.join(DRIVE_PATH, "output_clip_v2")

    # Model & Preprocessing Parameters
    model_name = 'openai/clip-vit-large-patch14'
    max_token_len = 77  # CLIP's fixed token length
    # Edge length (pixels) of the black placeholder image the dataset creates
    # when an image file is missing. This attribute was previously read by the
    # dataset but never defined, which raised AttributeError on that path.
    image_size = 224

    # Hyperparameters
    learning_rate_base = 1e-6  # For the CLIP model base
    learning_rate_head = 1e-5  # For our custom classifier head
    batch_size = 16
    epochs = 5
    num_workers = 2

    # Hardware & Environment: use the GPU when one is visible to PyTorch.
    device = "cuda" if torch.cuda.is_available() else "cpu"

# Make sure the folder for results and checkpoints exists before training writes to it
os.makedirs(CFG.output_dir, exist_ok=True)
print(f"--> Output will be saved to: {CFG.output_dir}")

# Initial Data Loading & Reconnaissance
try:
    # Training split: a single CSV
    train_df = pd.read_csv(CFG.train_csv_path)

    # Validation split: labels and text live in separate CSVs joined on "index"
    val_labels_df = pd.read_csv(CFG.val_labels_path)
    val_text_df = pd.read_csv(CFG.val_text_path)
    val_df = val_text_df.merge(val_labels_df, on="index")
    print("Data loaded successfully.")

except FileNotFoundError as missing_file_error:
    print("\nFATAL ERROR: A data file was not found. Please double-check your DRIVE_PATH in the CFG.")
    print(f"   - Details: {missing_file_error}")
    # Nothing below can run without the data, so abort here
    sys.exit()

# **Training and Validation DataFrames**

In [None]:
    # Print column names, dtypes and non-null counts for both splits.
    train_df.info()
    val_df.info()

# **Data Cleaning and Data Preparation**

In [None]:
# Data Cleaning: Correcting Data Types
print("Cleaning data: Converting label columns to string type...")
for frame in (train_df, val_df):
    # String labels are needed so they match the string keys of the map below
    frame['label'] = frame['label'].astype(str)
print("Label columns converted.")

# Data Preparation: Enforcing Official Label Encoding
print("\nPreparing data: Enforcing OFFICIAL competition label map...")

# CSV label value (as a string) -> name of the class folder.
label_to_folder_map = {'1': 'Hate', '0': 'No Hate'}
print(f"CSV-to-Folder Map: {label_to_folder_map}")
for frame in (train_df, val_df):
    frame['folder_name'] = frame['label'].map(label_to_folder_map)

# Class folder name -> integer target used by the model.
official_target_map = {'No Hate': 0, 'Hate': 1}
print(f"Offical Model Target Map: {official_target_map}")

for frame in (train_df, val_df):
    frame['label_encoded'] = frame['folder_name'].map(official_target_map)
print("Final target 'label_encoded' column created according to official rules.")

# Integer target -> class folder name, for turning predictions back into labels.
reverse_target_map = dict(zip(official_target_map.values(), official_target_map.keys()))

CFG.num_classes = len(official_target_map)
print(f"Number of classes: {CFG.num_classes}")

# **Data Visualization**

In [None]:
# Data Visualization: class balance (left) and text length in words (right)
plt.style.use('seaborn-v0_8-whitegrid')
fig, (label_ax, length_ax) = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: number of training samples per class folder
sns.countplot(x='folder_name', data=train_df, ax=label_ax, palette='magma', order=['No Hate', 'Hate'])
label_ax.set_title('Training Set Label Distribution')
label_ax.set_xlabel('Class')
label_ax.set_ylabel('Count')

# Right panel: whitespace-separated word count per sample (missing text counts as 0).
# NOTE: this adds a 'text_length' column to train_df.
train_df['text_length'] = train_df['text'].str.split().str.len().fillna(0)
sns.histplot(train_df['text_length'], bins=30, kde=True, ax=length_ax, color='indigo')
length_ax.set_title('Distribution of Text Lengths (Words)')
length_ax.set_xlabel('Number of Words')
# The marker is CLIP's token limit drawn on a word-count axis, so it is only a rough guide
length_ax.axvline(
    x=CFG.max_token_len,
    color='red',
    linestyle='--',
    label=f'CLIP Max Tokens ({CFG.max_token_len})',
)
length_ax.legend()

plt.suptitle('Data Characteristics Analysis', fontsize=16)
plt.show()

# **Displaying a Sample**

In [None]:
# Displaying a Sample: show one randomly chosen training example per class
print("\n Displaying a Random Sample")
# .sample(1) draws a random row from the rows of that class; .iloc[0] turns it into a Series
hate_sample = train_df[train_df['folder_name'] == 'Hate'].sample(1).iloc[0]
nohate_sample = train_df[train_df['folder_name'] == 'No Hate'].sample(1).iloc[0]

for sample in [hate_sample, nohate_sample]:
    # Training images are stored as <train_image_dir>/<class folder>/<index>
    image_path = os.path.join(CFG.train_image_dir, sample['folder_name'], sample['index'])
    try:
        image = Image.open(image_path)
        print(f"\nImage: {sample['index']}")
        print(f"Folder Name: '{sample['folder_name']}' -> Encoded as: {sample['label_encoded']} (OFFICIAL)")
        print(f"Text: '{sample['text']}'")
        plt.figure()
        plt.imshow(image)
        plt.axis('off')
        plt.show()
    except FileNotFoundError:
        # A missing file is reported and the loop continues with the next sample
        print(f"\n Could not display sample. Image not found at: {image_path}")


# **Load the CLIP Processor**

In [None]:
# Load the pretrained CLIP processor for CFG.model_name. The dataset class uses this
# one object for both tokenising text and preprocessing images.
processor = CLIPProcessor.from_pretrained(CFG.model_name)
print("Processor loaded.")

# **Dataset and DataLoaders**

In [None]:
# Custom Dataset Class
class HateSpeechDataset(Dataset):
    """
    PyTorch Dataset that pairs each image with its text for the CLIP model.

    Each item is a dict with 'input_ids', 'attention_mask' and 'pixel_values'
    tensors, plus a 'label' tensor unless the dataset was built with
    ``is_test=True``.

    Args:
        df: DataFrame with an 'index' column holding the image file name,
            an optional 'text' column, and (unless ``is_test``) a
            'label_encoded' column with the integer target.
        processor: CLIP processor used for both tokenisation and image
            preprocessing.
        image_dir: Directory searched recursively for image files.
        is_test: When True, no 'label' entry is added to the items.
    """

    # Placeholder edge length used when CFG does not define `image_size`.
    _FALLBACK_IMAGE_SIZE = 224

    def __init__(self, df, processor, image_dir, is_test=False):
        self.df = df
        self.processor = processor
        self.is_test = is_test

        # Index every file below image_dir by its base name so rows can be
        # resolved without knowing the class sub-folder. Sorting makes the
        # winner deterministic if two folders contain the same file name.
        all_image_paths = sorted(glob.glob(os.path.join(image_dir, '**/*.*'), recursive=True))
        self.image_path_map = {os.path.basename(p): p for p in all_image_paths}
        print(f"   - Dataset initialized with {len(self.df)} samples. Found {len(self.image_path_map)} images in {image_dir}.")

    def __len__(self):
        # The DataLoader needs to know the total size of the dataset.
        return len(self.df)

    @staticmethod
    def _clean_text(value):
        """Return *value* as a string, mapping missing values (None/NaN) to ''."""
        # pandas stores missing text as NaN; str(NaN) would feed the literal
        # word "nan" to the tokenizer instead of an empty caption.
        if pd.isna(value):
            return ''
        return str(value)

    def _load_image(self, image_name):
        """Return the RGB image for *image_name*, or a black placeholder if it is missing."""
        image_path = self.image_path_map.get(image_name)
        if image_path:
            # convert() returns an independent, fully loaded copy, so the file
            # handle can be closed right away instead of leaking until GC.
            with Image.open(image_path) as handle:
                return handle.convert("RGB")

        print(f"WARNING: Image '{image_name}' not found. Using a black placeholder image.")
        # CFG.image_size may not exist; fall back instead of raising AttributeError.
        size = getattr(CFG, 'image_size', self._FALLBACK_IMAGE_SIZE)
        return Image.new('RGB', (size, size), 'black')

    def __getitem__(self, idx):
        # This defines how to retrieve and process a single item by its index.
        row = self.df.iloc[idx]

        # Text Processing: tokenise to exactly CFG.max_token_len ids (pad or truncate).
        text = self._clean_text(row.get('text', ''))
        tokenized = self.processor(
            text=text,
            truncation=True,
            max_length=CFG.max_token_len,
            padding="max_length",
            return_tensors="pt"
        )
        # The processor returns a batch of one; drop that leading dimension.
        input_ids = tokenized['input_ids'].squeeze(0)
        attention_mask = tokenized['attention_mask'].squeeze(0)

        # Image Processing: the processor turns the image into 'pixel_values'.
        image = self._load_image(row['index'])
        processed_image = self.processor(images=image, return_tensors="pt")
        pixel_values = processed_image['pixel_values'].squeeze(0)

        # --- Assemble the final output dictionary ---
        item = {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'pixel_values': pixel_values
        }
        if not self.is_test:
            # int() fails loudly on a NaN label (an unmapped class) instead of
            # letting it be cast to an arbitrary integer target.
            item['label'] = torch.tensor(int(row['label_encoded']), dtype=torch.long)

        return item

# DataLoaders
print("\n Creating DataLoaders")
test_df = pd.read_csv(CFG.test_csv_path)

# One dataset per split; only the test split is built without labels
train_dataset = HateSpeechDataset(df=train_df, processor=processor, image_dir=CFG.train_image_dir)
val_dataset = HateSpeechDataset(df=val_df, processor=processor, image_dir=CFG.val_image_dir)
test_dataset = HateSpeechDataset(df=test_df, processor=processor, image_dir=CFG.test_image_dir, is_test=True)

# Settings shared by all three loaders; only the training loader shuffles
_loader_options = {'batch_size': CFG.batch_size, 'num_workers': CFG.num_workers}
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **_loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **_loader_options)

print(f" DataLoaders created.")
print(f"   - Training batches:   {len(train_loader)}")
print(f"   - Validation batches: {len(val_loader)}")
print(f"   - Test batches:       {len(test_loader)}")

# The two partial validation frames were merged into val_df and are no longer needed
del val_labels_df
del val_text_df
gc.collect()

# **Main Model Architecture**

In [None]:
# Model Architecture
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Defining the Model Architecture
class CLIPForHateSpeech(nn.Module):
    """
    Classifier built on a pre-trained CLIP backbone.

    The image and text embeddings produced by CLIP are concatenated and fed
    to a small MLP head that outputs one logit per class.

    Args:
        model_name: Identifier passed to ``CLIPModel.from_pretrained``.
        num_classes: Number of output logits of the classification head.
    """
    def __init__(self, model_name, num_classes):
        super().__init__()
        self.clip = CLIPModel.from_pretrained(model_name)
        projection_dim = self.clip.projection_dim

        # The head takes the image and text embeddings side by side,
        # hence the 2 * projection_dim input width.
        self.classifier = nn.Sequential(
            nn.Linear(2 * projection_dim, projection_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(projection_dim, num_classes)
        )

    def forward(self, input_ids, attention_mask, pixel_values):
        """Return the class logits for a batch of tokenised texts and images."""
        outputs = self.clip(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values
        )
        image_features = outputs.image_embeds
        text_features = outputs.text_embeds

        # Join the two embeddings along the feature dimension (image first, then text).
        # The original line ended with a stray '.', which was a SyntaxError.
        combined_features = torch.cat((image_features, text_features), dim=1)
        logits = self.classifier(combined_features)
        return logits


# **Instantiate the Model**

In [None]:
# Build the classifier and place it on the configured device
model = CLIPForHateSpeech(model_name=CFG.model_name, num_classes=CFG.num_classes)
model = model.to(CFG.device)
print(f"Model instantiated and moved to {CFG.device}.")

# Count only the parameters that will receive gradient updates
trainable_param_count = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f" Trainable Parameters: {trainable_param_count:,}")

# **Defining the Training Engine**

In [None]:
# Loss for multi-class classification over the logits
criterion = nn.CrossEntropyLoss()

# Two parameter groups: the CLIP backbone and the classifier head each get their own learning rate
param_groups = [
    {'params': model.clip.parameters(), 'lr': CFG.learning_rate_base},
    {'params': model.classifier.parameters(), 'lr': CFG.learning_rate_head},
]
optimizer = AdamW(param_groups)

# The scheduler is stepped once per batch, so the schedule length is epochs * batches;
# the first 10% of those steps are warm-up
num_training_steps = CFG.epochs * len(train_loader)
num_warmup_steps = int(0.1 * num_training_steps)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)
print("Loss, Optimizer, and LR Scheduler are ready.")

# Training and Validation helper functions

In [None]:
# Defining the Helper Functions
def train_one_epoch(model, loader, optimizer, criterion, scheduler, device):
    """Run one training pass over `loader` and return the mean per-batch loss.

    For every batch: forward pass, loss, backward pass, optimizer step,
    scheduler step, then gradients are cleared for the next batch.
    """
    model.train()
    running_loss = 0
    batches = tqdm(loader, desc="Training")
    for batch in batches:
        # Move the model inputs and the targets to the training device
        inputs = {
            field: batch[field].to(device)
            for field in ('pixel_values', 'input_ids', 'attention_mask')
        }
        targets = batch['label'].to(device)

        # Forward pass and loss
        logits = model(inputs['input_ids'], inputs['attention_mask'], inputs['pixel_values'])
        batch_loss = criterion(logits, targets)

        # Backward pass and parameter / learning-rate update
        batch_loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        loss_value = batch_loss.item()
        running_loss += loss_value
        batches.set_postfix(loss=f"{loss_value:.4f}")
    return running_loss / len(loader)

def validate_one_epoch(model, loader, criterion, device):
    """Evaluate `model` on `loader` without gradient tracking.

    Returns a tuple (mean per-batch loss, accuracy, macro precision,
    macro recall, macro F1).
    """
    model.eval()  # evaluation mode
    running_loss = 0
    predicted, expected = [], []
    with torch.no_grad():  # no gradients are needed for evaluation
        batches = tqdm(loader, desc="Validating")
        for batch in batches:
            inputs = {
                field: batch[field].to(device)
                for field in ('pixel_values', 'input_ids', 'attention_mask')
            }
            targets = batch['label'].to(device)

            logits = model(inputs['input_ids'], inputs['attention_mask'], inputs['pixel_values'])
            running_loss += criterion(logits, targets).item()

            # The predicted class is the index of the largest logit
            batch_predictions = torch.argmax(logits, dim=1)
            predicted.extend(batch_predictions.cpu().numpy())
            expected.extend(targets.cpu().numpy())

    avg_loss = running_loss / len(loader)
    # Key performance metrics; zero_division=0 is passed through to scikit-learn
    accuracy = accuracy_score(expected, predicted)
    macro_scores = precision_recall_fscore_support(expected, predicted, average='macro', zero_division=0)
    precision, recall, f1 = macro_scores[:3]

    return avg_loss, accuracy, precision, recall, f1

# Confirmation message printed after the two helper functions above have been defined.
print("Training and validation helper functions are defined.")

# **The Main Training Loop**

In [None]:
# The Main Training Loop
# One list per tracked metric, filled with one value per epoch
metric_names = ('train_loss', 'val_loss', 'val_accuracy', 'val_precision', 'val_recall', 'val_f1')
history = {metric: [] for metric in metric_names}
best_val_f1 = 0.0
model_path = os.path.join(CFG.output_dir, 'best_model_finetuned.pth')

print("--- Starting Full Model Fine-Tuning ---")

for epoch in range(CFG.epochs):
    epoch_number = epoch + 1
    print(f"\n===== Epoch {epoch_number}/{CFG.epochs} =====")

    # One pass over the training data, then one over the validation data
    train_loss = train_one_epoch(model, train_loader, optimizer, criterion, lr_scheduler, CFG.device)
    val_loss, val_acc, val_prec, val_rec, val_f1 = validate_one_epoch(model, val_loader, criterion, CFG.device)

    # Record this epoch's numbers (same order as metric_names)
    epoch_results = (train_loss, val_loss, val_acc, val_prec, val_rec, val_f1)
    for metric_name, metric_value in zip(metric_names, epoch_results):
        history[metric_name].append(metric_value)

    # Epoch report
    print(f"Epoch {epoch_number} Summary:")
    print(f"  - Train Loss: {train_loss:.4f}")
    print(f"  - Val Loss:   {val_loss:.4f}")
    print("  --- Validation Metrics ---")
    print(f"  - Accuracy:  {val_acc:.4f}")
    print(f"  - Precision: {val_prec:.4f}")
    print(f"  - Recall:    {val_rec:.4f}")
    print(f"  - F1-Score:  {val_f1:.4f}")

    # Checkpoint only when the validation F1 beats the best seen so far
    improved = val_f1 > best_val_f1
    if improved:
        best_val_f1 = val_f1
        print(f"New best F1-score! Saving model to {model_path}")
        torch.save(model.state_dict(), model_path)
    else:
        print("F1-score did not improve.")

print("\n Training Finished")
print(f"Best validation F1-Score achieved: {best_val_f1:.4f}")

# **Visualizing Training History**

In [None]:
# Plotting the Training History: losses on the left axis, F1 on a twin right axis
fig, ax1 = plt.subplots(figsize=(12, 5))

# Left axis: training and validation loss per epoch
ax1.plot(history['train_loss'], 'r-o', label='Train Loss')
ax1.plot(history['val_loss'], 'orange', marker='o', label='Validation Loss')
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Loss')
ax1.legend(loc='upper left')

# Right axis shares the x-axis and shows the validation F1-score
ax2 = ax1.twinx()
ax2.plot(history['val_f1'], 'b-x', label='Validation F1-Score')
ax2.set_ylabel('F1-Score')
ax2.legend(loc='upper right')

plt.title('Training and Validation History')
plt.show()

# **Exporting the test json**

In [None]:
import json

# Main Prediction Function
def predict(model_path, test_loader, device):
    """Load the saved checkpoint and predict a class for every test sample.

    A fresh ``CLIPForHateSpeech`` is built from ``CFG`` and the weights stored
    at ``model_path`` are loaded into it.

    Args:
        model_path: Path of the ``state_dict`` file written during training.
        test_loader: DataLoader over the test set. It must iterate in dataset
            order (``shuffle=False``), because sample ids are matched to
            predictions by position.
        device: Device the model and the batches are moved to.

    Returns:
        Tuple ``(all_indices, all_preds)``: the values of the 'index' column
        and the predicted class ids, in matching order.
    """
    model = CLIPForHateSpeech(
        model_name=CFG.model_name,
        num_classes=CFG.num_classes
    ).to(device)

    # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only
    # runtime (and vice versa) instead of failing during deserialisation.
    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict)
    model.eval()

    # Take the ids from the DataFrame the loader actually wraps, so they stay
    # aligned even if the loader was built from a different frame or batch size
    # than the globals. Fall back to the global test_df for datasets without `.df`.
    dataset = test_loader.dataset
    source_df = dataset.df if hasattr(dataset, 'df') else test_df
    index_column = source_df['index']

    all_preds = []
    all_indices = []
    offset = 0  # position of the first sample of the current batch

    with torch.no_grad():
        progress_bar = tqdm(test_loader, desc="Predicting on Test Set")
        for batch in progress_bar:
            pixel_values = batch['pixel_values'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask, pixel_values)
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())

            # Advance by the real batch length; the last batch may be shorter.
            batch_len = len(preds)
            all_indices.extend(index_column.iloc[offset:offset + batch_len].tolist())
            offset += batch_len

    return all_indices, all_preds

# Run the prediction process with the best checkpoint saved during training
model_path = os.path.join(CFG.output_dir, 'best_model_finetuned.pth')
indices, predictions = predict(model_path, test_loader, CFG.device)

submission_path = os.path.join(CFG.output_dir, 'submission-taskA.json')

# Write one JSON object per line: {"index": ..., "prediction": ...}
with open(submission_path, 'w') as f:
    f.writelines(
        # int() converts the prediction to a plain Python int for json.dumps
        json.dumps({"index": index, "prediction": int(prediction)}) + '\n'
        for index, prediction in zip(indices, predictions)
    )

print(f"\n Submission file created successfully at: {submission_path}")

#Preview the first few lines of the created file
print("\n--- Submission File Preview ---")
!head -n 3 "{submission_path}"
