In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under /kaggle/input.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Clip

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, roc_auc_score, roc_curve
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import CLIPModel, CLIPProcessor, AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from PIL import Image
import copy
import time
import math
from collections import Counter
from tqdm import tqdm
from torch.amp import autocast, GradScaler
import warnings
warnings.filterwarnings("ignore")

# Load datasets
# NOTE(review): the CSV paths are empty strings here — presumably placeholders
# that must be filled in before running. Later code reads the columns
# 'text', 'image' and 'class_idx' from these frames.
train_df = pd.read_csv('')
val_df = pd.read_csv('')
test_df = pd.read_csv('')

# Hyperparameters - Optimized for T4 GPU
SEED = 42
BATCH_SIZE = 4  # Per-step (micro) batch size fed to the model
GRADIENT_ACCUMULATION_STEPS = 2  # Effective batch size = 4 * 2 = 8
NUM_EPOCHS = 100
LEARNING_RATE = 1e-5  # Lower LR for CLIP fine-tuning
WEIGHT_DECAY = 0.01
DROPOUT_RATE = 0.3
MAX_LENGTH = 77  # CLIP's default max length
IMAGE_SIZE = 224  # Side length of the blank placeholder image used when loading fails
NUM_CLASSES = 3
PATIENCE = 5  # Epochs without validation-F1 improvement before early stopping
WARMUP_RATIO = 0.1  # Fraction of scheduler steps spent in linear warmup

# CONFIGURATION FLAGS
# The criterion is chosen with an if/elif chain further down: focal loss is
# checked first, so label smoothing only takes effect when USE_FOCAL_LOSS is False.
USE_WEIGHTED_LOSS = True  # Set to False for standard CrossEntropy
USE_FOCAL_LOSS = True     # Set to True to use Focal Loss
USE_LABEL_SMOOTHING = True # Set to True to use Label Smoothing
FOCAL_ALPHA = 1.0
FOCAL_GAMMA = 2.0
LABEL_SMOOTHING = 0.1
FREEZE_CLIP_LAYERS = True  # Set to False to fine-tune all CLIP layers

# Model configurations
CLIP_MODEL_NAME = "openai/clip-vit-base-patch16"
TEXT_MODEL_NAME = "microsoft/mdeberta-v3-base"  # For dedicated text encoder
# NOTE(review): empty string — presumably a placeholder for the image folder;
# image file names from the 'image' column are joined onto this directory.
IMAGE_DIR = ""
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"Using device: {DEVICE}")
print(f"Batch size: {BATCH_SIZE}, Gradient accumulation: {GRADIENT_ACCUMULATION_STEPS}")
print(f"Effective batch size: {BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS}")
print(f"Configuration: Weighted Loss: {USE_WEIGHTED_LOSS}, Focal Loss: {USE_FOCAL_LOSS}, Label Smoothing: {USE_LABEL_SMOOTHING}")

# Set seeds for reproducibility
# Seeds torch (CPU + current CUDA device) and NumPy, and forces deterministic
# cuDNN kernels. Python's `random` module is not seeded here.
torch.manual_seed(SEED)
np.random.seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Target classes
# Display names used for the classification report and confusion-matrix axes.
target_classes = ['x', 'y', 'z']

# Compute class weights for imbalanced dataset
# weight[i] = total_samples / (num_classes * count[i]), i.e. inverse-frequency
# weighting. num_classes is derived from the labels actually present in
# train_df, and the weights are indexed 0..num_classes-1: a missing index has
# a Counter count of 0 and would raise ZeroDivisionError here.
class_counts = Counter(train_df['class_idx'])
total_samples = sum(class_counts.values())
num_classes = len(class_counts)
class_weights = torch.tensor([
    total_samples / (num_classes * class_counts[i]) for i in range(num_classes)
], dtype=torch.float32).to(DEVICE)

print(f'Class distribution: {class_counts}')
print(f'Class weights: {class_weights}')

# Loss Functions
class FocalLoss(nn.Module):
    """Multi-class focal loss with optional per-class weights.

    Per-sample loss: ``alpha * w[y] * (1 - p_y) ** gamma * (-log p_y)`` where
    ``p_y`` is the softmax probability of the true class and ``w`` the optional
    class-weight vector. The result is the mean over all samples in the batch
    (samples whose label equals ``ignore_index`` contribute zero).

    Args:
        alpha: Global scaling factor applied to every sample.
        gamma: Focusing exponent; ``gamma=0`` reduces to (weighted) cross entropy.
        weight: Optional 1-D tensor with one weight per class.
        ignore_index: Label value whose samples contribute no loss.
    """

    def __init__(self, alpha=1, gamma=2, weight=None, ignore_index=-100):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.weight = weight
        self.ignore_index = ignore_index
        # Deliberately UNWEIGHTED: its output is -log(p_y), from which p_y is
        # recovered. Feeding class weights into this term would give
        # exp(-w * ce) = p_y ** w instead of p_y and distort the focal factor.
        self.ce_fn = nn.CrossEntropyLoss(ignore_index=self.ignore_index, reduction='none')

    def forward(self, preds, labels):
        """Return the scalar focal loss for logits ``preds`` and integer ``labels``."""
        ce_loss = self.ce_fn(preds, labels)          # -log(p_y); 0 for ignored labels
        pt = torch.exp(-ce_loss)                     # true-class probability p_y
        focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss

        if self.weight is not None:
            # Apply class weights after the focal modulation. Ignored labels
            # are remapped to index 0 only to keep the gather in range; their
            # loss is already zero.
            valid = labels != self.ignore_index
            safe_labels = torch.where(valid, labels, torch.zeros_like(labels))
            class_w = self.weight.to(device=focal_loss.device, dtype=focal_loss.dtype)
            focal_loss = focal_loss * class_w[safe_labels]

        return focal_loss.mean()

class LabelSmoothingCrossEntropy(nn.Module):
    """Cross entropy with uniform label smoothing and optional class weights.

    Per-sample loss: ``(1 - smoothing) * w[y] * (-log p_y)
    + smoothing * mean_c(-log p_c)``; the batch result is the plain mean.
    Class weights apply to the true-class term only.

    Args:
        smoothing: Fraction of probability mass spread uniformly over classes.
        weight: Optional 1-D tensor with one weight per class.
    """

    def __init__(self, smoothing=0.1, weight=None):
        super().__init__()
        self.smoothing = smoothing
        self.weight = weight

    def forward(self, input, target):
        """Return the scalar smoothed loss for logits ``input`` and integer ``target``."""
        log_prob = nn.functional.log_softmax(input, dim=-1)

        # nll_loss requires a 1-D weight of length C; the weight must therefore
        # be passed as-is (adding a batch dimension makes nll_loss raise).
        weight = self.weight
        if weight is not None:
            weight = weight.to(device=log_prob.device, dtype=log_prob.dtype)

        nll_loss = nn.functional.nll_loss(log_prob, target, weight=weight, reduction='none')
        smooth_loss = -log_prob.mean(dim=-1)

        loss = (1 - self.smoothing) * nll_loss + self.smoothing * smooth_loss
        return loss.mean()

# Initialize loss function based on configuration
# Precedence: focal loss > label smoothing > plain cross entropy. Only one
# criterion is built, so when USE_FOCAL_LOSS is True the USE_LABEL_SMOOTHING
# flag has no effect. Class weights are passed only if USE_WEIGHTED_LOSS is set.
if USE_FOCAL_LOSS:
    criterion = FocalLoss(alpha=FOCAL_ALPHA, gamma=FOCAL_GAMMA, 
                         weight=class_weights if USE_WEIGHTED_LOSS else None)
    print(f'Using Focal Loss with alpha={FOCAL_ALPHA}, gamma={FOCAL_GAMMA}')
elif USE_LABEL_SMOOTHING:
    criterion = LabelSmoothingCrossEntropy(smoothing=LABEL_SMOOTHING,
                                         weight=class_weights if USE_WEIGHTED_LOSS else None)
    print(f'Using Label Smoothing CrossEntropy with smoothing={LABEL_SMOOTHING}')
else:
    criterion = nn.CrossEntropyLoss(weight=class_weights if USE_WEIGHTED_LOSS else None)
    print('Using standard CrossEntropy Loss')

# Initialize processors and tokenizers
# clip_processor serves both CLIP inputs (text tokens and image pixel values);
# text_tokenizer feeds the separate text encoder and is loaded with the slow
# (non-fast) tokenizer implementation.
clip_processor = CLIPProcessor.from_pretrained(CLIP_MODEL_NAME)
text_tokenizer = AutoTokenizer.from_pretrained(TEXT_MODEL_NAME, use_fast=False)

import random
import re

class TextAugmentation:
    """Collection of lightweight, randomised text perturbations.

    All randomness comes from the standard ``random`` module. ``augment_prob``
    gates the individual swap / insertion / whitespace / case operations.
    """

    def __init__(self, augment_prob=0.3):
        self.augment_prob = augment_prob

    def random_swap(self, text, n=2):
        """Make up to ``n`` attempts at exchanging two randomly chosen words."""
        tokens = text.split()
        if len(tokens) < 2:
            return text

        for _ in range(n):
            # Each attempt fires independently with probability augment_prob.
            if random.random() >= self.augment_prob:
                continue
            first, second = random.sample(range(len(tokens)), 2)
            tokens[first], tokens[second] = tokens[second], tokens[first]

        return ' '.join(tokens)

    def random_deletion(self, text, p=0.1):
        """Drop each word independently with probability ``p``.

        Texts of two words or fewer, and the case where every word would be
        dropped, return the input unchanged.
        """
        tokens = text.split()
        if len(tokens) <= 2:
            return text

        survivors = [tok for tok in tokens if random.random() > p]
        if not survivors:
            return text
        return ' '.join(survivors)

    def random_insertion(self, text):
        """With probability ``augment_prob``, duplicate one word at a random position."""
        tokens = text.split()
        if len(tokens) < 2:
            return text

        if random.random() < self.augment_prob:
            duplicate = random.choice(tokens)
            position = random.randint(0, len(tokens))
            tokens.insert(position, duplicate)

        return ' '.join(tokens)

    def character_level_noise(self, text, p=0.02):
        """Replace alphanumeric characters, each with probability ``p``.

        Digits become random digits, lowercase letters random ``a``-``z`` and
        uppercase letters random ``A``-``Z``; other characters are left alone.
        """
        symbols = list(text)
        for pos, ch in enumerate(symbols):
            # The random draw happens for every character, before the type check.
            if not (random.random() < p and ch.isalnum()):
                continue
            if ch.isdigit():
                symbols[pos] = str(random.randint(0, 9))
            elif ch.islower():
                symbols[pos] = chr(random.randint(ord('a'), ord('z')))
            elif ch.isupper():
                symbols[pos] = chr(random.randint(ord('A'), ord('Z')))

        return ''.join(symbols)

    def add_whitespace_noise(self, text):
        """With probability ``augment_prob``, pad some words with 1-3 extra spaces."""
        if random.random() >= self.augment_prob:
            return text

        padded = []
        for tok in text.split():
            if random.random() < 0.1:
                tok += ' ' * random.randint(1, 3)
            padded.append(tok)
        return ' '.join(padded)

    def case_variation(self, text):
        """With probability ``augment_prob``, re-case roughly one word in five."""
        if random.random() >= self.augment_prob:
            return text

        recased = []
        for tok in text.split():
            if random.random() < 0.2:
                # The chosen label doubles as the str method name to apply.
                style = random.choice(['upper', 'lower', 'title'])
                tok = getattr(tok, style)()
            recased.append(tok)
        return ' '.join(recased)

    def augment_text(self, text, training=True):
        """Apply one or two randomly chosen perturbations about half the time.

        Returns ``text`` untouched when ``training`` is false.
        """
        if not training:
            return text
        if random.random() > 0.5:  # roughly half of the samples stay as they are
            return text

        pool = [
            self.random_swap,
            self.random_deletion,
            self.random_insertion,
            self.character_level_noise,
            self.add_whitespace_noise,
            self.case_variation,
        ]

        how_many = random.randint(1, 2)
        result = text
        for transform in random.sample(pool, how_many):
            result = transform(result)

        return result

# Modified MultimodalDataset class
class MultimodalDataset(Dataset):
    """Dataset yielding CLIP text/image inputs, text-encoder inputs and a label.

    Each row of ``dataframe`` must provide the columns ``'text'``, ``'image'``
    (file name relative to ``image_dir``) and ``'class_idx'``.

    Args:
        dataframe: Source rows, accessed positionally with ``iloc``.
        image_dir: Directory that image file names are joined onto.
        clip_processor: Callable used for both CLIP text and image processing.
        text_tokenizer: Tokenizer for the dedicated text encoder.
        max_length: Stored on the instance.
            NOTE(review): not used by ``__getitem__``; the tokenisation lengths
            there are fixed at 77 (CLIP) and 256 (text encoder).
        is_training: When true, text augmentation is applied to each sample.
    """

    def __init__(self, dataframe, image_dir, clip_processor, text_tokenizer, max_length, is_training=False):
        self.dataframe = dataframe
        self.image_dir = image_dir
        self.clip_processor = clip_processor
        self.text_tokenizer = text_tokenizer
        self.max_length = max_length
        self.is_training = is_training
        self.text_augmenter = TextAugmentation(augment_prob=0.3)

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return a dict of tensors for sample ``idx``.

        Keys: ``clip_input_ids``, ``clip_attention_mask``, ``clip_pixel_values``,
        ``text_input_ids``, ``text_attention_mask``, ``label``.
        """
        row = self.dataframe.iloc[idx]

        text = str(row['text'])

        # Apply text augmentation only during training
        if self.is_training:
            text = self.text_augmenter.augment_text(text, training=True)

        # CLIP text processing (padded/truncated to 77 tokens)
        clip_inputs = self.clip_processor(
            text=[text],
            images=None,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=77
        )

        # Dedicated text encoder processing (padded/truncated to 256 tokens)
        text_encoding = self.text_tokenizer(
            text,
            add_special_tokens=True,
            max_length=256,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt"
        )

        img_name = row['image']
        img_path = os.path.join(self.image_dir, img_name)

        try:
            # The context manager closes the underlying file as soon as the
            # pixel data has been copied by convert(); otherwise the handle
            # stays open until garbage collection.
            with Image.open(img_path) as img:
                image = img.convert('RGB')
        except Exception as exc:
            # Best-effort fallback kept from the original behaviour, but made
            # visible: a blank image silently standing in for real data would
            # otherwise go unnoticed.
            print(f"Warning: could not load image '{img_path}' ({exc}); using a blank placeholder")
            image = Image.new('RGB', (IMAGE_SIZE, IMAGE_SIZE), color='white')

        clip_image_inputs = self.clip_processor(text=None, images=image, return_tensors="pt")

        label = row['class_idx']

        return {
            'clip_input_ids': clip_inputs['input_ids'].flatten(),
            'clip_attention_mask': clip_inputs['attention_mask'].flatten(),
            'clip_pixel_values': clip_image_inputs['pixel_values'].squeeze(0),
            'text_input_ids': text_encoding['input_ids'].flatten(),
            'text_attention_mask': text_encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

class CLIPMultimodalClassifier(nn.Module):
    """Classifier fusing CLIP text+image embeddings with a separate text encoder.

    CLIP's projected text and image embeddings are concatenated and projected
    to 512 dims; the text encoder's mask-averaged last hidden state is also
    projected to 512 dims. Both are layer-normalised, concatenated and passed
    through an MLP (1024 -> 768 -> 384 -> 192) followed by a linear classifier.

    Args:
        clip_model_name: Pretrained CLIP checkpoint name.
        text_model_name: Pretrained text-encoder checkpoint name.
        num_classes: Number of output logits.
        dropout_rate: Dropout used in the fusion MLP (halved in its last stage).

    When the module-level flag ``FREEZE_CLIP_LAYERS`` is true, every CLIP
    parameter except the two projection layers is frozen.
    """

    def __init__(self, clip_model_name, text_model_name, num_classes, dropout_rate=0.3):
        super().__init__()

        # CLIP model for joint text-image representation
        self.clip_model = CLIPModel.from_pretrained(clip_model_name)
        clip_hidden_size = self.clip_model.config.projection_dim

        # Dedicated text encoder for better text understanding
        self.text_encoder = AutoModel.from_pretrained(text_model_name)
        text_hidden_size = self.text_encoder.config.hidden_size

        # Freeze CLIP layers if specified
        if FREEZE_CLIP_LAYERS:
            print("Freezing CLIP layers...")
            for param in self.clip_model.parameters():
                param.requires_grad = False
            # Only train the projection layers
            for param in self.clip_model.text_projection.parameters():
                param.requires_grad = True
            for param in self.clip_model.visual_projection.parameters():
                param.requires_grad = True
        else:
            print("Fine-tuning all CLIP layers...")

        # Feature fusion layers
        self.clip_feature_proj = nn.Linear(clip_hidden_size * 2, 512)  # CLIP text + image
        self.text_feature_proj = nn.Linear(text_hidden_size, 512)

        # Multi-scale fusion
        self.fusion_layer = nn.Sequential(
            nn.Linear(512 + 512, 768),  # CLIP features + text features
            nn.GELU(),
            nn.Dropout(dropout_rate),
            nn.Linear(768, 384),
            nn.GELU(),
            nn.Dropout(dropout_rate),
            nn.Linear(384, 192),
            nn.GELU(),
            nn.Dropout(dropout_rate / 2)
        )

        # Classification head
        self.classifier = nn.Linear(192, num_classes)

        # Layer normalization
        self.clip_ln = nn.LayerNorm(512)
        self.text_ln = nn.LayerNorm(512)

    def forward(self, clip_input_ids, clip_attention_mask, clip_pixel_values,
                text_input_ids, text_attention_mask):
        """Return class logits for a batch of paired text/image inputs."""
        # CLIP multimodal features
        clip_outputs = self.clip_model(
            input_ids=clip_input_ids,
            attention_mask=clip_attention_mask,
            pixel_values=clip_pixel_values,
            return_dict=True
        )

        # Concatenate CLIP's projected text and image embeddings
        clip_combined = torch.cat(
            [clip_outputs.text_embeds, clip_outputs.image_embeds], dim=-1
        )
        clip_features = self.clip_ln(self.clip_feature_proj(clip_combined))

        # Dedicated text encoder features. Only last_hidden_state is consumed,
        # so per-layer hidden states are not requested (saves activation memory).
        text_outputs = self.text_encoder(
            input_ids=text_input_ids,
            attention_mask=text_attention_mask
        )

        # Mask-aware mean pooling over the sequence dimension. The token count
        # is clamped to at least 1 so an all-padding row yields zeros instead
        # of NaN from a division by zero.
        text_hidden_states = text_outputs.last_hidden_state
        text_mask = text_attention_mask.unsqueeze(-1).float()
        token_counts = text_mask.sum(dim=1).clamp(min=1.0)
        text_pooled = (text_hidden_states * text_mask).sum(dim=1) / token_counts
        text_features = self.text_ln(self.text_feature_proj(text_pooled))

        # Combine all features
        combined_features = torch.cat([clip_features, text_features], dim=-1)

        # Final fusion and classification
        fused_features = self.fusion_layer(combined_features)
        logits = self.classifier(fused_features)

        return logits

# Create datasets
# Only the training split enables text augmentation (is_training=True).
train_dataset = MultimodalDataset(train_df, IMAGE_DIR, clip_processor, text_tokenizer, MAX_LENGTH, is_training=True)
val_dataset = MultimodalDataset(val_df, IMAGE_DIR, clip_processor, text_tokenizer, MAX_LENGTH, is_training=False)
test_dataset = MultimodalDataset(test_df, IMAGE_DIR, clip_processor, text_tokenizer, MAX_LENGTH, is_training=False)

# Create data loaders
# Only the training loader shuffles; validation and test keep dataframe order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

# Initialize model
model = CLIPMultimodalClassifier(CLIP_MODEL_NAME, TEXT_MODEL_NAME, NUM_CLASSES, DROPOUT_RATE)
model = model.to(DEVICE)

# Calculate model parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

# Optimizer and scheduler
# All parameters are handed to AdamW, including any with requires_grad=False.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

# The scheduler is stepped once per optimizer update, so the step budget is the
# total number of batches over all epochs divided by the accumulation factor.
# It covers the full NUM_EPOCHS even though early stopping may end training sooner.
total_steps = len(train_loader) * NUM_EPOCHS // GRADIENT_ACCUMULATION_STEPS
warmup_steps = int(WARMUP_RATIO * total_steps)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

# Gradient scaler for the mixed-precision (autocast) training steps.
scaler = GradScaler()

def evaluate_model(model, data_loader, device, name):
    """Evaluate ``model`` on ``data_loader`` and return loss and metrics.

    Uses the module-level ``criterion`` for the loss.

    Args:
        model: Model called with the five CLIP / text-encoder input tensors.
        data_loader: Loader yielding the batch dicts produced by the dataset.
        device: Device the batch tensors are moved to.
        name: Label shown on the progress bar.

    Returns:
        Tuple ``(avg_loss, accuracy, macro_f1, auc, all_preds, all_labels)``.
        ``auc`` is the one-vs-rest ROC-AUC computed from softmax probabilities,
        or ``0.0`` when scikit-learn cannot compute it (e.g. a class is absent
        from the labels).
    """
    model.eval()
    all_preds = []
    all_labels = []
    all_probs = []
    total_loss = 0

    with torch.no_grad():
        for batch in tqdm(data_loader, desc=name):
            clip_input_ids = batch['clip_input_ids'].to(device)
            clip_attention_mask = batch['clip_attention_mask'].to(device)
            clip_pixel_values = batch['clip_pixel_values'].to(device)
            text_input_ids = batch['text_input_ids'].to(device)
            text_attention_mask = batch['text_attention_mask'].to(device)
            labels = batch['label'].to(device)

            with autocast('cuda'):
                outputs = model(clip_input_ids, clip_attention_mask, clip_pixel_values,
                              text_input_ids, text_attention_mask)
                loss = criterion(outputs, labels)

            # Weight the batch-mean loss by batch size so the final average is per sample.
            total_loss += loss.item() * clip_input_ids.size(0)

            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            # Keep continuous scores for ROC-AUC: hard (one-hot) predictions
            # discard the ranking information that AUC measures. Softmax is
            # taken in float32 so the rows sum to 1 as scikit-learn requires.
            all_probs.append(torch.softmax(outputs.float(), dim=1).cpu().numpy())

    avg_loss = total_loss / len(data_loader.dataset)
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='macro')

    try:
        scores = np.concatenate(all_probs, axis=0)
        if scores.shape[1] == 2:
            # Binary targets expect a 1-D score for the positive class.
            scores = scores[:, 1]
        auc = roc_auc_score(all_labels, scores, multi_class='ovr')
    except ValueError:
        auc = 0.0

    return avg_loss, accuracy, f1, auc, all_preds, all_labels

# Training loop
# Mixed-precision training with gradient accumulation; the weights with the
# best validation macro-F1 are kept, and training stops after PATIENCE epochs
# without improvement.
print("Starting training...")
best_f1 = 0.0
no_improve_epochs = 0
best_model_wts = copy.deepcopy(model.state_dict())

for epoch in range(NUM_EPOCHS):
    print(f'\nEpoch {epoch+1}/{NUM_EPOCHS}')
    print('-' * 50)

    # Training phase
    model.train()
    train_loss = 0.0
    train_preds = []
    train_labels = []

    num_batches = len(train_loader)
    progress_bar = tqdm(train_loader, desc="Training")

    for step, batch in enumerate(progress_bar):
        clip_input_ids = batch['clip_input_ids'].to(DEVICE)
        clip_attention_mask = batch['clip_attention_mask'].to(DEVICE)
        clip_pixel_values = batch['clip_pixel_values'].to(DEVICE)
        text_input_ids = batch['text_input_ids'].to(DEVICE)
        text_attention_mask = batch['text_attention_mask'].to(DEVICE)
        labels = batch['label'].to(DEVICE)

        with autocast('cuda'):
            outputs = model(clip_input_ids, clip_attention_mask, clip_pixel_values,
                          text_input_ids, text_attention_mask)
            # Scale so that the accumulated gradient is an average over the window.
            loss = criterion(outputs, labels) / GRADIENT_ACCUMULATION_STEPS

        scaler.scale(loss).backward()

        # Gradient accumulation: update when the window is full, and also on
        # the final batch of the epoch. Without the second condition, leftover
        # gradients (when the batch count is not a multiple of the window)
        # would spill into the next epoch's first update and be discarded
        # entirely after the last epoch.
        window_full = (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0
        last_batch = (step + 1) == num_batches
        if window_full or last_batch:
            # Unscale first so clipping sees the true gradient norms.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
            optimizer.zero_grad()

        # Track metrics (undo the accumulation scaling; weight by batch size)
        train_loss += loss.item() * GRADIENT_ACCUMULATION_STEPS * clip_input_ids.size(0)
        _, preds = torch.max(outputs, 1)
        train_preds.extend(preds.cpu().numpy())
        train_labels.extend(labels.cpu().numpy())

        progress_bar.set_postfix({
            "batch_loss": loss.item() * GRADIENT_ACCUMULATION_STEPS,
            "lr": scheduler.get_last_lr()[0]
        })

    # Calculate training metrics
    train_loss = train_loss / len(train_loader.dataset)
    train_acc = accuracy_score(train_labels, train_preds)
    train_f1 = f1_score(train_labels, train_preds, average='macro')

    # Validation phase
    val_loss, val_acc, val_f1, val_auc, _, _ = evaluate_model(model, val_loader, DEVICE, 'Validation')

    print(f'Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | Train F1: {train_f1:.4f}')
    print(f'Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f} | Val F1: {val_f1:.4f} | Val AUC: {val_auc:.4f}')

    # Early stopping on validation macro-F1
    if val_f1 > best_f1:
        print(f'Validation F1 improved from {best_f1:.4f} to {val_f1:.4f}')
        best_f1 = val_f1
        best_model_wts = copy.deepcopy(model.state_dict())
        no_improve_epochs = 0
    else:
        no_improve_epochs += 1
        print(f'No improvement for {no_improve_epochs} epochs')

    if no_improve_epochs >= PATIENCE:
        print(f'Early stopping triggered after {epoch+1} epochs')
        break
        
print(f'\nBest Validation F1: {best_f1:.4f}')

# Load best model weights
# Everything below (test metrics, plots, checkpoint) uses the best-F1 weights,
# not the weights from the final epoch.
model.load_state_dict(best_model_wts)

# Test evaluation
# The test loss returned by evaluate_model is discarded.
_, test_acc, test_f1, test_auc, test_preds, test_labels = evaluate_model(model, test_loader, DEVICE, 'Testing')

print('\n' + '='*60)
print('CLIP MULTIMODAL CLASSIFIER RESULTS')
print('='*60)
print("\nTest Classification Report:")
print(classification_report(test_labels, test_preds, target_names=target_classes, digits=4))
print(f'\nTest Accuracy: {test_acc:.4f}')
print(f'Test F1-Score: {test_f1:.4f}')
print(f'Test ROC-AUC: {test_auc:.4f}')

# Visualizations
# One figure with two panels: confusion matrix (left) and per-class ROC (right).
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Confusion Matrix
cm = confusion_matrix(test_labels, test_preds)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=target_classes, yticklabels=target_classes, ax=axes[0])
axes[0].set_xlabel('Predicted')
axes[0].set_ylabel('True')
axes[0].set_title('Confusion Matrix')

# ROC Curves
# One-vs-rest curve per class. The "scores" passed to roc_curve are boolean
# hard predictions (test_preds == i), so each curve has a single operating
# point between (0, 0) and (1, 1) rather than a full threshold sweep.
for i in range(NUM_CLASSES):
    if len(np.unique(np.array(test_labels) == i)) > 1:  # Check if both classes exist
        fpr, tpr, _ = roc_curve(np.array(test_labels) == i, np.array(test_preds) == i)
        auc_score = roc_auc_score(np.array(test_labels) == i, np.array(test_preds) == i)
        axes[1].plot(fpr, tpr, label=f'{target_classes[i]} (AUC = {auc_score:.4f})')

# Diagonal reference line for a chance-level classifier.
axes[1].plot([0, 1], [0, 1], 'r--', alpha=0.5)
axes[1].set_xlabel('False Positive Rate')
axes[1].set_ylabel('True Positive Rate')
axes[1].set_title('ROC Curves')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('clip_multimodal_results.png', dpi=300, bbox_inches='tight')
plt.show()

# Save the model
# model_state_dict holds the best-validation weights loaded above, while
# optimizer_state_dict reflects the optimizer as of the last training step, so
# the two may come from different epochs.
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'config': {
        'clip_model_name': CLIP_MODEL_NAME,
        'text_model_name': TEXT_MODEL_NAME,
        'num_classes': NUM_CLASSES,
        'dropout_rate': DROPOUT_RATE,
        'max_length': MAX_LENGTH,
        'image_size': IMAGE_SIZE,
        'use_weighted_loss': USE_WEIGHTED_LOSS,
        'use_focal_loss': USE_FOCAL_LOSS,
        'use_label_smoothing': USE_LABEL_SMOOTHING
    },
    'best_f1': best_f1,
    'target_classes': target_classes
}, 'clip_multimodal_classifier.pt')

print(f"\nModel saved as 'clip_multimodal_classifier.pt'")
print(f"Best validation F1-score: {best_f1:.4f}")

# Memory cleanup
# Releases cached, unused GPU memory held by PyTorch's allocator.
torch.cuda.empty_cache()
print("\nTraining completed successfully!")