## PART C

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
import pandas as pd
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import os
import re

# Run on the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

2025-04-14 10:53:22.294985: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744628002.696424      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744628002.814967      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Using device: cuda


### Function: `load_and_process_csv_data`

Processes two CSV files (Custom and SmolVLM captions) into a unified dataset.

#### **Inputs**:
- `custom_model_file`: Path to custom model CSV.
- `smolvlm_file`: Path to SmolVLM CSV.

#### **Outputs**:
- Returns a `DataFrame` with:
  - `image_id`, `occlusion_level`, `original_caption`, `generated_caption`, `model_name`.


In [2]:
def load_and_process_csv_data(custom_model_file, smolvlm_file):
    """Combine the custom-model and SmolVLM caption CSVs into one long-format table.

    Args:
        custom_model_file: Path to a CSV with the columns ``filename``,
            ``occlusion_level``, ``predicted_caption`` and ``reference_caption``.
        smolvlm_file: Path to a CSV with the columns ``filename``,
            ``occlusion_level`` and ``smolvlm_caption``.

    Returns:
        A ``DataFrame`` with the columns ``image_id``, ``occlusion_level`` (cast
        to ``str``), ``original_caption``, ``generated_caption`` and
        ``model_name`` (``'SmolVLM'`` or ``'Custom'``). All SmolVLM rows come
        first, followed by all custom-model rows. A custom-model row is emitted
        for every row of the custom CSV; a SmolVLM row is emitted only when a
        SmolVLM caption exists for the same ``(filename, occlusion_level)``.
    """
    custom_df = pd.read_csv(custom_model_file)
    smolvlm_df = pd.read_csv(smolvlm_file)

    def extract_caption(text):
        # An empty CSV cell is read by pandas as NaN (a float). Passing that to
        # re.search raises TypeError, so non-string values are returned as-is
        # and get filtered out by the notna() check further down.
        if not isinstance(text, str):
            return text
        # Keep only the first line that follows the "Assistant:" marker.
        match = re.search(r"Assistant:(.*?)($|\n)", text, re.DOTALL)
        if match:
            return match.group(1).strip()
        return text

    smolvlm_df['caption'] = smolvlm_df['smolvlm_caption'].apply(extract_caption)

    smolvlm_df = smolvlm_df[['filename', 'occlusion_level', 'caption']].rename(
        columns={'caption': 'smolvlm_caption'}
    )
    custom_df = custom_df[
        ['filename', 'occlusion_level', 'predicted_caption', 'reference_caption']
    ].rename(
        columns={'predicted_caption': 'custom_caption', 'reference_caption': 'reference'}
    )

    # Left join: every custom-model row is kept; the SmolVLM caption is NaN when
    # there is no matching (filename, occlusion_level) pair.
    merged_df = pd.merge(
        custom_df,
        smolvlm_df,
        on=['filename', 'occlusion_level'],
        how='left'
    )

    output_columns = [
        'image_id', 'occlusion_level', 'original_caption', 'generated_caption', 'model_name'
    ]

    def to_long_format(frame, caption_column, model_name):
        # Reshape one model's captions into the unified output schema.
        long_frame = frame[['filename', 'occlusion_level', 'reference', caption_column]].rename(
            columns={
                'filename': 'image_id',
                'reference': 'original_caption',
                caption_column: 'generated_caption',
            }
        )
        long_frame['model_name'] = model_name
        return long_frame[output_columns]

    has_smolvlm_caption = merged_df['smolvlm_caption'].notna()
    smolvlm_rows = to_long_format(merged_df[has_smolvlm_caption], 'smolvlm_caption', 'SmolVLM')
    custom_rows = to_long_format(merged_df, 'custom_caption', 'Custom')

    # SmolVLM rows first, then custom rows, with a fresh 0..N-1 index.
    processed_data = pd.concat([smolvlm_rows, custom_rows], ignore_index=True)

    processed_data['occlusion_level'] = processed_data['occlusion_level'].astype(str)

    print(f"Total processed data points: {len(processed_data)}")
    print(f"SmolVLM data points: {len(smolvlm_rows)}")
    print(f"Custom model data points: {len(custom_rows)}")

    return processed_data

### Class: `CaptionDataset`

#### **Purpose**:
Prepares data for training/testing by tokenizing input text and assigning labels.

#### **Inputs**:
- `dataframe`: Data containing captions and occlusion levels.
- `tokenizer`: Pre-trained tokenizer (e.g., BERT).
- `max_length`: Maximum token length (default: 128).

#### **Outputs**:
- Returns tokenized input (`input_ids`, `attention_mask`) and label:
  - `label = 0` for `SmolVLM`, `1` for `Custom`.


In [3]:
class CaptionDataset(Dataset):
    """Torch dataset that turns caption rows into tokenized classifier inputs.

    Each item is built from one row of ``dataframe``: the reference caption,
    the generated caption and the occlusion level are joined with ``[SEP]``
    into a single string, which is then passed to ``tokenizer``.

    Args:
        dataframe: Table with the columns ``original_caption``,
            ``generated_caption``, ``occlusion_level`` and ``model_name``.
        tokenizer: Callable that accepts the text plus the keyword arguments
            used in ``__getitem__`` and returns a mapping with ``input_ids``
            and ``attention_mask`` tensors whose first dimension has size 1.
        max_length: Length every sequence is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_length=128):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of rows (samples) in the underlying dataframe."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``label`` for row ``idx``.

        ``label`` is 0 when ``model_name`` is ``'SmolVLM'`` and 1 otherwise.
        """
        # Positional lookup, so the dataframe's index labels do not matter.
        row = self.dataframe.iloc[idx]

        input_text = f"{row['original_caption']} [SEP] {row['generated_caption']} [SEP] {row['occlusion_level']}"

        encoding = self.tokenizer(
            input_text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        label = 0 if row['model_name'] == 'SmolVLM' else 1

        # squeeze(0) removes only the leading batch dimension added by
        # return_tensors='pt'. A bare squeeze() would also drop the sequence
        # dimension when max_length == 1, producing 0-d tensors.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long)
        }

### Class: `CaptionClassifier`

#### **Purpose**:
A BERT-based classifier for distinguishing between `SmolVLM` and `Custom` captions.

#### **Architecture**:
1. **Encoder**: Pre-trained BERT (`bert-base-uncased`).
2. **Dropout**: Regularization with a 0.1 dropout rate.
3. **Classifier**: Two fully connected layers with ReLU activation.

#### **Forward Pass**:
1. Extracts `pooler_output` from BERT.
2. Applies dropout and passes through the classifier to generate logits.

In [4]:
class CaptionClassifier(nn.Module):
    """BERT encoder with a two-layer feed-forward head that emits class logits.

    Args:
        pretrained_model_name: Name passed to ``BertModel.from_pretrained``.
        num_classes: Width of the final linear layer (number of logits).
    """

    def __init__(self, pretrained_model_name='bert-base-uncased', num_classes=2):
        super().__init__()

        self.bert = BertModel.from_pretrained(pretrained_model_name)
        encoder_width = self.bert.config.hidden_size

        # Dropout applied to the pooled encoder output before the head.
        self.dropout = nn.Dropout(0.1)

        # Head: hidden_size -> 512 -> num_classes, with ReLU and dropout between.
        head_layers = [
            nn.Linear(encoder_width, 512),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(512, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask):
        """Return the classifier logits for a batch of tokenized inputs."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        regularized = self.dropout(encoded.pooler_output)
        return self.classifier(regularized)

### Function: `train_classifier`

#### **Purpose**:
Trains the `CaptionClassifier` model using a given dataset.

#### **Inputs**:
- `model`: The classifier model.
- `dataloader`: DataLoader for training data.
- `optimizer`: Optimizer for updating model weights.
- `criterion`: Loss function (e.g., CrossEntropyLoss).
- `device`: Device to run the model (CPU/GPU).
- `epochs`: Number of training epochs.

#### **Outputs**:
- Prints training progress and metrics for each epoch.

In [5]:
def train_classifier(model, dataloader, optimizer, criterion, device, epochs):
    """Train ``model`` for ``epochs`` passes over ``dataloader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns per-class logits.
        dataloader: Iterable of batches; each batch is a mapping with the keys
            ``input_ids``, ``attention_mask`` and ``label``.
        optimizer: Optimizer that updates the parameters of ``model``.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device the batch tensors are moved to.
        epochs: Number of passes over ``dataloader``.

    Returns:
        A list with one ``{'loss': ..., 'accuracy': ...}`` dict per epoch, where
        ``loss`` is the mean of the per-batch losses and ``accuracy`` the
        fraction of correctly classified samples. Both are ``nan`` for an epoch
        in which the dataloader produced no batches. The same numbers are
        printed after each epoch.
    """
    history = []

    for epoch in range(epochs):
        # Set per epoch so training still uses train mode if the model was
        # switched to eval mode between calls or epochs.
        model.train()

        print(f"Epoch {epoch+1}/{epochs}")
        running_loss = 0.0
        correct_predictions = 0
        total_predictions = 0
        batches_seen = 0

        progress_bar = tqdm(dataloader, desc="Training", leave=False)

        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            loss = criterion(outputs, labels)

            loss.backward()

            optimizer.step()

            running_loss += loss.item()
            batches_seen += 1

            _, predicted = torch.max(outputs, dim=1)
            correct_predictions += (predicted == labels).sum().item()
            total_predictions += labels.size(0)

            # Guarded so a batch with zero samples cannot divide by zero.
            running_accuracy = correct_predictions / total_predictions if total_predictions else float('nan')
            progress_bar.set_postfix({'loss': loss.item(), 'accuracy': running_accuracy})

        # An empty dataloader used to raise ZeroDivisionError here; report nan
        # instead so the caller sees that nothing was trained.
        epoch_loss = running_loss / batches_seen if batches_seen else float('nan')
        epoch_accuracy = correct_predictions / total_predictions if total_predictions else float('nan')

        print(f"Epoch {epoch+1}/{epochs} - Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}")
        history.append({'loss': epoch_loss, 'accuracy': epoch_accuracy})

    return history


### Function: `evaluate_classifier`

#### **Purpose**:
Evaluates the `CaptionClassifier` model on a test dataset.

#### **Inputs**:
- `model`: The trained classifier model.
- `dataloader`: DataLoader for evaluation data.
- `device`: Device to run the model (CPU/GPU).

#### **Outputs**:
- Prints evaluation metrics (accuracy and macro-averaged precision, recall, and F1 score).
- Saves a confusion-matrix heatmap to `confusion_matrix.png`.
- Returns a dictionary of metrics.

In [6]:
def evaluate_classifier(model, dataloader, device):
    """Score ``model`` on ``dataloader`` and report classification metrics.

    Runs the model in eval mode without gradients, computes accuracy plus
    macro-averaged precision, recall and F1, writes a confusion-matrix heatmap
    to ``confusion_matrix.png`` and prints the four metrics.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        dataloader: Iterable of batches with the keys ``input_ids``,
            ``attention_mask`` and ``label``.
        device: Device the batch tensors are moved to.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    model.eval()

    predicted_classes, true_classes = [], []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            token_ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['label'].to(device)

            logits = model(input_ids=token_ids, attention_mask=mask)

            # Index of the largest logit per sample is the predicted class.
            batch_predictions = torch.max(logits, dim=1).indices

            predicted_classes.extend(batch_predictions.cpu().numpy())
            true_classes.extend(targets.cpu().numpy())

    accuracy = accuracy_score(true_classes, predicted_classes)
    precision, recall, f1 = precision_recall_fscore_support(
        true_classes, predicted_classes, average='macro'
    )[:3]
    cm = confusion_matrix(true_classes, predicted_classes)

    # Render the confusion matrix to disk; the figure is closed afterwards.
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    ax.set_title('Confusion Matrix')
    fig.savefig('confusion_matrix.png')
    plt.close(fig)

    metrics = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall: {metrics['recall']:.4f}")
    print(f"F1 Score: {metrics['f1']:.4f}")

    return metrics

### Function: `main`

#### **Purpose**:
This function runs the entire process: loading data, training the model, validating it, and testing its performance.

#### **Steps**:
1. **Load and Process Data**:
   - Reads CSV files for `Custom` and `SmolVLM` models.
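   - Splits the data by image ID into training (70%), validation (10%), and test (the remaining ~20%) sets, so all captions of one image end up in the same split.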
   - Prepares datasets and dataloaders.

2. **Train the Model**:
   - Initializes the `CaptionClassifier`.
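   - Trains the model on the training data for 3 initial epochs without validation.
   - Continues for up to 7 more epochs, validating after each one, and stops early once the validation macro-F1 score has not improved for 3 consecutive epochs.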

3. **Evaluate the Model**:
   - Loads the best model saved during training.
   - Tests the model on the test set and calculates metrics like accuracy, precision, recall, and F1 score.
   - Saves the metrics and predictions to CSV files.

4. **Analyze Performance**:
   - Calculates accuracy for different occlusion levels.
   - Creates a bar chart showing accuracy vs. occlusion level and saves it.

#### **Outputs**:
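- Best model weights saved in `best_classifier_model.pt`.
- Confusion matrix saved as `confusion_matrix.png`.
- Metrics saved in `part_c_metrics.csv`.
- Predictions saved in `part_c_predictions.csv`.
- Accuracy chart saved as `occlusion_level_performance.png`.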


In [7]:
def _collect_predictions(model, dataloader, device):
    """Run ``model`` over ``dataloader`` in eval mode without gradients.

    Returns ``(predicted, actual)``: two lists of integer class ids, in the
    order in which the dataloader yields the samples.
    """
    model.eval()
    predicted, actual = [], []
    with torch.no_grad():
        for batch in dataloader:
            logits = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )
            _, batch_predictions = torch.max(logits, dim=1)
            predicted.extend(batch_predictions.cpu().tolist())
            actual.extend(batch['label'].tolist())
    return predicted, actual


def main(custom_model_file="/kaggle/input/dl2partbresults/partB_custom.csv",
         smolvlm_file="/kaggle/input/dl2partbresults/output_smolvlm_partb.csv"):
    """Run the full Part C pipeline: load data, train, validate, test, report.

    Uses the module-level ``device``.

    Args:
        custom_model_file: Path to the custom-model caption CSV.
        smolvlm_file: Path to the SmolVLM caption CSV.

    Side effects (files written to the working directory):
        ``best_classifier_model.pt``, ``confusion_matrix.png`` (written by
        ``evaluate_classifier``), ``part_c_metrics.csv``,
        ``part_c_predictions.csv`` and ``occlusion_level_performance.png``.
    """
    seed = 42
    checkpoint_path = 'best_classifier_model.pt'
    warmup_epochs = 3          # epochs trained before validation monitoring starts
    max_monitored_epochs = 7   # upper bound on epochs run with early stopping
    patience = 3               # monitored epochs without improvement before stopping
    batch_size = 16

    print("Loading and processing data...")
    processed_data = load_and_process_csv_data(custom_model_file, smolvlm_file)

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Split by image id so every caption of an image lands in the same split.
    image_ids = processed_data['image_id'].unique()
    np.random.seed(seed)
    np.random.shuffle(image_ids)

    # Seed torch as well, so weight init, dropout and DataLoader shuffling are
    # repeatable, not just the NumPy-driven split.
    torch.manual_seed(seed)

    n_images = len(image_ids)
    train_size = int(0.7 * n_images)
    val_size = int(0.1 * n_images)

    train_image_ids = image_ids[:train_size]
    val_image_ids = image_ids[train_size:train_size+val_size]
    test_image_ids = image_ids[train_size+val_size:]

    train_df = processed_data[processed_data['image_id'].isin(train_image_ids)]
    val_df = processed_data[processed_data['image_id'].isin(val_image_ids)]
    test_df = processed_data[processed_data['image_id'].isin(test_image_ids)]

    train_dataset = CaptionDataset(train_df, tokenizer)
    val_dataset = CaptionDataset(val_df, tokenizer)
    test_dataset = CaptionDataset(test_df, tokenizer)

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    print(f"Training set size: {len(train_dataset)}")
    print(f"Validation set size: {len(val_dataset)}")
    print(f"Test set size: {len(test_dataset)}")

    # Count labels from the model_name column (label 0 == 'SmolVLM', 1 otherwise)
    # instead of tokenizing every sample only to read its label.
    for split_name, split_df in (("Training", train_df), ("Validation", val_df), ("Test", test_df)):
        n_smolvlm = int((split_df['model_name'] == 'SmolVLM').sum())
        n_custom = len(split_df) - n_smolvlm
        print(f"{split_name} set label distribution: SmolVLM={n_smolvlm}, Custom={n_custom}")

    model = CaptionClassifier().to(device)

    optimizer = optim.AdamW(model.parameters(), lr=2e-5)
    criterion = nn.CrossEntropyLoss()

    print("Training classifier...")
    train_classifier(model, train_dataloader, optimizer, criterion, device, epochs=warmup_epochs)

    # Start below any attainable F1 so the first monitored epoch always writes a
    # checkpoint. That guarantees the file loaded below comes from this run
    # (never a stale one from an earlier run) and that it exists.
    best_val_f1 = -1.0
    counter = 0

    for epoch in range(max_monitored_epochs):
        train_classifier(model, train_dataloader, optimizer, criterion, device, epochs=1)

        val_preds, val_labels = _collect_predictions(model, val_dataloader, device)
        _, _, val_f1, _ = precision_recall_fscore_support(val_labels, val_preds, average='macro')
        print(f"Validation F1 Score: {val_f1:.4f}")

        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            torch.save(model.state_dict(), checkpoint_path)
            counter = 0
        else:
            counter += 1
            if counter >= patience:
                print(f"Early stopping triggered after {epoch+1} epochs")
                break

    # Restore the best weights. weights_only=True restricts unpickling to
    # tensors/containers; map_location keeps the load device-independent.
    best_state = torch.load(checkpoint_path, map_location=device, weights_only=True)
    model.load_state_dict(best_state)

    print("Evaluating classifier...")
    test_metrics = evaluate_classifier(model, test_dataloader, device)

    metrics_df = pd.DataFrame([test_metrics])
    metrics_df.to_csv('part_c_metrics.csv', index=False)

    # test_dataloader is not shuffled, so predictions line up with test_df rows.
    all_test_preds, all_test_labels = _collect_predictions(model, test_dataloader, device)

    predictions_df = pd.DataFrame({
        'image_id': test_df['image_id'].tolist(),
        'occlusion_level': test_df['occlusion_level'].tolist(),
        'true_model': ['SmolVLM' if label == 0 else 'Custom' for label in all_test_labels],
        'predicted_model': ['SmolVLM' if pred == 0 else 'Custom' for pred in all_test_preds],
        'correct': [pred == label for pred, label in zip(all_test_preds, all_test_labels)]
    })

    predictions_df.to_csv('part_c_predictions.csv', index=False)

    occlusion_performance = predictions_df.groupby('occlusion_level')['correct'].mean().reset_index()
    occlusion_performance.columns = ['occlusion_level', 'accuracy']
    print("\nPerformance by occlusion level:")
    print(occlusion_performance)

    plt.figure(figsize=(10, 6))
    sns.barplot(x='occlusion_level', y='accuracy', data=occlusion_performance)
    plt.title('Classification Accuracy by Occlusion Level')
    plt.xlabel('Occlusion Level (%)')
    plt.ylabel('Accuracy')
    plt.savefig('occlusion_level_performance.png')
    plt.close()

    print("Part C completed successfully!")


In [8]:
# Entry point: run the whole Part C pipeline when this cell/script is executed directly.
if __name__ == "__main__":
    main()

Loading and processing data...
Total processed data points: 5568
SmolVLM data points: 2784
Custom model data points: 2784


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Training set size: 3894
Validation set size: 552
Test set size: 1122


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Training set label distribution: SmolVLM=1947, Custom=1947
Validation set label distribution: SmolVLM=276, Custom=276
Test set label distribution: SmolVLM=561, Custom=561


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Training classifier...
Epoch 1/3


                                                                                         

Epoch 1/3 - Loss: 0.2118, Accuracy: 0.9081
Epoch 2/3


                                                                                         

Epoch 2/3 - Loss: 0.0468, Accuracy: 0.9800
Epoch 3/3


                                                                                         

Epoch 3/3 - Loss: 0.0424, Accuracy: 0.9756
Epoch 1/1


                                                                                          

Epoch 1/1 - Loss: 0.0312, Accuracy: 0.9784
Validation F1 Score: 0.9837
Epoch 1/1


                                                                                          

Epoch 1/1 - Loss: 0.0264, Accuracy: 0.9802
Validation F1 Score: 0.9837
Epoch 1/1


                                                                                          

Epoch 1/1 - Loss: 0.0261, Accuracy: 0.9795
Validation F1 Score: 0.9837
Epoch 1/1


                                                                                          

Epoch 1/1 - Loss: 0.0259, Accuracy: 0.9813
Validation F1 Score: 0.9837
Early stopping triggered after 4 epochs


  model.load_state_dict(torch.load('best_classifier_model.pt'))


Evaluating classifier...


Evaluating: 100%|██████████| 71/71 [00:08<00:00,  8.21it/s]


Accuracy: 0.9750
Precision: 0.9762
Recall: 0.9750
F1 Score: 0.9750

Performance by occlusion level:
  occlusion_level  accuracy
0              10  0.973262
1              50  0.975936
2              80  0.975936
Part C completed successfully!
