In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import pandas as pd
from PIL import Image
from transformers import ViltProcessor, ViltForQuestionAnswering, Trainer, TrainingArguments
import torch
from accelerate import Accelerator
from sklearn.model_selection import train_test_split
from peft import LoraConfig, get_peft_model
from transformers.data.data_collator import default_data_collator
import logging

# Set up logging for unmapped answers
logging.basicConfig(filename='unmapped_answers.log', level=logging.WARNING, format='%(message)s')

# Initialize Accelerator for efficient multi-GPU training
accelerator = Accelerator()

# Load ViLT processor and model
processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")

# Get ViLT's answer vocabulary (id2label mapping)
answer_vocab = model.config.id2label  # Maps index (0-3128) to answer string
num_answers = len(answer_vocab)  # Should be 3129
print(f"ViLT answer vocabulary size: {num_answers}")

# Define dataset paths 
VQA_DATASET_PATH = '/kaggle/input/vqadataset/VQADataset.csv'
ABO_METADATA_PATH = '/kaggle/input/abo-small/metadata/images.csv'
ABO_IMAGE_BASE_PATH = '/kaggle/input/abo-small/small'

# Validate dataset paths
if not os.path.exists(VQA_DATASET_PATH):
    raise FileNotFoundError(f"VQA dataset not found at {VQA_DATASET_PATH}")
if not os.path.exists(ABO_METADATA_PATH):
    raise FileNotFoundError(f"ABO metadata not found at {ABO_METADATA_PATH}")
if not os.path.exists(ABO_IMAGE_BASE_PATH):
    raise FileNotFoundError(f"ABO image directory not found at {ABO_IMAGE_BASE_PATH}")

# Load VQA dataset
vqa_df = pd.read_csv(VQA_DATASET_PATH)
print(f"Loaded VQA dataset with {len(vqa_df)} entries")

# Load ABO metadata and merge to get image paths
abo_metadata = pd.read_csv(ABO_METADATA_PATH)
vqa_df = pd.merge(vqa_df, abo_metadata[['image_id', 'path']], on='image_id', how='left')

# Handle missing values and ensure answers are strings
vqa_df['answer'] = vqa_df['answer'].fillna('unknown').astype(str)

# Validate image paths
vqa_df['image_path'] = vqa_df['path'].apply(lambda p: os.path.join(ABO_IMAGE_BASE_PATH, p))
missing_images = vqa_df[~vqa_df['image_path'].apply(os.path.exists)]
if not missing_images.empty:
    print(f"Warning: {len(missing_images)} images not found. Examples: {missing_images['image_path'].head().tolist()}")

# Split data into train and test sets
train_df, test_df = train_test_split(vqa_df, test_size=0.2, random_state=118)
print(f"Training set size: {len(train_df)}, Test set size: {len(test_df)}")

# Define a custom VQA Dataset class
class VQADataset(torch.utils.data.Dataset):
    def __init__(self, df, processor, image_base_path, answer_vocab, image_size=(384, 384)):
        self.df = df
        self.processor = processor
        self.image_base_path = image_base_path
        self.image_size = image_size
        self.answer_vocab = answer_vocab  # {index: answer}
        self.answer_to_idx = {v.lower(): k for k, v in answer_vocab.items()}  # Reverse mapping

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        image_path = os.path.join(self.image_base_path, row['path'])
        try:
            image = Image.open(image_path).convert("RGB")
            image = image.resize(self.image_size, Image.Resampling.LANCZOS)
        except Exception as e:
            print(f"Error loading image {image_path}: {e}")
            image = Image.new("RGB", self.image_size, (0, 0, 0))
        
        question = row['question']
        answer = row['answer'].lower()

        # Process image and question with ViLT processor
        encoding = self.processor(
            images=image,
            text=question,
            padding="max_length",
            max_length=40,
            truncation=True,
            return_tensors="pt",
            return_attention_mask=True
        )

        # Create binary labels for ViLT (shape: [num_answers])
        labels = torch.zeros(len(self.answer_vocab), dtype=torch.float32)
        # Map answer to ViLT's vocabulary
        if answer in self.answer_to_idx:
            answer_idx = self.answer_to_idx[answer]
            labels[answer_idx] = 1.0
        else:
            logging.warning(f"Answer '{answer}' not in ViLT vocabulary, using 'unknown'")
            if 'unknown' in self.answer_to_idx:
                labels[self.answer_to_idx['unknown']] = 1.0

        # Remove batch dimension from tensors
        encoding = {k: v.squeeze(0) for k, v in encoding.items()}
        encoding["labels"] = labels

        return encoding

# Custom data collator to handle tensor stacking
def custom_data_collator(features):
    batch = {}
    keys = ["input_ids", "attention_mask", "pixel_values", "labels"]
    
    for key in keys:
        if key in features[0]:
            try:
                batch[key] = torch.stack([f[key] for f in features])
            except RuntimeError as e:
                print(f"Error stacking {key}: {e}")
                # Fallback: pad to max size in batch
                max_len = max(f[key].shape[-1] for f in features if key in f)
                batch[key] = torch.stack([
                    torch.nn.functional.pad(
                        f[key], (0, max_len - f[key].shape[-1]), value=0
                    ) if key in f else torch.zeros_like(features[0][key])
                    for f in features
                ])
    
    return batch

# Create training dataset
train_dataset = VQADataset(train_df, processor, ABO_IMAGE_BASE_PATH, answer_vocab)

# Define LoRA configuration
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query", "value"], 
    lora_dropout=0.1,
    bias="none"
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
print("LoRA applied to the model")

# Prepare model with Accelerator
model = accelerator.prepare(model)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    run_name="vilt_vqa_lora_finetune",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_strategy="epoch",
    fp16=True,
    remove_unused_columns=False,
    report_to="none"
)

# Create Trainer instance with custom data collator
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=custom_data_collator,
)

# Check GPU memory usage before training
if torch.cuda.is_available():
    print("GPU Memory Usage Before Training:")
    print(torch.cuda.memory_summary())

# Start fine-tuning with LoRA
trainer.train()

# Save the fine-tuned model
trainer.save_model("./fine_tuned_vilt_vqa_lora")
print("Model saved to './fine_tuned_vilt_vqa_lora'")

2025-05-14 18:18:55.355475: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747246735.597329      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747246735.667762      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


preprocessor_config.json:   0%|          | 0.00/251 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/320 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/136k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/470M [00:00<?, ?B/s]

ViLT answer vocabulary size: 3129
Loaded VQA dataset with 64406 entries


model.safetensors:   0%|          | 0.00/470M [00:00<?, ?B/s]

Training set size: 51524, Test set size: 12882
LoRA applied to the model


No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


GPU Memory Usage Before Training:
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      | 462229 KiB | 462229 KiB | 462229 KiB |      0 B   |
|       from large pool | 458838 KiB | 458838 KiB | 458838 KiB |      0 B   |
|       from small pool |   3391 KiB |   3391 KiB |   3391 KiB |      0 B   |
|---------------------------------------------------------------------------|
| Active memory         | 462229 KiB | 462229 KiB | 462229 KiB |      0 B   |
|       from large pool | 458838 KiB | 458838 KiB | 458838 KiB |      0 B   |
|       from small pool |   3391 KiB |   3391 KiB |   3391 KiB |      0 B   |
|-----------------------------



Step,Training Loss
10,5.3528
20,4.8499
30,5.5711
40,5.0833
50,5.3724
60,5.0189
70,4.9567
80,4.9156
90,4.5996
100,4.7549




Model saved to './fine_tuned_vilt_vqa_lora'


In [2]:
import os
import zipfile
from datetime import datetime

# Define paths to the output directories and files
output_dirs = [
    './results',                    # Training results/checkpoints
    './logs',                       # Training logs
    './fine_tuned_vilt_vqa_lora',   # Fine-tuned model
]
output_files = [
    'unmapped_answers.log',         # Log of unmapped answers
]

# Define the output zip file path
zip_output_path = '/kaggle/working/model_outputs.zip'

# Create a timestamp for the zip file name (optional, for uniqueness)
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
zip_output_path = f'/kaggle/working/model_outputs_{timestamp}.zip'

# Function to add files and directories to zip
def zip_outputs(dirs, files, output_path):
    with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        # Add directories
        for dir_path in dirs:
            if os.path.exists(dir_path):
                for root, _, filenames in os.walk(dir_path):
                    for filename in filenames:
                        file_path = os.path.join(root, filename)
                        # Write file to zip with relative path
                        arcname = os.path.relpath(file_path, start=os.path.dirname(dir_path))
                        zipf.write(file_path, os.path.join(os.path.basename(dir_path), arcname))
                        print(f"Added to zip: {file_path}")
            else:
                print(f"Directory not found, skipping: {dir_path}")
        
        # Add individual files
        for file_path in files:
            if os.path.exists(file_path):
                zipf.write(file_path, os.path.basename(file_path))
                print(f"Added to zip: {file_path}")
            else:
                print(f"File not found, skipping: {file_path}")

# Create the zip file
zip_outputs(output_dirs, output_files, zip_output_path)

# Print instructions for downloading
print(f"\nZip file created at: {zip_output_path}")
print("To download in Kaggle:")
print("1. Go to the 'Output' tab in the Kaggle notebook interface.")
print(f"2. Locate '{os.path.basename(zip_output_path)}' under '/kaggle/working'.")
print("3. Click the three dots next to the file and select 'Download'.")

Added to zip: ./results/checkpoint-3222/training_args.bin
Added to zip: ./results/checkpoint-3222/rng_state.pth
Added to zip: ./results/checkpoint-3222/scheduler.pt
Added to zip: ./results/checkpoint-3222/trainer_state.json
Added to zip: ./results/checkpoint-3222/adapter_config.json
Added to zip: ./results/checkpoint-3222/adapter_model.safetensors
Added to zip: ./results/checkpoint-3222/optimizer.pt
Added to zip: ./results/checkpoint-3222/scaler.pt
Added to zip: ./results/checkpoint-3222/README.md
Added to zip: ./results/checkpoint-4830/training_args.bin
Added to zip: ./results/checkpoint-4830/rng_state.pth
Added to zip: ./results/checkpoint-4830/scheduler.pt
Added to zip: ./results/checkpoint-4830/trainer_state.json
Added to zip: ./results/checkpoint-4830/adapter_config.json
Added to zip: ./results/checkpoint-4830/adapter_model.safetensors
Added to zip: ./results/checkpoint-4830/optimizer.pt
Added to zip: ./results/checkpoint-4830/scaler.pt
Added to zip: ./results/checkpoint-4830/READ