# Qwen2-VL-2B-Instruct Training on HAM10000 Dataset

**Streamlined training pipeline for Kaggle environment**

Features: bf16 fine-tuning by default; optional QLoRA + 4-bit quantization for memory efficiency (present in the notebook but currently commented out)

In [None]:
# Install dependencies
%pip install transformers accelerate peft tqdm pillow pandas bitsandbytes --q

In [None]:
# Import libraries and configuration
import torch
from torch.utils.data import DataLoader
from transformers import (
    Qwen2VLForConditionalGeneration,
    Qwen2VLProcessor,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import Dataset as HFDataset
import pandas as pd
from PIL import Image
import os
from pathlib import Path

# Single source of truth for the run: checkpoint/data locations plus the
# hyperparameters that the following cells read by key.
CONFIG = dict(
    # Checkpoint and data locations
    model_name="Qwen/Qwen2-VL-2B-Instruct",
    train_image_dir="/kaggle/input/small-isic",
    train_metadata_file="/kaggle/input/small-isic/HAM10000_metadata.csv",
    output_dir="./qwen2_5_vl_trained",
    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=5e-5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    # Maximum tokenised sequence length handed to the processor
    max_length=512,
    # LoRA hyperparameters (only referenced by the commented-out QLoRA setup)
    lora_r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    # Additional stability settings
    warmup_steps=100,
    weight_decay=0.01,
    max_grad_norm=1.0,
    seed=42,
)

print("Configuration loaded ✓")


In [None]:
# Load the base model and its processor.
# The 4-bit BitsAndBytes configuration is kept for reference but disabled,
# so the weights are loaded in bfloat16 without quantization.
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16
# )

_checkpoint = CONFIG["model_name"]

processor = Qwen2VLProcessor.from_pretrained(_checkpoint, trust_remote_code=True)

model = Qwen2VLForConditionalGeneration.from_pretrained(
    _checkpoint,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    # quantization_config=bnb_config,
    trust_remote_code=True,
)

# Padding needs a pad token; reuse EOS when the tokenizer does not define one.
_tokenizer = processor.tokenizer
if _tokenizer.pad_token is None:
    _tokenizer.pad_token = _tokenizer.eos_token
    _tokenizer.pad_token_id = _tokenizer.eos_token_id

# QLoRA adapter setup (disabled together with the quantization config above)
# peft_config = LoraConfig(
#     lora_alpha=CONFIG["lora_alpha"],
#     lora_dropout=CONFIG["lora_dropout"],
#     r=CONFIG["lora_r"],
#     bias="none",
#     target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
#     task_type=TaskType.CAUSAL_LM,
# )

# model = get_peft_model(model, peft_config)
# model.print_trainable_parameters()

# Force gradients on the LoRA weights (only relevant once the adapter is enabled)
# for name, param in model.named_parameters():
#     if 'lora' in name:
#         param.requires_grad = True

print("Model loaded ✓")

In [None]:
# Load HAM10000 dataset
def load_ham10000_data(image_dir=None, metadata_file=None):
    """Build chat-style samples by pairing metadata rows with image files.

    Args:
        image_dir: Directory searched recursively for ``.jpg``/``.jpeg``/``.png``
            files. Defaults to ``CONFIG["train_image_dir"]``.
        metadata_file: CSV file with one row per image. Defaults to
            ``CONFIG["train_metadata_file"]``.

    Returns:
        A list of dicts, one per metadata row that has a matching image. Each
        dict holds ``"image_path"`` (str) and ``"conversation"`` (a user turn
        with an image placeholder and a fixed prompt, followed by an assistant
        turn stating the diagnosis).

    The image id is read from the ``image_id`` column (falling back to
    ``Image``) and the diagnosis from ``dx`` (falling back to ``diagnosis``,
    then the literal ``'skin lesion'``). Rows whose id does not equal the file
    stem of any discovered image are skipped.
    """
    image_dir = Path(CONFIG["train_image_dir"] if image_dir is None else image_dir)
    if metadata_file is None:
        metadata_file = CONFIG["train_metadata_file"]
    df = pd.read_csv(metadata_file)

    # Index images by file stem once. An exact-stem lookup avoids the false
    # positives of substring matching (e.g. id "ISIC_001" hitting
    # "ISIC_0012.jpg", or an empty id hitting every file) and replaces a scan
    # of all files per row with a dict lookup. setdefault keeps the first file
    # found when the same stem appears in several folders.
    images_by_id = {}
    for ext in ("*.jpg", "*.jpeg", "*.png"):
        for img_path in image_dir.glob(f"**/{ext}"):
            images_by_id.setdefault(img_path.stem, img_path)

    conversations = []
    for _, row in df.iterrows():
        image_id = str(row.get('image_id', row.get('Image', '')))
        image_path = images_by_id.get(image_id)
        if image_path is None:
            # No image on disk for this metadata row.
            continue

        diagnosis = str(row.get('dx', row.get('diagnosis', 'skin lesion')))
        conversations.append({
            "image_path": str(image_path),
            "conversation": [
                {
                    "role": "user",
                    "content": [
                        {"type": "image"},
                        {"type": "text", "text": "Analyze this skin lesion and provide a diagnosis."}
                    ]
                },
                {
                    "role": "assistant",
                    "content": [
                        {"type": "text", "text": f"This appears to be {diagnosis}."}
                    ]
                }
            ]
        })

    return conversations

# Materialise the sample list once; later cells split it and draw test samples from it.
dataset = load_ham10000_data()
_num_samples = len(dataset)
print(f"Dataset loaded ✓ ({_num_samples} samples)")


In [None]:
# Data collator
def collate_fn(examples):
    """Turn a list of samples into a padded model batch with labels.

    Args:
        examples: Iterable of dicts with an ``"image_path"`` and a
            ``"conversation"`` in chat-template format.

    Returns:
        The processor output (tensors, padded and truncated to
        ``CONFIG["max_length"]``) with an added ``"labels"`` entry: a copy of
        ``input_ids`` where padding and image-placeholder positions are set to
        -100 (the label value Hugging Face models exclude from the loss).
    """
    texts = []
    images = []

    for example in examples:
        # convert() returns an independent RGB copy, so the source file can be
        # closed as soon as the with-block exits.
        with Image.open(example["image_path"]) as raw_image:
            images.append(raw_image.convert('RGB'))

        texts.append(
            processor.apply_chat_template(
                example["conversation"],
                tokenize=False,
                add_generation_prompt=False
            )
        )

    batch = processor(
        text=texts,
        images=images,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=CONFIG["max_length"]
    )

    labels = batch["input_ids"].clone()
    labels[labels == processor.tokenizer.pad_token_id] = -100

    # Mask image tokens in labels to prevent learning image embeddings as text.
    # Not every processor version exposes `image_token_id`; when it is missing,
    # resolve the id from the placeholder token instead of silently skipping
    # the masking. "<|image_pad|>" is the Qwen2-VL image placeholder.
    image_token_id = getattr(processor, "image_token_id", None)
    if image_token_id is None:
        image_token = getattr(processor, "image_token", "<|image_pad|>")
        image_token_id = processor.tokenizer.convert_tokens_to_ids(image_token)
    if image_token_id is not None:
        labels[labels == image_token_id] = -100

    batch["labels"] = labels
    return batch


In [None]:
# Prepare datasets
import random

# Shuffle a copy with a fixed seed before the 80/20 split. A purely sequential
# split makes the eval set the tail of the metadata file, which is not a
# representative sample if rows are grouped (e.g. by diagnosis — TODO confirm
# the ordering of this CSV). `dataset` itself is left untouched.
shuffled_data = list(dataset)
random.Random(CONFIG["seed"]).shuffle(shuffled_data)

train_size = int(0.8 * len(shuffled_data))
train_data = shuffled_data[:train_size]
eval_data = shuffled_data[train_size:]

train_dataset = HFDataset.from_list(train_data)
eval_dataset = HFDataset.from_list(eval_data)

print(f"Training: {len(train_data)} | Evaluation: {len(eval_data)}")


In [None]:
# Setup training
import inspect

training_args = TrainingArguments(
    output_dir=CONFIG["output_dir"],
    per_device_train_batch_size=CONFIG["per_device_train_batch_size"],
    gradient_accumulation_steps=CONFIG["gradient_accumulation_steps"],
    num_train_epochs=CONFIG["num_train_epochs"],
    learning_rate=CONFIG["learning_rate"],

    # Learning rate scheduling and regularization
    warmup_steps=CONFIG["warmup_steps"],
    lr_scheduler_type="cosine",
    weight_decay=CONFIG["weight_decay"],
    max_grad_norm=CONFIG["max_grad_norm"],

    # Logging and saving
    logging_steps=100,
    save_steps=1000,  # Less frequent saves
    eval_steps=1000,
    # `evaluation_strategy` was renamed to `eval_strategy`; the old keyword is
    # rejected by newer transformers releases.
    eval_strategy="steps",
    save_strategy="steps",
    save_total_limit=2,  # Keep only 2 checkpoints

    # Memory and performance optimizations
    # Keep the raw "image_path"/"conversation" columns: collate_fn reads them.
    remove_unused_columns=False,
    bf16=True,
    # gradient_checkpointing=True,
    report_to="none",

    # Make the configured seed take effect instead of leaving it unused.
    seed=CONFIG["seed"],
)

# Trainer's `tokenizer` argument was superseded by `processing_class`; pick
# whichever keyword the installed transformers version accepts.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_processing_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=collate_fn,
    **{_processing_kwarg: processor.tokenizer},
)

In [None]:
# Run fine-tuning
print("Starting training...")
trainer.train()
print("Training complete ✓")

# Persist the model weights and the processor into the same output directory
_save_dir = CONFIG["output_dir"]
trainer.save_model(_save_dir)
processor.save_pretrained(_save_dir)
print("Model saved")

In [None]:
# Test the trained model inference

def test_model_inference(model, processor, test_samples, num_tests=3):
    """Generate a diagnosis for the first ``num_tests`` samples and collect results.

    Args:
        model: Model exposing ``generate`` and ``device``.
        processor: Processor used to build the prompt, encode inputs and
            decode the generated tokens.
        test_samples: Sliceable sequence of dicts with ``"image_path"`` and a
            ``"conversation"`` whose second turn holds the ground-truth text.
        num_tests: Maximum number of samples to run.

    Returns:
        A list with one dict per attempted sample. Successful entries contain
        ``"image"``, ``"ground_truth"``, ``"prediction"`` and ``"success": True``;
        failed entries contain ``"image"``, ``"error"`` and ``"success": False``.
        Failures are reported per sample and do not abort the loop.
    """
    results = []
    selected = test_samples[:num_tests]
    # Report the real number of runs, which may be smaller than num_tests.
    total = len(selected)

    # The inference prompt is identical for every sample.
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "Analyze this skin lesion and provide a diagnosis."}
            ]
        }
    ]

    for i, sample in enumerate(selected):
        print(f"\n--- Test {i+1}/{total} ---")

        # Reset per iteration so a failure before the path is read cannot
        # report the image name left over from the previous sample.
        image_name = f"test_{i+1}"

        try:
            # Load test image
            image_path = sample["image_path"]
            image_name = os.path.basename(image_path)
            with Image.open(image_path) as raw_image:
                test_image = raw_image.convert('RGB')
            print(f"Image: {image_name}")

            # Process input; tokenize=False keeps the template output as text
            # for the processor call below.
            text_prompt = processor.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=True
            )
            inputs = processor(
                text=[text_prompt],
                images=[test_image],
                return_tensors="pt"
            ).to(model.device)

            # Generate response
            with torch.no_grad():
                generated_ids = model.generate(
                    **inputs,
                    max_new_tokens=150,
                    temperature=0.7,
                    do_sample=True,
                    pad_token_id=processor.tokenizer.eos_token_id
                )

            # Decode only the newly generated tokens (drop the prompt prefix)
            prompt_length = inputs["input_ids"].shape[1]
            response = processor.batch_decode(
                generated_ids[:, prompt_length:],
                skip_special_tokens=True
            )[0].strip()

            # Get ground truth from dataset
            ground_truth = sample["conversation"][1]["content"][0]["text"]

            print(f"Ground Truth: {ground_truth}")
            print(f"Model Output: {response}")

            results.append({
                "image": image_name,
                "ground_truth": ground_truth,
                "prediction": response,
                "success": True
            })

        except Exception as e:
            # Deliberate best-effort: record the failure and continue.
            print(f"❌ Error processing test {i+1}: {e}")
            results.append({
                "image": image_name,
                "error": str(e),
                "success": False
            })

    return results

# Reload the saved model and processor, then run inference tests
print("Running inference tests on trained model...")
model = Qwen2VLForConditionalGeneration.from_pretrained(CONFIG["output_dir"],
                             device_map="auto", torch_dtype=torch.bfloat16,
                             trust_remote_code=True)
processor = Qwen2VLProcessor.from_pretrained(CONFIG["output_dir"],
                                             trust_remote_code=True)

# Prefer held-out samples: the first entries of `dataset` were used for
# training, so testing on them says little about generalisation. Fall back to
# the full dataset only when the evaluation split is empty.
inference_samples = eval_data if eval_data else dataset
test_results = test_model_inference(model, processor, inference_samples, num_tests=3)

# Summary ("passed" means the sample ran without raising, not that the
# prediction matched the ground truth)
successful_tests = sum(1 for r in test_results if r["success"])
print(f"\n{'='*50}")
print(f"TEST SUMMARY: {successful_tests}/{len(test_results)} tests passed")
print(f"{'='*50}")
