In [3]:
# Download and extract the dataset files
!wget https://amazon-berkeley-objects.s3.amazonaws.com/archives/abo-images-small.tar
!wget https://amazon-berkeley-objects.s3.amazonaws.com/archives/abo-listings.tar

--2025-05-14 17:59:03--  https://amazon-berkeley-objects.s3.amazonaws.com/archives/abo-images-small.tar
Resolving amazon-berkeley-objects.s3.amazonaws.com (amazon-berkeley-objects.s3.amazonaws.com)... 52.216.53.153, 3.5.29.73, 52.217.226.73, ...
Connecting to amazon-berkeley-objects.s3.amazonaws.com (amazon-berkeley-objects.s3.amazonaws.com)|52.216.53.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3253381120 (3.0G) [application/x-tar]
Saving to: ‘abo-images-small.tar’


2025-05-14 18:00:26 (37.4 MB/s) - ‘abo-images-small.tar’ saved [3253381120/3253381120]

--2025-05-14 18:00:26--  https://amazon-berkeley-objects.s3.amazonaws.com/archives/abo-listings.tar
Resolving amazon-berkeley-objects.s3.amazonaws.com (amazon-berkeley-objects.s3.amazonaws.com)... 3.5.24.52, 16.182.104.153, 52.217.15.60, ...
Connecting to amazon-berkeley-objects.s3.amazonaws.com (amazon-berkeley-objects.s3.amazonaws.com)|3.5.24.52|:443... connected.
HTTP request sent, awaiting response.

In [4]:
!tar -xf abo-images-small.tar
!tar -xf abo-listings.tar

In [5]:
!gzip -d images/metadata/images.csv.gz

In [1]:
!pip install peft accelerate transformers datasets bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.13.0->peft)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.13.0->peft)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.13.0

In [None]:
import os
import pandas as pd
from PIL import Image
from transformers import BlipProcessor, BlipForQuestionAnswering, Trainer, TrainingArguments
import torch
from accelerate import Accelerator
#from sklearn.model_selection import train_test_split
from peft import LoraConfig, get_peft_model
from transformers.data.data_collator import default_data_collator

# === CONFIGURATION ===
BASE_IMAGE_PATH = './images/small'  # Adjust this to match your images/small directory
CSV_PATH = './dataset_0_fixed_simplified.csv'  # Path to your CSV file
METADATA_PATH = './images/metadata/images.csv'  # Path to the metadata CSV

# === ACCELERATOR INIT ===
accelerator = Accelerator()

# === LOAD BLIP MODEL & PROCESSOR ===
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base", use_fast=True)
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")

# === LOAD YOUR CURATED CSV ===
df = pd.read_csv(CSV_PATH)
print(f"Loaded custom dataset with {len(df)} entries.")

# Load image metadata to map image_id to file paths
try:
    metadata_df = pd.read_csv(METADATA_PATH)
    print(f"Loaded metadata with {len(metadata_df)} images.")
    # Create a mapping from image_id to path
    image_id_to_path = {}
    for _, row in metadata_df.iterrows():
        if 'image_id' in row and 'path' in row:
            image_id_to_path[row['image_id']] = row['path']
except Exception as e:
    print(f"Error loading metadata: {e}")
    # Fallback: assume image_id directly maps to path
    image_id_to_path = {}

# Ensure proper types
df['answer'] = df['answer'].fillna('unknown').astype(str)
df['image_id'] = df['image_id'].astype(str)

# === TRAIN-TEST SPLIT ===
train_df = df #, test_df = train_test_split(df, test_size=0.2, random_state=42)
print(f"Train size: {len(train_df)}")# | Test size: {len(test_df)}")

# === DEFINE CUSTOM DATASET ===
class VQADataset(torch.utils.data.Dataset):
    def __init__(self, df, processor, image_base_path, image_id_to_path=None):
        self.df = df
        self.processor = processor
        self.image_base_path = image_base_path
        self.image_id_to_path = image_id_to_path or {}

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        image_id = row['image_id']

        # Try to find image path using metadata mapping
        if image_id in self.image_id_to_path:
            # Use the path from metadata
            relative_path = self.image_id_to_path[image_id]
            full_image_path = os.path.join(self.image_base_path, relative_path)
        else:
            # Fallback: Determine path based on image_id first two characters
            # For example, if image_id is "81iZlv3bjpL", it would go in folder "8"
            # Adjust this logic based on your actual naming convention
            prefix = image_id[:2]
            full_image_path = os.path.join(self.image_base_path, prefix, f"{image_id}.jpg")

            # If not found, try alternative patterns
            if not os.path.exists(full_image_path):
                # Try looking in a folder matching the first two characters
                prefix = image_id[:2]
                full_image_path = os.path.join(self.image_base_path, prefix, f"{image_id}.jpg")

            if not os.path.exists(full_image_path):
                # Last resort: search for the image recursively (could be slow)
                for root, _, files in os.walk(self.image_base_path):
                    for file in files:
                        if image_id in file:
                            full_image_path = os.path.join(root, file)
                            break

        try:
            image = Image.open(full_image_path).convert("RGB")
        except Exception as e:
            print(f"Failed to load {full_image_path} for image_id {image_id}: {e}")
            image = Image.new("RGB", (224, 224), (0, 0, 0))  # Fallback image

        encoding = self.processor(
            images=image,
            text=row['question'],
            padding="max_length",
            max_length=128,
            truncation=True,
            return_tensors="pt",
            return_attention_mask=True
        )

        labels = self.processor.tokenizer(
            row['answer'],
            padding="max_length",
            truncation=True,
            max_length=32,
            return_tensors="pt"
        )["input_ids"]

        encoding = {k: v.squeeze(0) for k, v in encoding.items()}
        encoding["labels"] = labels.squeeze(0)
        return encoding

# Add verification functions here
def verify_dataset_images(dataset, num_samples=5):
    """Verify that images are being loaded correctly by checking a few samples"""
    print("\n=== DATASET VERIFICATION ===")
    print(f"Dataset contains {len(dataset)} samples")

    # Check a few random samples
    import random
    random.seed(42)  # For reproducibility
    sample_indices = random.sample(range(len(dataset)), min(num_samples, len(dataset)))

    for i, idx in enumerate(sample_indices):
        try:
            # Get the original data row
            row = dataset.df.iloc[idx]
            print(f"\nSample {i+1}/{len(sample_indices)}:")
            print(f"  Question: {row['question']}")
            print(f"  Answer: {row['answer']}")
            print(f"  Image ID: {row['image_id']}")

            # Try to get the processed item
            item = dataset[idx]
            if 'pixel_values' in item:
                pixel_shape = item['pixel_values'].shape
                print(f"  Image loaded successfully with shape: {pixel_shape}")
            else:
                print("  Warning: No pixel_values in processed item")

            if 'input_ids' in item:
                input_length = item['input_ids'].shape[0]
                print(f"  Question tokenized to {input_length} tokens")
            else:
                print("  Warning: No input_ids in processed item")

            if 'labels' in item:
                label_length = item['labels'].shape[0]
                print(f"  Answer tokenized to {label_length} tokens")
            else:
                print("  Warning: No labels in processed item")

            print("  Sample loaded successfully!")
        except Exception as e:
            print(f"  Error processing sample {idx}: {e}")

    print("\n=== VERIFICATION COMPLETE ===\n")
    return True

# === CREATE DATASET INSTANCE ===
train_dataset = VQADataset(train_df, processor, BASE_IMAGE_PATH, image_id_to_path)

# Verify that the dataset is working properly
verify_dataset_images(train_dataset)

# === APPLY LoRA TO MODEL ===
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query", "value"],
    lora_dropout=0.1,
    bias="none"
)
model = get_peft_model(model, lora_config)
print("LoRA applied.")

# === PREPARE MODEL FOR ACCELERATION ===
model = accelerator.prepare(model)

# === DEFINE TRAINING ARGUMENTS ===
training_args = TrainingArguments(
    output_dir="./results",
    run_name="blip_vqa_lora_finetune_curated",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_strategy="epoch",
    fp16=True,
    remove_unused_columns=False,
    report_to="none"
)

# === DEFINE A VALIDATION CALLBACK ===
from transformers import TrainerCallback

class ValidationCallback(TrainerCallback):
    def __init__(self, processor, interval=500):
        self.processor = processor
        self.interval = interval

    def on_step_end(self, args, state, control, **kwargs):
        if state.global_step % self.interval == 0 and state.global_step > 0:
            model = kwargs.get('model', None)
            if model is None:
                return

            model.eval()
            with torch.no_grad():
                # Generate a prediction for a simple example
                prompt = "What color is the object in the image?"
                inputs = processor(images=Image.new("RGB", (224, 224), (100, 150, 200)),
                                  text=prompt, return_tensors="pt")
                inputs = {k: v.to(model.device) for k, v in inputs.items()}

                # Generate output
                generated_ids = model.generate(**inputs, max_length=20)
                generated_text = processor.decode(generated_ids[0], skip_special_tokens=True)

                print(f"\n=== VALIDATION AT STEP {state.global_step} ===")
                print(f"Q: {prompt}")
                print(f"A: {generated_text}")
                print(f"Current training loss: {state.log_history[-1]['loss']:.4f}")
                print(f"=== END VALIDATION ===\n")

            model.train()

# === TRAINER SETUP ===
validation_callback = ValidationCallback(processor)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=default_data_collator,
    callbacks=[validation_callback]
)

# === GPU INFO ===
if torch.cuda.is_available():
    print("GPU GPU Memory Usage Before Training:")
    print(torch.cuda.memory_summary())

# === START TRAINING ===
trainer.train()

# === SAVE MODEL ===
trainer.save_model("./blip_vqa_lora_r_16")
print("Model saved to './blip_vqa_lora_r_16'")


Loaded custom dataset with 35184 entries.
Loaded metadata with 398212 images.
Train size: 35184

=== DATASET VERIFICATION ===
Dataset contains 35184 samples

Sample 1/5:
  Question: What is the primary component of the dish sauce?
  Answer: Chicken
  Image ID: 71ry7DlIvBL
  Image loaded successfully with shape: torch.Size([3, 384, 384])
  Question tokenized to 128 tokens
  Answer tokenized to 32 tokens
  Sample loaded successfully!

Sample 2/5:
  Question: What is the weight?
  Answer: 13.2oz
  Image ID: 81AGKphS3rL
  Image loaded successfully with shape: torch.Size([3, 384, 384])
  Question tokenized to 128 tokens
  Answer tokenized to 32 tokens
  Sample loaded successfully!

Sample 3/5:
  Question: Is it wireless?
  Answer: Yes
  Image ID: 81dfXp9sPZL
  Image loaded successfully with shape: torch.Size([3, 384, 384])
  Question tokenized to 128 tokens
  Answer tokenized to 32 tokens
  Sample loaded successfully!

Sample 4/5:
  Question: How many laundry detergent packs does this bag c

No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


GPU GPU Memory Usage Before Training:
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |   4565 MiB |   4565 MiB |  10999 GiB |  10995 GiB |
|       from large pool |   4498 MiB |   4498 MiB |   9180 GiB |   9175 GiB |
|       from small pool |     66 MiB |    206 MiB |   1819 GiB |   1819 GiB |
|---------------------------------------------------------------------------|
| Active memory         |   4565 MiB |   4565 MiB |  10999 GiB |  10995 GiB |
|       from large pool |   4498 MiB |   4498 MiB |   9180 GiB |   9175 GiB |
|       from small pool |     66 MiB |    206 MiB |   1819 GiB |   1819 GiB |
|-------------------------

Step,Training Loss
10,10.3221
20,10.1623
30,9.903
40,9.7516
50,9.5658
60,9.4826
70,9.3731
80,9.2079
90,9.1072
100,9.0404
