In [2]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from PIL import Image
import torch
from torch.utils.data import DataLoader
import pytorch_lightning as pl
from transformers import AutoProcessor

class ImageTextDataModule(pl.LightningDataModule):
    def __init__(self, data_dir, image_folder, processor, batch_size=1):
        super().__init__()
        self.data_dir = data_dir
        self.image_folder = image_folder
        self.processor = processor
        self.batch_size = batch_size

    def prepare_data(self):
        # Load the JSON file into a pandas DataFrame
        df = pd.read_json(os.path.join(self.data_dir, 'combined_dataset.json'))

        # Split dataset into train+val (80%) and test (20%)
        train_val, test = train_test_split(df, test_size=0.2, random_state=42)

        # Split train+val into train (80% of 80% = 64% of total) and val (20% of 80% = 16% of total)
        train, val = train_test_split(train_val, test_size=0.2, random_state=42)

        # Save the train, val, and test datasets as separate JSON files
        train.to_json(os.path.join(self.data_dir, 'train_dataset.json'), orient='records', indent=4)
        val.to_json(os.path.join(self.data_dir, 'val_dataset.json'), orient='records', indent=4)
        test.to_json(os.path.join(self.data_dir, 'test_dataset.json'), orient='records', indent=4)

    def setup(self, stage=None):
        # Load datasets
        self.train_data = Dataset.from_json(os.path.join(self.data_dir, 'train_dataset.json'))
        self.val_data = Dataset.from_json(os.path.join(self.data_dir, 'val_dataset.json'))
        self.test_data = Dataset.from_json(os.path.join(self.data_dir, 'test_dataset.json'))

    def process(self, examples):
        texts = [
            f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n<|image|> {item['question']} <|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{item['value']}<|eot_id|>"
            for item in examples
        ]
        images = [
            Image.open(os.path.join(self.image_folder, item["image"])).convert("RGB")
            for item in examples
        ]
        batch = self.processor(text=texts, images=images, return_tensors="pt", padding=True)
        labels = batch["input_ids"].clone()
        labels[labels == self.processor.tokenizer.pad_token_id] = -100
        labels[labels == 128256] = -100  # Mask image token index for images
        batch["labels"] = labels
        return batch

    def collate_fn(self, batch):
        return self.process(batch)

    def train_dataloader(self):
        return DataLoader(self.train_data, batch_size=self.batch_size, collate_fn=self.collate_fn)

    def val_dataloader(self):
        return DataLoader(self.val_data, batch_size=self.batch_size, collate_fn=self.collate_fn)

    def test_dataloader(self):
        return DataLoader(self.test_data, batch_size=self.batch_size, collate_fn=self.collate_fn)


In [3]:
import torch
import pytorch_lightning as pl
from transformers import AutoModelForVision2Seq, get_scheduler, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model

class VisionTextModel(pl.LightningModule):
    def __init__(self, model_id, learning_rate=1e-4, weight_decay=0.01):
        super().__init__()
        self.save_hyperparameters()

        # Set up quantization configuration
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True, 
            bnb_4bit_use_double_quant=True, 
            bnb_4bit_quant_type="nf4", 
            bnb_4bit_compute_dtype=torch.bfloat16
        )

        # Load the model with quantization and apply LoRA adapters
        self.model = AutoModelForVision2Seq.from_pretrained(
            model_id,
            device_map="auto",
            torch_dtype=torch.bfloat16,
            quantization_config=bnb_config,
            low_cpu_mem_usage=True,
        )
        
        # Define LoRA configuration
        peft_config = LoraConfig(
            lora_alpha=64,
            lora_dropout=0.1,
            r=64,
            bias="none",
            target_modules=["q_proj", "v_proj"],
            task_type="FEATURE_EXTRACTION",
        )

        # Apply LoRA adapters
        self.model = get_peft_model(self.model, peft_config)
        self.model.tie_weights()  # Tie weights for shared embeddings

    def forward(self, input_ids=None, attention_mask=None, pixel_values=None, labels=None, **kwargs):
        return self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values,
            labels=labels,
            **kwargs  # Pass through any additional arguments
        )

    def training_step(self, batch, batch_idx):
        outputs = self(**batch)
        loss = outputs.loss
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        outputs = self(**batch)
        loss = outputs.loss
        self.log("val_loss", loss)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(
            self.parameters(),
            lr=self.hparams.learning_rate,
            weight_decay=self.hparams.weight_decay,
        )
        
        # Configure learning rate scheduler
        scheduler = get_scheduler(
            name="linear",
            optimizer=optimizer,
            num_warmup_steps=100,
            num_training_steps=self.trainer.estimated_stepping_batches,
        )
        
        return [optimizer], [{"scheduler": scheduler, "interval": "step"}]


In [4]:
import pytorch_lightning as pl
from transformers import AutoProcessor

# Initialize the processor
processor = AutoProcessor.from_pretrained("meta-llama/Llama-3.2-11B-Vision-Instruct")

# Define the model ID and login
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
login('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')

# Define DataModule
data_module = ImageTextDataModule(
    data_dir='/home/huuthanhvy.nguyen001/tmp/LLMP/EXP/outputEXP1/json',
    image_folder='/home/huuthanhvy.nguyen001/tmp/LLMP/EXP/outputEXP1/images',
    processor=processor,
    batch_size=1,
)

# Define Model
model = VisionTextModel(model_id="meta-llama/Llama-3.2-11B-Vision-Instruct")

# Trainer configuration
trainer = pl.Trainer(
    accelerator="gpu",  # Use GPU (ensure your device is compatible)
    devices=1,  # Specify number of GPUs
    max_epochs=10,  # Number of epochs
    log_every_n_steps=1,  # Log every step
    val_check_interval=1,  # Evaluate every 5 steps (similar to eval_steps=5)
    check_val_every_n_epoch=1,  # Run validation every epoch
    accumulate_grad_batches=8,  # Accumulate gradients over 8 batches
    gradient_clip_val=1.0,  # Gradient clipping value
    enable_checkpointing=True,  # Enable checkpoint saving
    limit_train_batches=1.0,  # Use entire training data (can adjust if needed)
    limit_val_batches=1.0,  # Use entire validation data
)

# Training
trainer.fit(model, datamodule=data_module)


OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct.
401 Client Error. (Request ID: Root=1-672ab646-58c9a2b56d560384194a0d69;c4051bf8-0437-460e-bfa0-1b6726d3f7aa)

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct/resolve/main/config.json.
Access to model meta-llama/Llama-3.2-11B-Vision-Instruct is restricted. You must have access to it and be authenticated to access it. Please log in.