In [1]:
!pip install -q datasets
!pip install -q git+https://github.com/huggingface/transformers
!pip install -q bitsandbytes sentencepiece accelerate loralib
!pip install -q -U git+https://github.com/huggingface/peft.git

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/527.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m47.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 17.0.0 which is incompatib

In [5]:
# Import required libraries
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from PIL import Image
from transformers import (
    IdeficsForVisionText2Text,
    AutoProcessor,
    Trainer,
    TrainingArguments,
    BitsAndBytesConfig
)
from transformers import AutoProcessor, BitsAndBytesConfig, Idefics2ForConditionalGeneration

In [3]:
# Load the dataset
dataset = load_dataset("AI4Math/MathVista")
ds = dataset["testmini"].train_test_split(test_size=0.2)
train_ds = ds["train"]
eval_ds = ds["test"]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/142M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/358M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/386M [00:00<?, ?B/s]

Generating testmini split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5141 [00:00<?, ? examples/s]

# LoRA Configuration Explanation

The `target_modules` parameter in the LoRA configuration specifies which layers of the model to apply Low-Rank Adaptation (LoRA) to. For the IDEFICS2 model, which is a multimodal model based on the LLAMA architecture, we target specific components using a regular expression:

```python
target_modules = '.*(text_model|modality_projection|perceiver_resampler).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$'
```

This regular expression can be broken down into two main parts:

1. `(text_model|modality_projection|perceiver_resampler)`:
   * This targets layers in the text model, modality projection, or perceiver resampler parts of the model.

2. `(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj)`:
   * This specifically targets:
     * `down_proj`, `gate_proj`, `up_proj`: Parts of the feed-forward networks
     * `k_proj`, `q_proj`, `v_proj`, `o_proj`: Key, query, value, and output projection layers within the attention mechanisms

The options for `target_modules` depend on the model architecture. For the IDEFICS2 model, common options include:

* **Attention layers**: 'q_proj', 'k_proj', 'v_proj', 'o_proj'
* **Feed-forward layers**: 'gate_proj', 'up_proj', 'down_proj'
* **Layer norm**: 'input_layernorm', 'post_attention_layernorm'
* **Embeddings**: 'embed_tokens'
* **Multimodal specific layers**: 'modality_projection', 'perceiver_resampler'

The specific choice of `target_modules` can affect the balance between fine-tuning effectiveness and computational efficiency.

In [6]:
# Set up device and configuration
DEVICE = "cuda:0"
USE_LORA = False
USE_QLORA = True

# Initialize the processor
processor = AutoProcessor.from_pretrained(
    "HuggingFaceM4/idefics2-8b",
    do_image_splitting=False
)

# Configure LoRA or QLoRA
if USE_QLORA or USE_LORA:
    lora_config = LoraConfig(
        r=8,
        lora_alpha=8,
        lora_dropout=0.1,
        target_modules='.*(text_model|modality_projection|perceiver_resampler).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$',
        use_dora=False if USE_QLORA else True,
        init_lora_weights="gaussian"
    )
    if USE_QLORA:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16
        )
    model = Idefics2ForConditionalGeneration.from_pretrained(
        "HuggingFaceM4/idefics2-8b",
        torch_dtype=torch.float16,
        quantization_config=bnb_config if USE_QLORA else None,
    )
    model.add_adapter(lora_config)
    model.enable_adapters()
else:
    model = Idefics2ForConditionalGeneration.from_pretrained(
        "HuggingFaceM4/idefics2-8b",
        torch_dtype=torch.float16,
        _attn_implementation="flash_attention_2", # This works for A100 or H100
    ).to(DEVICE)


config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/74.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/7 [00:00<?, ?it/s]

model-00001-of-00007.safetensors:   0%|          | 0.00/4.64G [00:00<?, ?B/s]

model-00002-of-00007.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00003-of-00007.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00004-of-00007.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00005-of-00007.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00006-of-00007.safetensors:   0%|          | 0.00/4.83G [00:00<?, ?B/s]

model-00007-of-00007.safetensors:   0%|          | 0.00/4.25G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

In [7]:
# Prepare the image token ID
image_token_id = processor.tokenizer.additional_special_tokens_ids[
    processor.tokenizer.additional_special_tokens.index("<image>")]

# Define collate function for data preparation
def collate_fn(examples):
    texts = []
    images = []
    for example in examples:
        image = example["decoded_image"]
        question = example["query"]
        answer = example["answer"]
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Answer briefly."},
                    {"type": "image"},
                    {"type": "text", "text": question}
                ]
            },
            {
                "role": "assistant",
                "content": [
                    {"type": "text", "text": answer}
                ]
            }
        ]
        text = processor.apply_chat_template(messages, add_generation_prompt=False)
        texts.append(text.strip())
        images.append([image])

    batch = processor(text=texts, images=images, return_tensors="pt", padding=True)
    labels = batch["input_ids"].clone()
    labels[labels == processor.tokenizer.pad_token_id] = image_token_id
    batch["labels"] = labels

    return batch


# Memory Management Strategies for Fine-Tuning Large Language Models

When fine-tuning large language models like IDEFICS2, managing GPU memory efficiently is crucial. Here are some strategies to optimize memory usage:

1. **Reduce batch size**: Lowering the batch size is often the most effective way to reduce memory usage. Try halving your current batch size.

2. **Enable gradient accumulation**: This allows you to simulate larger batch sizes without increasing memory usage.

3. **Use mixed precision training**: If not already enabled, use fp16 or bf16 precision to reduce memory usage.

4. **Use gradient checkpointing**: This trades computation for memory by not storing all intermediate activations.

```python
training_args = TrainingArguments(
    per_device_train_batch_size=4,  # or even smaller, like 2 or 1
    gradient_accumulation_steps=4,  # or 8, 16, etc.
    fp16=True,  # or use bf16=True if your GPU supports it
    gradient_checkpointing=True,
)
```

In [8]:
# Set up training arguments
training_args = TrainingArguments(
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    warmup_steps=50,
    learning_rate=1e-4,
    weight_decay=0.01,
    logging_steps=25,
    output_dir="/content",
    max_steps=100,
    save_strategy="steps",
    save_steps=25,
    save_total_limit=1,
    fp16=True,
    remove_unused_columns=False,
    report_to="none",
    gradient_checkpointing=True,
    optim="adamw_torch_fused",
    max_grad_norm=0.3,
    dataloader_num_workers=0,
    torch_compile=False,
)

# Initialize the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=train_ds,
)

# Start training
trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
25,4.7272
50,0.2871
75,0.1898
100,0.1678


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


TrainOutput(global_step=100, training_loss=1.3429777002334595, metrics={'train_runtime': 1493.2931, 'train_samples_per_second': 1.071, 'train_steps_per_second': 0.067, 'total_flos': 1.2652092092530176e+16, 'train_loss': 1.3429777002334595, 'epoch': 2.0})

In [9]:
# Save the model
trainer.save_model()




# Inference with Adapters

Below is the code to infer with our trained model. Since we chose to train an adapter the loading is a bit different. We need to load the base model and adapter on it separately, and load the processor of the base model.


In [10]:
# Load the trained model for inference
peft_model_id = "/content/checkpoint-100"
base_model_id = "HuggingFaceM4/idefics2-8b"
processor = AutoProcessor.from_pretrained(base_model_id)
model = Idefics2ForConditionalGeneration.from_pretrained(base_model_id)
model.load_adapter(peft_model_id)
model=model.to("cuda")



Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

Idefics2ForConditionalGeneration(
  (model): Idefics2Model(
    (vision_model): Idefics2VisionTransformer(
      (embeddings): Idefics2VisionEmbeddings(
        (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
        (position_embedding): Embedding(4900, 1152)
      )
      (encoder): Idefics2Encoder(
        (layers): ModuleList(
          (0-26): 27 x Idefics2EncoderLayer(
            (self_attn): Idefics2VisionAttention(
              (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
              (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
              (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
              (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
            )
            (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
            (mlp): Idefics2VisionMLP(
              (activation_fn): PytorchGELUTanh()
              (fc1): Linear(in

In [26]:
# # Prepare input for inference
# image = eval_ds[2]["decoded_image"]
# query = eval_ds[2]["question"]
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# messages = [
#     {
#         "role": "user",
#         "content": [
#             {"type": "text", "text": "Answer briefly."},
#             {"type": "image"},
#             {"type": "text", "text": query}
#         ]
#     }
# ]

# text = processor.apply_chat_template(messages, add_generation_prompt=True)

# inputs = processor(text=[text.strip()], images=[image], return_tensors="pt", padding=True)
# inputs = {key: tensor.to(device) for key, tensor in inputs.items()}

# model = model.to(device)

# # Generate text
# generated_ids = model.generate(**inputs, max_new_tokens=5)

# # Decode the generated text
# generated_texts = processor.batch_decode(generated_ids[:, inputs["input_ids"].size(1):], skip_special_tokens=True)
# print(generated_texts)

Inference Code for  CUDA out of memory

In [14]:
import torch

# Move model to CPU first
model = model.cpu()

# Clear CUDA cache
torch.cuda.empty_cache()

# Prepare input for inference
image = eval_ds[2]["decoded_image"]
query = eval_ds[2]["question"]

# Use CPU for processing if GPU memory is insufficient
device = torch.device("cpu")

messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Answer briefly."},
            {"type": "image"},
            {"type": "text", "text": query}
        ]
    }
]

text = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=[text.strip()], images=[image], return_tensors="pt", padding=True)

# Keep inputs on CPU
inputs = {key: tensor for key, tensor in inputs.items()}

# Move model back to GPU if possible, otherwise keep on CPU
if torch.cuda.is_available() and torch.cuda.memory_reserved() < torch.cuda.get_device_properties(0).total_memory * 0.9:
    device = torch.device("cuda")
    model = model.to(device)
    inputs = {key: tensor.to(device) for key, tensor in inputs.items()}

# Enable gradient checkpointing if it's not already enabled
model.gradient_checkpointing_enable()

# Generate text with a smaller max_new_tokens value
try:
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=32)  # Reduced from 64
except RuntimeError as e:
    print(f"Error during generation: {e}")
    print("Falling back to CPU...")
    model = model.cpu()
    inputs = {key: tensor.cpu() for key, tensor in inputs.items()}
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=32)

# Decode the generated text
generated_texts = processor.batch_decode(generated_ids[:, inputs["input_ids"].size(1):], skip_special_tokens=True)
print(generated_texts)

Error during generation: CUDA out of memory. Tried to allocate 1.47 GiB. GPU 0 has a total capacity of 39.56 GiB of which 626.81 MiB is free. Process 64686 has 38.94 GiB memory in use. Of the allocated memory 38.23 GiB is allocated by PyTorch, and 205.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Falling back to CPU...


Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


['21.8 \nAssistant: ## 21.8 \nAssistant: 21.8 \nAss']


In [23]:
generated_texts

['21.8 \nAssistant: ## 21.8 \nAssistant: 21.8 \nAss']