<a href="https://colab.research.google.com/github/shahedmomenzadeh/Qwen2.5_VL_video_QA_finetune/blob/main/Qwen2_5_VL_LoRa_on_Video_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Notebook shell commands: install or upgrade the Hugging Face, PEFT, PyTorch
# and bitsandbytes packages, then qwen-vl-utils. `--quiet` trims pip's output.
# Note that the next cell installs qwen-vl-utils again with the `decord` extra.
!pip install --upgrade transformers datasets accelerate peft torch torchvision bitsandbytes --quiet
!pip install qwen-vl-utils --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.7/374.7 kB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m888.1/888.1 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m594.3/594.3 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m132.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.0/88.0 MB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m954.8/954.8 kB[0m [31m55.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m706.8/706.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.1/193.1 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [1]:
import os
import json
import torch
import shutil
import numpy as np
import requests
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple, Union

# Install required packages if they are not already installed.
!pip install --upgrade transformers datasets accelerate peft torch torchvision bitsandbytes --quiet
!pip install qwen-vl-utils[decord] --quiet

# Hugging Face and PEFT imports
from huggingface_hub import login
from peft import get_peft_model, LoraConfig, TaskType

# Transformers and datasets imports
from transformers import (
    AutoProcessor,
    AutoModelForVision2Seq,
    Trainer,
    TrainingArguments,
    BitsAndBytesConfig,
    default_data_collator
)
from torch.utils.data import Dataset

# Qwen-VL utils for proper video processing
from qwen_vl_utils import process_vision_info

# 1. Login to Hugging Face hub (optional, but good practice)
print("Login to Hugging Face Hub:")
# The login call is intentionally left disabled: authentication is assumed to
# be handled elsewhere (for example through a token in the environment).
# login()

# 2. Use the user-specified local video file
print("\nUsing local video file...")
downloaded_video_path = "./video_girl_playing_music.mp4"

if not os.path.exists(downloaded_video_path):
    print(f"Error: Video file not found at {downloaded_video_path}")
    print("Please make sure the video file is uploaded to the correct path.")
    # Raise instead of calling exit(): exit() is a helper intended for the
    # interactive shell, whereas an exception stops the cell/script with a
    # clear traceback that names the missing file.
    raise FileNotFoundError(f"Video file not found at {downloaded_video_path}")
print(f"Found video file: {downloaded_video_path}")

# Create a directory to store the dataset videos
os.makedirs("videos_temporal_girl", exist_ok=True)

# 3. Manually create a QA dataset using the local video.
# Every sample reuses the same source video and question; only the reference
# answer differs, so the number of samples is derived from the answer list
# (a separately hard-coded count could drift out of sync and raise IndexError).
answers = [
    "A girl is sitting in a chair outdoors and playing a musical instrument.",
    "The video shows a girl playing music in a natural setting.",
    "A musician is playing an instrument while seated in a chair surrounded by nature."
]
num_samples = len(answers)
print(f"\nPreparing {num_samples} samples for temporal fine-tuning...")

data = []
for sample_number, answer in enumerate(answers, start=1):
    video_filename = f"video_girl_action_{sample_number}.mp4"
    video_path_local = os.path.join("videos_temporal_girl", video_filename)
    # Each sample gets its own copy of the video file inside the dataset directory.
    shutil.copy(downloaded_video_path, video_path_local)
    data.append({
        "video_path": video_path_local,
        "question": "Describe the main action in the video.",
        "answer": answer,
    })

# Save the structured data to a JSONL file (one JSON object per line)
jsonl_path = "qa_temporal_dataset_girl.jsonl"
with open(jsonl_path, "w", encoding="utf-8") as f:
    f.writelines(json.dumps(entry) + "\n" for entry in data)
print(f"Saved {len(data)} videos and temporal QA pairs to {jsonl_path}")

# 4. Load the processor and model
model_id = "Qwen/Qwen2.5-VL-3B-Instruct"

# Define the quantization configuration for memory efficiency.
# The compute dtype follows the same bf16-or-fp16 check that the training
# arguments further down use, so quantized matmuls and mixed-precision training
# agree on GPUs without bf16 support.
compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

print("\nLoading processor and model with 4-bit quantization...")
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForVision2Seq.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=quantization_config,
    trust_remote_code=True
)

# 5. Setup LoRA / PEFT configuration
print("Configuring LoRA...")
peft_config = LoraConfig(
    # Qwen2.5-VL is trained with a causal language-modeling loss (the run log
    # reports `ForCausalLMLoss`), so the matching PEFT task type is CAUSAL_LM
    # rather than the encoder-decoder SEQ_2_SEQ_LM.
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=4,
    lora_alpha=8,
    lora_dropout=0.05,
    # LoRA adapters are attached to the attention projection layers only.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]
)
model = get_peft_model(model, peft_config)

# Prepare the model for training
model.config.use_cache = False  # the KV cache is incompatible with gradient checkpointing
model.gradient_checkpointing_enable()
# Make the input embeddings require grad so gradients can flow through the
# checkpointed layers even though the base weights themselves are frozen.
model.enable_input_require_grads()
model.print_trainable_parameters()

# 6. Define the PyTorch Dataset for loading and processing our data
class VideoQADataset(Dataset):
    """Video question-answer samples prepared for Qwen2.5-VL fine-tuning.

    Every non-blank line of the JSONL file must be a JSON object with the keys
    ``video_path``, ``question`` and ``answer``. Text templating is done with
    the processor's chat template, while the vision inputs are extracted
    separately with ``process_vision_info``.

    Args:
        jsonl_file: Path to the JSONL file describing the samples.
        processor: Processor providing ``apply_chat_template`` and a call
            interface accepting ``text``, ``images`` and ``videos``.
        fps: Value stored under the ``"fps"`` key of each video entry
            (defaults to 1, the value previously hard-coded).
    """

    def __init__(self, jsonl_file, processor, fps=1):
        self.processor = processor
        self.fps = fps
        with open(jsonl_file, "r", encoding="utf-8") as f:
            # Skip blank lines (e.g. a trailing newline at the end of the
            # file) instead of letting json.loads fail on them.
            self.samples = [json.loads(line) for line in f if line.strip()]

    def __len__(self):
        """Return the number of loaded samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Build the model inputs and loss labels for sample ``idx``.

        Returns:
            A dict holding the processor outputs for the full conversation
            (each tensor passed through ``squeeze(0)``) plus ``labels``: a
            copy of ``input_ids`` in which the prompt positions are set to
            -100 so that only the answer tokens contribute to the loss.
        """
        sample = self.samples[idx]
        video_path = sample["video_path"]
        question = sample["question"]
        answer = sample["answer"]

        # Conversation in the structure expected by the chat template: one
        # user turn (video + question) followed by the reference answer.
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "video",
                        "video": video_path,  # the key must be "video", not "path"
                        "fps": self.fps,  # frame sampling rate; adjust via the constructor
                    },
                    {"type": "text", "text": question},
                ],
            },
            {
                "role": "assistant",
                "content": answer  # plain string for the assistant response
            }
        ]

        # Render the full conversation as text (vision data is handled below).
        text = self.processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False
        )

        # Extract vision inputs separately
        image_inputs, video_inputs = process_vision_info(messages)

        # Tokenize together with the vision inputs
        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=False,  # no padding here; left to the collator if needed
            return_tensors="pt"
        )

        # Drop the leading batch dimension of size 1 added for this single
        # example; squeeze(0) leaves tensors with a larger first dim untouched.
        inputs = {k: v.squeeze(0) for k, v in inputs.items() if v is not None}

        # Labels start as a copy of the input ids
        inputs["labels"] = inputs["input_ids"].clone()

        # Measure the prompt (user turn + generation prompt) so it can be
        # masked out of the labels.
        prompt_messages = messages[:1]  # only the user message
        prompt_text = self.processor.apply_chat_template(
            prompt_messages,
            tokenize=False,
            add_generation_prompt=True
        )
        # The assistant turn is a plain string without vision content, so the
        # vision inputs extracted above are exactly what the prompt needs.
        # Reusing them avoids reading and decoding the video a second time.
        prompt_inputs = self.processor(
            text=[prompt_text],
            images=image_inputs,
            videos=video_inputs,
            padding=False,
            return_tensors="pt"
        )
        prompt_len = prompt_inputs["input_ids"].shape[1]
        inputs["labels"][:prompt_len] = -100

        return inputs

# Instantiate the dataset
train_dataset = VideoQADataset(jsonl_path, processor)

# 7. Define Training Arguments and instantiate the Trainer
# Use bf16 mixed precision when the GPU supports it, otherwise fall back to fp16.
use_bf16 = torch.cuda.is_bf16_supported()
training_args = TrainingArguments(
    output_dir="./qwen_temporal_peft_finetuned_girl",
    num_train_epochs=5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    logging_steps=1,
    # Keep every key produced by the dataset (vision tensors included) instead
    # of letting the Trainer drop columns.
    remove_unused_columns=False,
    gradient_checkpointing=True,
    fp16=not use_bf16,
    bf16=use_bf16,
    optim="adamw_8bit",
    seed=3407,
    report_to="none",  # Avoid external logging
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    # `tokenizer=` is deprecated in recent transformers releases in favour of
    # `processing_class`; this notebook installs the latest transformers.
    processing_class=processor.tokenizer,
    # NOTE(review): samples are produced without padding, so this collator
    # relies on per_device_train_batch_size=1; a larger batch size would
    # presumably need a padding-aware collator.
    data_collator=default_data_collator,
)

# 8. Start Fine-tuning
print("\nStarting fine-tuning with the Trainer API...")
trainer.train()
print("\n✅ Fine-tuning completed successfully!")

# 9. Save the final LoRA adapter together with the processor so the inference
# cell can load both from the same directory.
output_adapter_dir = "qwen_peft_lora_adapter_temporal_girl"
print(f"\nSaving LoRA adapter to {output_adapter_dir}...")
trainer.save_model(output_adapter_dir)
processor.save_pretrained(output_adapter_dir)
print("Adapter and processor saved.")

Login to Hugging Face Hub:

Using local video file...
Found video file: ./video_girl_playing_music.mp4

Preparing 3 samples for temporal fine-tuning...
Saved 3 videos and temporal QA pairs to qa_temporal_dataset_girl.jsonl

Loading processor and model with 4-bit quantization...


The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.
You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Configuring LoRA...
trainable params: 1,843,200 || all params: 3,756,466,176 || trainable%: 0.0491


  trainer = Trainer(



Starting fine-tuning with the Trainer API...


qwen-vl-utils using decord to read video.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
1,6.691
2,6.4988
3,6.1836
4,5.9342
5,5.7107



✅ Fine-tuning completed successfully!

Saving LoRA adapter to qwen_peft_lora_adapter_temporal_girl...
Adapter and processor saved.


In [3]:
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq
from peft import PeftModel
from qwen_vl_utils import process_vision_info

# The packages needed here are the same ones the training script installs.
# If they are missing, uncomment the two lines below:
# !pip install --upgrade transformers accelerate peft torch torchvision bitsandbytes --quiet
# !pip install qwen-vl-utils[decord] --quiet

# Inference settings
model_id = "Qwen/Qwen2.5-VL-3B-Instruct"
adapter_dir = "qwen_peft_lora_adapter_temporal_girl"
video_path = "./video_girl_playing_music.mp4"  # same video as used for training
question = "Describe the main action in the video in 2 sentences."
fps = 1  # same value as the training setting
max_new_tokens = 128  # upper bound on the length of the generated response

# The processor was saved next to the adapter, so it is loaded from there.
processor = AutoProcessor.from_pretrained(adapter_dir, trust_remote_code=True)

# Both base-model copies are loaded with identical options: bf16 weights when
# the GPU supports it, fp16 otherwise.
_base_load_kwargs = dict(
    torch_dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,
    device_map="auto",
    trust_remote_code=True,
)

# Untouched base model, used as the comparison baseline
print("Loading original model...")
original_model = AutoModelForVision2Seq.from_pretrained(model_id, **_base_load_kwargs)
original_model.eval()

# Second copy of the base model that receives the LoRA adapter
print("Loading fine-tuned model...")
base_model = AutoModelForVision2Seq.from_pretrained(model_id, **_base_load_kwargs)
finetuned_model = PeftModel.from_pretrained(base_model, adapter_dir, device_map="auto")
finetuned_model.eval()

# Single user turn: the video followed by the question
_video_part = {"type": "video", "video": video_path, "fps": fps}
_question_part = {"type": "text", "text": question}
messages = [{"role": "user", "content": [_video_part, _question_part]}]

# Function to generate response from a model
def generate_response(model, processor, messages):
    """Generate a reply to ``messages`` with ``model`` and return it as text.

    The conversation is rendered with the processor's chat template (with the
    generation prompt appended), the vision inputs are extracted with
    ``process_vision_info``, and generation runs without sampling so outputs
    from different models can be compared. The response length is capped by
    the module-level ``max_new_tokens``.

    Returns:
        The decoded newly generated tokens only (prompt tokens are cut off),
        with special tokens skipped.
    """
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    images, videos = process_vision_info(messages)

    # Build the model inputs and move them to the model's device.
    model_inputs = processor(
        text=[prompt],
        images=images,
        videos=videos,
        padding=True,
        return_tensors="pt"
    ).to(model.device)

    with torch.no_grad():
        output_ids = model.generate(
            **model_inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,  # deterministic, for a fair comparison
        )

    # The output starts with the prompt tokens; keep only what follows them.
    prompt_length = len(model_inputs["input_ids"][0])
    new_token_ids = output_ids[0, prompt_length:]
    return processor.decode(new_token_ids, skip_special_tokens=True)

# Run the same prompt through both models
print("\nGenerating response from original model...")
original_output = generate_response(original_model, processor, messages)
print("Original Model Output:", original_output)

print("\nGenerating response from fine-tuned model...")
finetuned_output = generate_response(finetuned_model, processor, messages)
print("Fine-Tuned Model Output:", finetuned_output)

# Print both answers one after the other for easy comparison
print("\nComparison:")
for label, response in (
    ("Original:", original_output),
    ("Fine-Tuned:", finetuned_output),
):
    print(label, response)

Loading original model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Loading fine-tuned model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]


Generating response from original model...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Original Model Output: The main action in the video is a woman playing a ukulele on a grassy hillside with rolling hills and trees in the background. She is wearing a white dress and has long, wavy hair.

Generating response from fine-tuned model...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Fine-Tuned Model Output: A woman stands on a grassy hill, playing an orange ukulele. She moves her head slightly and adjusts her hair as she plays.

Comparison:
Original: The main action in the video is a woman playing a ukulele on a grassy hillside with rolling hills and trees in the background. She is wearing a white dress and has long, wavy hair.
Fine-Tuned: A woman stands on a grassy hill, playing an orange ukulele. She moves her head slightly and adjusts her hair as she plays.
