In [1]:
from huggingface_hub import notebook_login

In [2]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.svâ€¦

In [3]:
import trl

In [4]:
trl.__version__

'0.24.0'

## Dataset

In [5]:
from datasets import load_dataset

dataset_id = 'lmms-lab/multimodal-open-r1-8k-verified'
train_dataset = load_dataset(dataset_id, split='train[:5%]')

In [6]:
train_dataset[0]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=251x275>,
 'problem': "Based on the image, if segment DE is extended while maintaining the angle at point D constant, what will be the effect on the value of 's'? Choose the correct answer from the options below:\nChoices:\nA. Increase 's'\nB. Decrease 's'\nC. No change\nD. Make 's' negative",
 'solution': "<think>Let's consider the relationship between the length of segment DE and the value of 's'. The value of 's' is influenced by both the length of DE and the sine of the angle at point D. Since the angle remains constant, the sine of the angle does not change. Therefore, the only factor affecting 's' is the length of DE. As DE is extended, its length increases, which in turn increases the value of 's'. Hence, the correct answer is A.</think>\n\n<answer>A</answer>",
 'original_question': "According to the question shown in the image, please first perform reasoning, then finally select the right answer from the choices, 

In [7]:
from transformers import AutoProcessor

model_name = "Qwen/Qwen3-VL-4B-Instruct" # "Qwen/Qwen3-VL-8B-Instruct"
processor = AutoProcessor.from_pretrained(model_name, padding_side="left")

SYSTEM_PROMPT = (
    "You are a helpful AI Assistant that provides well-reasoned and detailed responses. "
    "You first think about the reasoning process as an internal monologue and then provide the user with the answer. "
    "Respond in the following format: \n...\n\n\n...\n"
)


def make_conversation(example):
    conversation = [
        {
            "role": "system",
            "content": [{"type": "text", "text": SYSTEM_PROMPT}],
        },
        {
            "role": "user",
            "content": [
                {"type": "image", "image": example["image"]},
                {"type": "text", "text": example["problem"]},
            ],
        },
    ]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    return {
        "prompt": prompt,
        "image": example["image"],
    }

train_dataset = train_dataset.map(make_conversation)

In [8]:
train_dataset = train_dataset.remove_columns(['problem', 'original_question', 'original_answer'])

In [9]:
train_dataset[0]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=251x275>,
 'solution': "<think>Let's consider the relationship between the length of segment DE and the value of 's'. The value of 's' is influenced by both the length of DE and the sine of the angle at point D. Since the angle remains constant, the sine of the angle does not change. Therefore, the only factor affecting 's' is the length of DE. As DE is extended, its length increases, which in turn increases the value of 's'. Hence, the correct answer is A.</think>\n\n<answer>A</answer>",
 'prompt': "<|im_start|>system\nYou are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: \n...\n\n\n...\n<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Based on the image, if segment DE is extended while maintaining the angle at point D constan

## Load Model. Lora Config

In [10]:
from transformers import Qwen3VLForConditionalGeneration, BitsAndBytesConfig
import torch

model = Qwen3VLForConditionalGeneration.from_pretrained(
    model_name, dtype="auto",
    device_map="auto"
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
from peft import LoraConfig

# update target_modules -> merger.linear_fc2 and deepstack_merger_list.linear_fc2 if we want to fine-tune the VL projection layers as well 
# reduced dropout to 0.05 since our dataset is small and to avoid underfitting
# increased lora_alpha to 32 to give more weight to the low-rank updates 
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],
)

In [22]:
sol = """<think>Let's think through this step-by-step. The height of the pole can be calculated using the tangent of the angle, which is given by the formula: height = distance * tan(angle). In this case, the distance is 10 meters, and the angle is 70 degrees. If the angle decreases, the tangent of the angle also decreases because the tangent function increases with increasing angles in the range from 0 to 90 degrees. Therefore, the height calculated using this tangent value will also decrease.</think>

<answer>B</answer>"""
parse(
    sol,
    extraction_mode="first_match",
    extraction_config=[
                LatexExtractionConfig(
                    normalization_config=NormalizationConfig(
                        nits=False,
                        malformed_operators=False,
                        basic_latex=True,
                        equations=True,
                        boxed=True,
                        units=True,
                    ),
                    boxed_match_priority=0,
                    try_extract_without_anchor=False,
                )
            ],
)

[]

## Reward fn

In [None]:
import re

def format_reward(completions, **kwargs):
    """Reward function that checks if the reasoning process is enclosed within  and  tags, while the final answer is enclosed within  and  tags."""
    pattern = r"^\n.*?\n\n\n.*?\n$"
    matches = [re.match(pattern, content, re.DOTALL | re.MULTILINE) for content in completions]
    return [1.0 if match else 0.0 for match in matches]

from math_verify import LatexExtractionConfig, parse, verify
from latex2sympy2_extended import NormalizationConfig


def len_reward(completions, solution, **kwargs) -> float:
    """Compute length-based rewards to discourage overthinking and promote token efficiency.

    Taken from the Kimi 1.5 tech report: https://huggingface.co/papers/2501.12599

    Args:
        completions: List of model completions
        solution: List of ground truth solutions

    Returns:
        List of rewards where:
        - For correct answers: reward = 0.5 - (len - min_len)/(max_len - min_len)
        - For incorrect answers: reward = min(0, 0.5 - (len - min_len)/(max_len - min_len))
    """
    contents = completions

    # First check correctness of answers
    correctness = []
    for content, sol in zip(contents, solution):
        gold_parsed = parse(
            sol,
            extraction_mode="first_match",
            extraction_config=[LatexExtractionConfig()],
        )
        if len(gold_parsed) == 0:
            # Skip unparseable examples
            correctness.append(True)  # Treat as correct to avoid penalizing
            print("Failed to parse gold solution: ", sol)
            continue

        answer_parsed = parse(
            content,
            extraction_config=[
                LatexExtractionConfig(
                    normalization_config=NormalizationConfig(
                        nits=False,
                        malformed_operators=False,
                        basic_latex=True,
                        equations=True,
                        boxed=True,
                        units=True,
                    ),
                    boxed_match_priority=0,
                    try_extract_without_anchor=False,
                )
            ],
            extraction_mode="first_match",
        )
        correctness.append(verify(answer_parsed, gold_parsed))

    # Calculate lengths
    lengths = [len(content) for content in contents]
    min_len = min(lengths)
    max_len = max(lengths)

    # If all responses have the same length, return zero rewards
    if max_len == min_len:
        return [0.0] * len(completions)

    rewards = []
    for length, is_correct in zip(lengths, correctness):
        lambda_val = 0.5 - (length - min_len) / (max_len - min_len)

        if is_correct:
            reward = lambda_val
        else:
            reward = min(0, lambda_val)

        rewards.append(float(reward))

    return rewards


## GRPO Config

In [13]:
from trl import GRPOConfig

output_dir = "Qwen3-VL-4B-Instruct-trl-grpo"

# Configure training arguments using GRPOConfig
training_args = GRPOConfig(
    learning_rate=2e-5,
    #num_train_epochs=1,
    max_steps=10,                                        # Number of dataset passes. For full trainings, use `num_train_epochs` instead

    # Parameters that control the data preprocessing
    per_device_train_batch_size=2,
    max_completion_length=1024, # default: 256            # Max completion length produced during training
    num_generations=2, # 2, # default: 8                  # Number of generations produced during trainig for comparison
    max_prompt_length=2048, # default: 512                # Max prompt lenght of the input prompt used for generation during training

    fp16=True,

    # Parameters related to reporting and saving
    output_dir=output_dir,                                # Where to save model checkpoints and logs
    logging_steps=1,                                      # Log training metrics every N steps
    report_to="trackio",                                  # Experiment tracking tool

    # Hub integration
    push_to_hub=True,
    log_completions=True
)

In [14]:
from trl import GRPOTrainer

trainer = GRPOTrainer(
    model=model,
    reward_funcs=[format_reward, len_reward],
    args=training_args,
    train_dataset=train_dataset,
    peft_config=peft_config,
)

In [15]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

In [16]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA L40. Max memory = 44.428 GB.
2.082 GB of memory reserved.


In [None]:
trainer_stats = trainer.train()

In [None]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
     

391.6479 seconds used for training.
6.53 minutes used for training.
Peak reserved memory = 11.984 GB.
Peak reserved memory for training = 9.902 GB.
Peak reserved memory % of max memory = 26.974 %.
Peak reserved memory for training % of max memory = 22.288 %.


: 