In [2]:
%pip install -U -q git+https://github.com/huggingface/trl.git bitsandbytes peft qwen-vl-utils trackio
# Tested with trl==0.22.0.dev0, bitsandbytes==0.47.0, peft==0.17.1, qwen-vl-utils==0.0.11, trackio==0.2.8

%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu129
%pip install hf_xet

Note: you may need to restart the kernel to use updated packages.
Looking in indexes: https://download.pytorch.org/whl/cu129
Collecting torchaudio
  Using cached https://download.pytorch.org/whl/cu129/torchaudio-2.8.0%2Bcu129-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (7.2 kB)
Using cached https://download.pytorch.org/whl/cu129/torchaudio-2.8.0%2Bcu129-cp312-cp312-manylinux_2_28_x86_64.whl (3.6 MB)
Installing collected packages: torchaudio
Successfully installed torchaudio-2.8.0+cu129
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [1]:
# Log in to the Hugging Face Hub from inside the notebook.
# notebook_login() renders an interactive widget (see the VBox output of this cell).
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
import torch
from transformers import Qwen2VLForConditionalGeneration, Qwen2VLProcessor
from transformers import BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
from qwen_vl_utils import process_vision_info
from trl import SFTConfig
from trl import SFTTrainer

from datasets import Dataset

import gc
import time
import trackio

model_id = "Qwen/Qwen2-VL-7B-Instruct"

In [3]:
# ## 3. Dataset preparation
#
# - Loads the per-episode CSV files and the image paths they reference from the
#   `Assets_Zara01` folder.
# - Each CSV file represents one episode (trajectory).
# - Each sample consists of (current image, natural-language instruction, current action).
# - The instruction is generated dynamically from the start/goal position in the CSV.
# - The action is the 2-D vector `[Action_X, Action_Z]`.

import os
import pandas as pd
import glob

# Current working directory (project root)
project_root = os.getcwd()
data_root = os.path.join(project_root, "Assets_Zara01")
state_files_path = os.path.join(data_root, "Zara01_State")

# Collect every episode CSV. glob.glob() returns entries in arbitrary order, so the
# list is sorted to keep sample order -- and therefore the seeded splits made from
# it -- reproducible across runs and machines.
csv_files = sorted(glob.glob(os.path.join(state_files_path, "*.csv")))
if not csv_files:
    print(f"Warning: no CSV files found in {state_files_path}")

raw_dataset = []

# Iterate over every CSV file (episode)
for csv_file in csv_files:
    df = pd.read_csv(csv_file)

    if df.empty:
        continue

    # Start and goal position of the episode are read from the first row
    start_pos_row = df.iloc[0]
    start_pos = [start_pos_row['Position_X'], start_pos_row['Position_Y'], start_pos_row['Position_Z']]
    goal_pos = [start_pos_row['GoalPosition_X'], start_pos_row['GoalPosition_Y'], start_pos_row['GoalPosition_Z']]

    # English instruction, coordinates rounded to two decimals
    instruction = (
        f"From the start position [{start_pos[0]:.2f}, {start_pos[1]:.2f}, {start_pos[2]:.2f}], "
        f"reach the goal position [{goal_pos[0]:.2f}, {goal_pos[1]:.2f}, {goal_pos[2]:.2f}]."
    )

    # One sample per timestep. Iterating over the three needed columns avoids
    # building a Series per row the way DataFrame.iterrows() does.
    for fov_image_path, action_x, action_z in zip(
        df['FovImagePath'].tolist(), df['Action_X'].tolist(), df['Action_Z'].tolist()
    ):
        # Image paths in the CSV are relative and start with 'Assets/...'.
        # Only that leading component is renamed to the on-disk folder name, so an
        # 'Assets' substring further down the path is left untouched.
        path_parts = fov_image_path.split('/')
        if path_parts[0] == 'Assets':
            path_parts[0] = 'Assets_Zara01'
        relative_image_path = os.sep.join(path_parts)
        image_path = os.path.abspath(os.path.join(project_root, relative_image_path))

        raw_dataset.append({
            "observation_images": [image_path],
            "instruction": instruction,
            "action": [action_x, action_z],
        })

# Report the dataset size (use a subset if it is too large)
print(f"Total samples created: {len(raw_dataset)}")
# Show the first five samples as a sanity check
print("Example samples:")
for sample in raw_dataset[:5]:
    print(sample)

# Convert to a Hugging Face Dataset object.
# If the full data is too large, a slice can be used to limit memory usage,
# e.g. hf_dataset = Dataset.from_list(raw_dataset[:1000])
hf_dataset = Dataset.from_list(raw_dataset)

Total samples created: 50172
Example samples:
{'observation_images': ['/home/rlawlsgus/github/VLMFinetuningToy/Assets_Zara01/FoVImages/zara01_146/t_0.jpg'], 'instruction': 'From the start position [17.03, 0.00, 2.57], reach the goal position [-1.32, 0.00, 2.60].', 'action': [-143.94, 1.8039]}
{'observation_images': ['/home/rlawlsgus/github/VLMFinetuningToy/Assets_Zara01/FoVImages/zara01_146/t_1.jpg'], 'instruction': 'From the start position [17.03, 0.00, 2.57], reach the goal position [-1.32, 0.00, 2.60].', 'action': [-143.94, 1.8039]}
{'observation_images': ['/home/rlawlsgus/github/VLMFinetuningToy/Assets_Zara01/FoVImages/zara01_146/t_2.jpg'], 'instruction': 'From the start position [17.03, 0.00, 2.57], reach the goal position [-1.32, 0.00, 2.60].', 'action': [-143.94, 1.8029]}
{'observation_images': ['/home/rlawlsgus/github/VLMFinetuningToy/Assets_Zara01/FoVImages/zara01_146/t_3.jpg'], 'instruction': 'From the start position [17.03, 0.00, 2.57], reach the goal position [-1.32, 0.00, 

In [4]:
# System prompt placed in the "system" turn of every formatted conversation (see format_data).
# It asks the model to reply with only the `[Action_X, Action_Z]` vector.
# This is runtime text: editing it changes what the model is trained on.
system_message = """You are a Vision-Language Agent (VLA) controlling a mobile agent.
Your task is to interpret the visual data from a Field-of-View (FoV) image and a natural language instruction to navigate from a start to a goal position.
Your primary objective is to avoid collisions with obstacles (e.g., walls, furniture) while generating the appropriate action vector `[Action_X, Action_Z]` to reach the destination efficiently.
Must output only the precise action vector `[Action_X, Action_Z]` based on the current visual input and the overall goal."""

In [6]:
def format_data(sample):
    """Turn one raw sample into the chat-style record used for fine-tuning.

    Args:
        sample: Mapping with the keys ``"observation_images"`` (list of image
            paths), ``"instruction"`` (str) and ``"action"`` (the target action).

    Returns:
        A dict with ``"images"`` (the sample's image list, passed through as-is)
        and ``"messages"``: a system turn carrying the global ``system_message``,
        a user turn holding the first image plus the instruction, and an
        assistant turn holding ``str(action)`` as the expected reply.
    """
    image_paths = sample["observation_images"]

    system_turn = {
        "role": "system",
        "content": [{"type": "text", "text": system_message}],
    }

    user_content = [
        {"type": "image", "image": sample["observation_images"][0]},
        {"type": "text", "text": sample["instruction"]},
    ]
    user_turn = {"role": "user", "content": user_content}

    # The target is the textual form of the action list, e.g. "[1.5, -2.0]".
    assistant_turn = {
        "role": "assistant",
        "content": [{"type": "text", "text": str(sample["action"])}],
    }

    return {"images": image_paths, "messages": [system_turn, user_turn, assistant_turn]}

In [7]:
# 1. Split hf_dataset into train (80%) and a held-out pool (20%).
# NOTE(review): the split is drawn over individual timestep samples, so frames of one
# episode can land in different splits -- confirm this is acceptable for evaluation.
train_test_split = hf_dataset.train_test_split(seed=42, test_size=0.2)
train_dataset_raw, test_val_dataset_raw = train_test_split['train'], train_test_split['test']

# 2. Halve the held-out pool into evaluation (10%) and test (10%).
#    The 'train' key of this second split is the evaluation half.
test_eval_split = test_val_dataset_raw.train_test_split(seed=42, test_size=0.5)
eval_dataset_raw, test_dataset_raw = test_eval_split['train'], test_eval_split['test']

# 3. Convert every split to the conversation format built by format_data.
train_dataset, eval_dataset, test_dataset = (
    list(map(format_data, split))
    for split in (train_dataset_raw, eval_dataset_raw, test_dataset_raw)
)

# 4. Report the size of each split.
print(f"Total samples: {len(hf_dataset)}")
print(f"Train samples: {len(train_dataset)}")
print(f"Evaluation samples: {len(eval_dataset)}")
print(f"Test samples: {len(test_dataset)}")

Total samples: 50172
Train samples: 40137
Evaluation samples: 5017
Test samples: 5018


In [8]:
# Load the un-quantized base model in bfloat16 for a pre-fine-tuning sanity check.
# device_map="auto" leaves weight placement to the loader; the cell output reports that
# some parameters were offloaded to CPU/disk on this machine.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",
    dtype=torch.bfloat16,
)

# Processor for the same checkpoint; used below for chat templating, input preparation and decoding.
processor = Qwen2VLProcessor.from_pretrained(model_id)

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu and disk.


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

In [9]:
import ast # 문자열을 파이썬 객체로 안전하게 변환하기 위해 추가합니다.

def generate_action_from_sample(model, processor, sample, max_new_tokens=30, device="cuda"):
    """Generate an action vector for one formatted sample (image + instruction).

    Args:
        model: Model exposing ``generate(**inputs, max_new_tokens=...)``.
        processor: Processor exposing ``apply_chat_template``, ``__call__`` and
            ``batch_decode``.
        sample: Record produced by ``format_data``; ``sample["messages"]`` holds
            the system, user and assistant turns in that order.
        max_new_tokens: Upper bound on the number of generated tokens.
        device: Device the prepared inputs are moved to.

    Returns:
        A list of two numbers parsed from the generated text, or ``None`` (after
        printing a warning) when the text is not a 2-element numeric list.
    """
    # Build the prompt from the user turn only.
    # NOTE(review): the system turn is not included here although the formatted
    # training records contain it -- confirm this mismatch is intended.
    text_input = processor.apply_chat_template(
        sample["messages"][1:2],  # Use the user message
        tokenize=False,
        add_generation_prompt=True,
    )

    # Process the visual input from the sample
    image_inputs, _ = process_vision_info(sample["messages"])

    # Prepare the inputs for the model
    model_inputs = processor(
        text=[text_input],
        images=image_inputs,
        return_tensors="pt",
    ).to(device)

    # Generate text with the model
    generated_ids = model.generate(**model_inputs, max_new_tokens=max_new_tokens)

    # Drop the prompt tokens so only the newly generated tokens are decoded
    trimmed_generated_ids = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    # Decode the output text
    output_text = processor.batch_decode(
        trimmed_generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )[0]

    return _parse_action_vector(output_text)


def _parse_action_vector(output_text):
    """Parse generated text into a 2-element list of numbers, or return None with a warning."""
    try:
        # ast.literal_eval only evaluates Python literals, so arbitrary model text is
        # never executed. Besides ValueError/SyntaxError it can raise TypeError (e.g.
        # an unhashable dict key), MemoryError or RecursionError on malformed input.
        action_vector = ast.literal_eval(output_text.strip())
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Parsing failed, e.g. the model produced free text instead of a list
        print(f"Warning: Failed to parse model output. Output: {output_text}")
        return None

    is_numeric_pair = (
        isinstance(action_vector, list)
        and len(action_vector) == 2
        # bool is a subclass of int but is not a meaningful action component
        and all(isinstance(v, (int, float)) and not isinstance(v, bool) for v in action_vector)
    )
    if is_numeric_pair:
        return action_vector

    print(f"Warning: Output could not be parsed as a 2D action vector. Output: {output_text}")
    return None  # callers may substitute a default such as [0.0, 0.0]

In [23]:
# Example of how to call the method with sample:
# runs one training sample through whichever `model`/`processor` are currently loaded;
# the trailing bare expression displays the parsed action (or None if parsing failed).
output = generate_action_from_sample(model, processor, train_dataset[0])
output


1


In [10]:
def clear_memory():
    """Drop the notebook's large global objects and release cached GPU memory.

    Removes the module-level names ``inputs``, ``model``, ``processor``,
    ``trainer``, ``peft_model`` and ``bnb_config`` when present, runs the garbage
    collector, and -- only when CUDA is available -- empties the CUDA caching
    allocator and synchronizes the device. Finally prints the allocated and
    reserved GPU memory in GB (0.00 on machines without CUDA).

    Only the global names are removed; objects still referenced from elsewhere
    stay alive and keep their memory.
    """
    # Delete variables if they exist in the current global scope
    for name in ("inputs", "model", "processor", "trainer", "peft_model", "bnb_config"):
        globals().pop(name, None)
    time.sleep(2)

    # Garbage collection and clearing CUDA memory
    gc.collect()
    time.sleep(2)
    cuda_available = torch.cuda.is_available()
    if cuda_available:
        # torch.cuda.synchronize() raises on machines without CUDA, hence the guard.
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
    time.sleep(2)
    gc.collect()
    time.sleep(2)

    allocated_bytes = torch.cuda.memory_allocated() if cuda_available else 0
    reserved_bytes = torch.cuda.memory_reserved() if cuda_available else 0
    print(f"GPU allocated memory: {allocated_bytes / 1024**3:.2f} GB")
    print(f"GPU reserved memory: {reserved_bytes / 1024**3:.2f} GB")


clear_memory()

GPU allocated memory: 0.00 GB
GPU reserved memory: 0.00 GB


In [11]:
# BitsAndBytesConfig int-4 config:
# 4-bit NF4 quantization with double quantization enabled and bfloat16 as compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

# Load model and tokenizer
# Reloads the checkpoint with the quantization config above; this rebinds the global
# `model` and `processor` names that clear_memory() removed.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_id, device_map="auto", dtype=torch.bfloat16, quantization_config=bnb_config
)
processor = Qwen2VLProcessor.from_pretrained(model_id)

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Configure LoRA
# Rank-8 adapters (alpha 16, dropout 0.05) on the attention `q_proj` and `v_proj`
# projections; no bias terms are trained.
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=8,
    bias="none",
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
)

# Apply PEFT model adaptation
# NOTE(review): the trainer cell later passes `model` (not `peft_model`) together with
# `peft_config` to SFTTrainer -- confirm the adapters are not being set up twice.
peft_model = get_peft_model(model, peft_config)

# Print trainable parameters
peft_model.print_trainable_parameters()

trainable params: 1,261,568 || all params: 8,292,637,184 || trainable%: 0.0152


In [None]:
# Configure training arguments
# Effective batch size per device per optimizer step: 4 (batch) x 8 (accumulation) = 32.
# NOTE(review): checkpoints are saved every 200 steps while evaluation runs every 500,
# so most checkpoints have no matching eval metrics -- confirm this is intended.
training_args = SFTConfig(
    output_dir="qwen2-7b-instruct-trl-sft-Action",  # Directory to save the model
    num_train_epochs=2,  # Number of training epochs
    per_device_train_batch_size=4,  # Batch size for training
    per_device_eval_batch_size=4,  # Batch size for evaluation
    gradient_accumulation_steps=8,  # Steps to accumulate gradients
    gradient_checkpointing_kwargs={"use_reentrant": False},  # Options for gradient checkpointing
    max_length=None,  # presumably disables truncation so image tokens are not cut -- TODO confirm
    # Optimizer and scheduler settings
    optim="paged_adamw_8bit",  # Optimizer type
    learning_rate=2e-4,  # Learning rate for training
    # Logging and evaluation
    logging_steps=500,  # Steps interval for logging
    eval_steps=500,  # Steps interval for evaluation
    eval_strategy="steps",  # Strategy for evaluation
    save_strategy="steps",  # Strategy for saving the model
    save_steps=200,  # Steps interval for saving
    # Mixed precision and gradient settings
    bf16=True,  # Use bfloat16 precision
    max_grad_norm=0.3,  # Maximum norm for gradient clipping
    warmup_ratio=0.03,  # Ratio of total steps for warmup
    # Hub and reporting (both currently disabled, so library defaults apply)
    # push_to_hub=True,  # Whether to push model to Hugging Face Hub
    # report_to="trackio",  # Reporting tool for tracking metrics
)

In [13]:
# Start a Trackio run named after the output directory and record the full training
# configuration with it; `space_id` is derived from the output directory name.
# NOTE(review): `report_to` is commented out in the SFTConfig cell and the training
# output shows a separate "huggingface" Trackio project being created -- confirm that
# training metrics actually reach the run initialized here.
trackio.init(
    project="qwen2-7b-instruct-trl-sft-Action",
    name="qwen2-7b-instruct-trl-sft-Action",
    config=training_args.to_dict(),
    space_id=training_args.output_dir + "-trackio",
)

* Trackio project initialized: qwen2-7b-instruct-trl-sft-Action
* Trackio metrics will be synced to Hugging Face Dataset: rlawlsgus/qwen2-7b-instruct-trl-sft-ChartQA-trackio-dataset
* Found existing space: https://huggingface.co/spaces/rlawlsgus/qwen2-7b-instruct-trl-sft-ChartQA-trackio
* View dashboard by going to: https://huggingface.co/spaces/rlawlsgus/qwen2-7b-instruct-trl-sft-ChartQA-trackio
* Created new run: qwen2-7b-instruct-trl-sft-Action


<trackio.run.Run at 0x7fe88d5aef60>

In [15]:
# 1. When initializing SFTTrainer, pass the original peft_config *object* unchanged.
# NOTE(review): `model` was already passed through get_peft_model() in the LoRA cell;
# confirm that also passing `peft_config` here does not set the adapters up a second time.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_config,
    processing_class=processor,
)

# 2. Before calling trainer.train(), replace the trainer's peft_config attribute with a
# plain dict for logging purposes, so that initialization sees the object while logging
# sees a dict.
# NOTE(review): this relies on a trainer attribute not shown here; the hasattr guard
# makes it a no-op when the attribute is absent -- verify it is still needed.
if hasattr(trainer, "peft_config"):
    trainer.peft_config = trainer.peft_config.to_dict()

# 3. Start training.
trainer.train()

# Save the result into the configured output directory.
trainer.save_model(training_args.output_dir)

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


* Trackio project initialized: huggingface
* Trackio metrics logged to: /home/rlawlsgus/.cache/huggingface/trackio
* View dashboard by running in your terminal:
[1m[93mtrackio show --project "huggingface"[0m
* or by running in Python: trackio.show(project="huggingface")
* Created new run: dainty-sunset-0


Step,Training Loss,Validation Loss


Bad pipe message: %s [b'0166 HTTP/1.1\r\nHost: 127.0.0.1:42851\r\nConnection: keep-a', b've\r\nUpgrade-Insecure-Requests: 1\r\nUser-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ', b'HTML, like Gecko) Code/1.104.1 Chrome/1', b'.0.7204.235 Electron/37.3.1 Safari/537.36\r\nAccept:']
Bad pipe message: %s [b'ext/html,application/xhtml+xml,']
Bad pipe message: %s [b'plication/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;']


KeyboardInterrupt: 