In [1]:
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from peft import LoraConfig, get_peft_model
from trl import SFTConfig, SFTTrainer
import torch

# Single source of truth for the checkpoint, so the model weights and the
# processor are always loaded from the same repository.
MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"

# Load the vision-language model in bfloat16 with the FlashAttention-2
# attention implementation, placed on the CUDA device.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda",
)

# Per-image pixel budget handed to the processor.
# NOTE(review): these values mirror Qwen's published usage example, where the
# 28*28 factor is described as the pixel area behind one visual token (so the
# range is roughly 256-1280 visual tokens per image) — confirm against the
# model card before tuning.
min_pixels = 256 * 28 * 28
max_pixels = 1280 * 28 * 28
processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    min_pixels=min_pixels,
    max_pixels=max_pixels,
)

# Example single-turn conversation: one remote image plus a text instruction.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
            },
            {"type": "text", "text": "Describe this image."},
        ],
    }
]


ImportError: cannot import name 'AdamW' from 'transformers' (/workspace/.venv/lib/python3.11/site-packages/transformers/__init__.py)

In [5]:
def _has_value(value):
    """Return True when ``value`` is neither missing (None/NaN/NA) nor empty."""
    if value is None:
        return False
    # Only probe scalars: pd.isna on a list/array returns an array, which has
    # no single truth value.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return False
    return bool(value)


def format_data(sample):
    """
    Convert one dataset row into a Qwen chat-style conversation.

    Args:
        sample: Mapping-like row supporting ``sample[key]`` and
            ``sample.get(key, default)``. Keys read: 'task', 'input_type',
            'input', 'output' and, optionally, 'question'.

    Returns:
        A list of three message dicts (system, user, assistant). The user
        message holds an image node (only when 'input_type' is 'image' and
        'input' has a usable value) followed by a text node with the prompt.

    Raises:
        KeyError: If 'task', 'input_type' or 'output' is absent, or if 'input'
            is absent on a row whose 'input_type' is 'text'.
    """
    system_prompt = "You are a helpful multimodal assistant. Your task is to follow the user's instructions carefully and provide an accurate response based on the provided image and/or text."

    task = sample['task']
    question = sample.get('question', '')
    input_type = sample['input_type']
    context = sample['input'] if input_type == 'text' else ''

    # Build the user prompt from whichever parts are actually present. Context
    # and question use the same missing-value check so that a NaN/None cell
    # never leaks into the prompt as the literal text "nan"/"None".
    prompt_lines = [f"Task: {task}"]
    if _has_value(context):
        prompt_lines.append(f"Context: {context}")
    if _has_value(question):
        prompt_lines.append(f"Question: {question}")
    user_prompt = "\n".join(prompt_lines).strip()

    # For image rows the 'input' value (e.g. a URL) is passed through as-is.
    # A missing key or a missing/empty value simply means "no image".
    image = sample.get('input') if input_type == 'image' else None
    if not _has_value(image):
        image = None

    # Only emit an image node when there is an image to show.
    user_content = []
    if image is not None:
        user_content.append({"type": "image", "image": image})
    user_content.append({"type": "text", "text": user_prompt})

    return [
        {"role": "system", "content": [{"type": "text", "text": system_prompt}]},
        {"role": "user", "content": user_content},
        {
            "role": "assistant",
            "content": [{"type": "text", "text": sample["output"]}],
        },
    ]


In [6]:
# NOTE(review): the recorded run of this cell failed with
# "type object 'Dataset' has no attribute 'from_pandas'", which suggests the
# `Dataset` name in scope is not `datasets.Dataset` (e.g. it was shadowed by
# another import) — confirm the import cell before re-running.
hf_dataset = Dataset.from_pandas(df)

# Replace every source column with a single `formatted_data` column holding
# the chat messages. Dropping `hf_dataset.column_names` (rather than
# `df.columns`) also removes the `__index_level_0__` column that `from_pandas`
# adds when the DataFrame carries a non-default index.
formatted_dataset = hf_dataset.map(
    lambda sample: {'formatted_data': format_data(sample)},
    remove_columns=hf_dataset.column_names,
)

# Hold out 20% of the rows for evaluation (example split). The fixed seed makes
# the split identical across kernel restarts.
train_test_split = formatted_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']


AttributeError: type object 'Dataset' has no attribute 'from_pandas'

In [None]:
# --- 6. Training arguments (SFTConfig) ---
training_args = SFTConfig(
    output_dir="qwen_vl_finetuned",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    optim="adamw_torch_fused",
    learning_rate=2e-4,
    lr_scheduler_type="constant",
    logging_steps=10,
    # Evaluate and checkpoint on the same step interval, so the checkpoint
    # with the lowest eval loss can be reloaded when training ends.
    eval_strategy="steps",
    eval_steps=20,
    save_strategy="steps",
    save_steps=20,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,
    bf16=True,
    report_to="wandb",
    push_to_hub=False,
    remove_unused_columns=False,  # keep the `formatted_data` column; it must survive to the collator
    dataset_text_field="formatted_data",  # tells TRL which column holds the formatted data
    # `formatted_data` holds chat-message lists rather than plain text and is
    # presumably consumed by the custom data collator, so SFTTrainer's built-in
    # dataset preparation (tokenizing the text field) is skipped, following the
    # TRL recipe for vision-language models.
    dataset_kwargs={"skip_prepare_dataset": True},
    # max_seq_length=2048,  # optionally cap the sequence length
)


# --- 7. Build the SFTTrainer ---
# Model and adapter settings first, then the data pipeline, then the run config.
trainer = SFTTrainer(
    model=model,
    peft_config=peft_config,
    tokenizer=processor.tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=collate_fn,
    args=training_args,
)

# --- 8. Run fine-tuning ---
print("\n파인튜닝을 시작합니다...")
trainer.train()
print("학습 완료!")

# --- 9. Persist the model and the processor side by side ---
# Both are written to the same output directory.
print("최적의 모델을 저장합니다...")
trainer.save_model(training_args.output_dir)
processor.save_pretrained(training_args.output_dir)
print(f"모델과 프로세서가 '{training_args.output_dir}'에 저장되었습니다.")

# Close the Weights & Biases run.
wandb.finish()