## Install

In [None]:
!pip install -q qwen_vl_utils decord librosa icecream

In [None]:
!pip install --upgrade --no-cache-dir ms-swift -U

In [None]:
!pip install transformers==4.57.0

In [None]:
# Report whether unsloth is present in the runtime.
# NOTE(review): no visible cell installs or imports unsloth — confirm this check is still needed.
!pip show unsloth

In [None]:
!pip show ms-swift
!pip show transformers
!pip show flash-attn
!pip show torch
!pip show deepspeed
!pip show liger-kernel
!pip show hf_transfer
!pip show qwen_vl_utils
!pip show decord
!pip show librosa

## Utils

#### **Hugging Face Hub**

In [None]:
from google.colab import userdata
from huggingface_hub import login

# Read the token from the Colab secret named 'HF_TOKEN' so it is never
# written into the notebook, then authenticate this session with the Hub.
HF_TOKEN = userdata.get('HF_TOKEN')
login(token=HF_TOKEN)

#### Free Memory

In [None]:
import gc
import torch
import time

def flush(rounds=10, delay=0.1):
  """Release Python garbage and cached CUDA memory.

  Runs one garbage collection, empties the CUDA cache and resets the CUDA
  peak-memory statistics, then repeats collect / empty-cache `rounds` times,
  sleeping `delay` seconds after each pass.

  Every CUDA call is skipped when no GPU is available, so the helper can be
  called on a CPU-only runtime without raising.

  Args:
    rounds: Number of additional collect / empty-cache passes (default 10).
    delay: Seconds to sleep after each pass (default 0.1).
  """
  has_cuda = torch.cuda.is_available()

  gc.collect()
  if has_cuda:
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

  for _ in range(rounds):
    gc.collect()
    if has_cuda:
      # No autograd context is needed here: emptying the cache does not
      # record any operations.
      torch.cuda.empty_cache()
    time.sleep(delay)

## Setup

In [None]:
# Show the attached GPU, driver / CUDA version and current memory usage before training.
!nvidia-smi

Sun Dec  7 21:40:12 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          Off |   00000000:00:05.0 Off |                    0 |
| N/A   35C    P0             56W /  400W |       0MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
# Enter the project checkout; the git and editable-install cells that follow run from here.
# NOTE(review): no visible cell clones `quantum-assistant` — assumes the repo already exists in the working directory.
%cd quantum-assistant

/content/quantum-assistant


In [None]:
# Work from the `dev` branch and pull its latest remote commits.
!git switch dev
!git pull

In [None]:
!pip install -e .

In [None]:
# Download the git-xet installer script from the huggingface/xet-core repository over HTTPS
# (TLS 1.2 or newer), run it with sh, then register the extension with git.
# NOTE(review): the script comes from the moving `main` ref and is piped straight into sh —
# consider pinning a tag or commit.
!curl --proto '=https' --tlsv1.2 -sSf https://raw.githubusercontent.com/huggingface/xet-core/refs/heads/main/git_xet/install.sh | sh
!git xet install

## Data

In [None]:
# Convert the Hub dataset into ms-swift files under /content/swift_data; the training
# cells read train.jsonl and validation.jsonl from that directory.
# NOTE(review): `finetune` is presumably the CLI provided by the editable project install — confirm.
!finetune prepare --hub-id samuellimabraz/quantum-assistant --output-dir /content/swift_data

╭──────────────────────────────╮
│ ms-swift Dataset Preparation │
╰──────────────────────────────╯
 Dataset (Hub)     samuellimabraz/quantum-assistant
 Output directory  /content/swift_data
 Image max size    640
 Image format      JPEG
 System prompt     Yes

Loading dataset from HuggingFace Hub: samuellimabraz/quantum-assistant
README.md: 100% 767/767 [00:00<00:00, 7.07MB/s]
data/train-00000-of-00001.parquet: 100% 222M/222M [00:01<00:00, 136MB/s]
data/vali

## SwiftSft

In [None]:
import torch

In [None]:
import os

# Export the runtime switches first so they are already in the environment
# when the swift imports below execute.
os.environ.update({
    'CUDA_VISIBLE_DEVICES': '0',
    'USE_HF': '1',
    'MAX_PIXELS': str(1280 * 28 * 28),
})

from swift.utils import get_logger, get_model_parameter_info, plot_images, seed_everything
from swift.llm.train.sft import SwiftSft
from swift.llm import TrainArguments
from transformers import EarlyStoppingCallback

logger = get_logger()

# Seed with 42 (swift logs "Global seed set to 42"); the value returned by
# the call is what the cell displays.
seed_everything(42)

[INFO:swift] Successfully registered `/usr/local/lib/python3.12/dist-packages/swift/llm/dataset/data/dataset_info.json`.
[INFO:swift] Global seed set to 42


42

In [None]:
# Sanity check: confirm PyTorch can see a CUDA device before building the trainer.
torch.cuda.is_available()

True

In [None]:
# --- Run identity -------------------------------------------------------------
model_id_or_path = "Qwen/Qwen3-VL-2B-Instruct"

# Adapter size lives in one place so the run name cannot drift from the
# values actually passed to TrainArguments.
LORA_RANK = 8
LORA_ALPHA = 16

# Pick the mixed-precision mode once and reuse it for the model dtype, the
# training flags and the full-eval flags, so the three can never disagree.
use_bf16 = torch.cuda.is_bf16_supported()
precision_tag = 'bf16' if use_bf16 else 'fp16'

# On bf16-capable hardware with rank 8 this yields the same name as before:
# "<model>-r8-rslora-bf16-tuned".
model_name = f"{model_id_or_path.split('/')[-1]}-r{LORA_RANK}-rslora-{precision_tag}-tuned"
output_dir = os.path.join(os.getcwd(), "train", model_name)

logger.info(f'output_dir: {output_dir}')

args = TrainArguments(
    model=model_id_or_path,
    model_name=model_name,
    model_author="samuellimabraz",
    dataset=['/content/swift_data/train.jsonl'],
    val_dataset=['/content/swift_data/validation.jsonl'],
    load_from_cache_file=True,
    torch_dtype='bfloat16' if use_bf16 else 'float16',
    max_pixels=1280 * 28 * 28,
    attn_impl='flash_attn', #'sdpa'
    # NOTE(review): left padding is usually an inference-time setting; with
    # padding_free=True it is presumably irrelevant for training — confirm.
    padding_side='left',
    padding_free=True,
    # packing=True,
    lazy_tokenize=False,
    #max_length=8192,
    max_new_tokens=4096,
    temperature=0.,

    # LoRA
    train_type='lora',
    lora_rank=LORA_RANK,
    lora_alpha=LORA_ALPHA,
    lora_dropout=0.05,
    target_modules='all-linear',
    init_weights='true',
    use_rslora=True,
    freeze_llm=False,
    freeze_vit=True,
    freeze_aligner=True,

    # Train
    output_dir=output_dir,
    learning_rate=2e-4,
    #auto_find_batch_size=True,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    torch_empty_cache_steps=50,
    gradient_checkpointing=True,
    weight_decay=0.01,
    # Only warmup_steps is passed: when both are given, transformers lets
    # warmup_steps override warmup_ratio, so the former warmup_ratio=0.05 had
    # no effect (the logged learning rate peaks at step 10).
    warmup_steps=10,
    num_train_epochs=1,
    lr_scheduler_type='cosine',
    optim="adamw_torch",
    # neftune_noise_alpha=10,
    logging_first_step=True,
    logging_steps=5,
    logging_strategy="steps",
    fp16=not use_bf16,
    bf16=use_bf16,

    # Eval
    # Full-precision-eval flag follows the training precision instead of
    # always forcing fp16.
    fp16_full_eval=not use_bf16,
    bf16_full_eval=use_bf16,
    per_device_eval_batch_size=8,
    eval_accumulation_steps=4,
    eval_strategy="steps",
    eval_steps=20,
    metric_for_best_model="eval_loss",
    load_best_model_at_end=True,

    save_strategy='best',
    save_total_limit=5,
    dataloader_num_workers=4,
    remove_unused_columns=False,
    data_seed=42,
    report_to=["wandb", "tensorboard"],

    use_hf=True,
    push_to_hub=True,
    hub_private_repo=True,
    hub_strategy="end",
    hub_model_id=f"samuellimabraz/{model_name}",
    project="quantum-assistant",
    run_name=model_name
)
sft = SwiftSft(args)

# Stop once eval_loss has failed to improve by at least 0.001 for five
# consecutive evaluations (one evaluation every 20 steps).
sft.callbacks.append(
    EarlyStoppingCallback(
        early_stopping_patience=5,
        early_stopping_threshold=0.001
    )
)

In [None]:
# Route the Weights & Biases run to the "quantum-assistant" project.
# NOTE(review): this is set after the trainer was constructed — presumably it is still
# read when wandb initialises at the start of training; confirm.
os.environ["WANDB_PROJECT"] = "quantum-assistant"

In [None]:
# Launch fine-tuning with the `sft` object built earlier and keep the return value in `result`.
# NOTE(review): the captured log for this cell names a "Qwen3-VL-4B-Instruct-r4-..." run,
# so it appears to come from a different configuration than the cell that builds `args`.
result = sft.main()

[INFO:swift] Start time of running main: 2025-12-07 18:57:35.248979
[INFO:swift] swift.__version__: 3.10.3
[INFO:swift] SelfCognitionPreprocessor has been successfully configured with name: ('Qwen3-VL-4B-Instruct-r4-rslora-bf16-tuned', 'Qwen3-VL-4B-Instruct-r4-rslora-bf16-tuned'), author: ('samuellimabraz', 'samuellimabraz').


Map:   0%|          | 0/5837 [00:00<?, ? examples/s]

Map:   0%|          | 0/1239 [00:00<?, ? examples/s]

[INFO:swift] train_dataset: Dataset({
    features: ['messages', 'images'],
    num_rows: 5837
})
[INFO:swift] val_dataset: Dataset({
    features: ['messages', 'images'],
    num_rows: 1239
})
[INFO:swift] [INPUT_IDS] [151644, 8948, 198, 2610, 525, 264, 30128, 24231, 6203, 17847, 57294, 304, 1207, 3187, 275, 13, 39565, 13382, 11, 2797, 11, 323, 1632, 12, 51143, 14507, 911, 30128, 24231, 18940, 11, 25185, 11, 323, 2038, 8129, 13, 5443, 1207, 3187, 275, 220, 17, 13, 15, 1850, 12378, 13, 151645, 198, 151644, 872, 198, 151652, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655

Train:   0%|          | 0/366 [00:00<?, ?it/s][INFO:swift] use_logits_to_keep: False
Train:   0%|          | 1/366 [01:05<6:35:27, 65.01s/it]

{'loss': 1.47564614, 'grad_norm': 3.08612037, 'learning_rate': 2e-05, 'token_acc': 0.75079031, 'epoch': 0.01, 'global_step/max_steps': '1/366', 'percentage': '0.27%', 'elapsed_time': '1m 5s', 'remaining_time': '6h 35m 29s', 'memory(GiB)': 32.71, 'train_speed(iter/s)': 0.015382}


Train:   1%|          | 2/366 [01:52<5:31:39, 54.67s/it]

{'loss': 1.64435577, 'grad_norm': 2.97596097, 'learning_rate': 4e-05, 'token_acc': 0.7217824, 'epoch': 0.01, 'global_step/max_steps': '2/366', 'percentage': '0.55%', 'elapsed_time': '1m 52s', 'remaining_time': '5h 41m 4s', 'memory(GiB)': 36.85, 'train_speed(iter/s)': 0.017787}


Train:   1%|          | 3/366 [03:02<6:14:36, 61.92s/it]

{'loss': 1.37458622, 'grad_norm': 2.69652987, 'learning_rate': 6e-05, 'token_acc': 0.75743661, 'epoch': 0.02, 'global_step/max_steps': '3/366', 'percentage': '0.82%', 'elapsed_time': '3m 2s', 'remaining_time': '6h 9m 1s', 'memory(GiB)': 60.25, 'train_speed(iter/s)': 0.016395}


Train:   1%|          | 4/366 [04:03<6:10:46, 61.45s/it]

{'loss': 1.22692871, 'grad_norm': 2.56138492, 'learning_rate': 8e-05, 'token_acc': 0.77798551, 'epoch': 0.02, 'global_step/max_steps': '4/366', 'percentage': '1.09%', 'elapsed_time': '4m 3s', 'remaining_time': '6h 7m 37s', 'memory(GiB)': 60.25, 'train_speed(iter/s)': 0.016411}


Train:   1%|‚ñè         | 5/366 [05:27<6:57:23, 69.37s/it]

{'loss': 1.22136486, 'grad_norm': 1.9648397, 'learning_rate': 0.0001, 'token_acc': 0.74672152, 'epoch': 0.03, 'global_step/max_steps': '5/366', 'percentage': '1.37%', 'elapsed_time': '5m 27s', 'remaining_time': '6h 33m 39s', 'memory(GiB)': 74.27, 'train_speed(iter/s)': 0.015284}


Train:   2%|‚ñè         | 6/366 [06:45<7:14:56, 72.49s/it]

{'loss': 1.15417612, 'grad_norm': 2.03546643, 'learning_rate': 0.00012, 'token_acc': 0.76683697, 'epoch': 0.03, 'global_step/max_steps': '6/366', 'percentage': '1.64%', 'elapsed_time': '6m 45s', 'remaining_time': '6h 45m 41s', 'memory(GiB)': 74.27, 'train_speed(iter/s)': 0.01479}


Train:   2%|‚ñè         | 7/366 [08:03<7:24:16, 74.25s/it]

{'loss': 1.19977474, 'grad_norm': 1.48850739, 'learning_rate': 0.00014, 'token_acc': 0.73864521, 'epoch': 0.04, 'global_step/max_steps': '7/366', 'percentage': '1.91%', 'elapsed_time': '8m 3s', 'remaining_time': '6h 53m 20s', 'memory(GiB)': 74.27, 'train_speed(iter/s)': 0.014476}


Train:   2%|‚ñè         | 8/366 [09:33<7:53:02, 79.28s/it]

{'loss': 1.14413464, 'grad_norm': 0.84120828, 'learning_rate': 0.00016, 'token_acc': 0.73426408, 'epoch': 0.04, 'global_step/max_steps': '8/366', 'percentage': '2.19%', 'elapsed_time': '9m 33s', 'remaining_time': '7h 7m 49s', 'memory(GiB)': 74.27, 'train_speed(iter/s)': 0.013947}


Train:   2%|‚ñè         | 9/366 [11:46<9:30:51, 95.94s/it]

{'loss': 1.07826519, 'grad_norm': 0.69512147, 'learning_rate': 0.00018, 'token_acc': 0.72084548, 'epoch': 0.05, 'global_step/max_steps': '9/366', 'percentage': '2.46%', 'elapsed_time': '11m 46s', 'remaining_time': '7h 46m 52s', 'memory(GiB)': 74.27, 'train_speed(iter/s)': 0.012744}


Train:   3%|‚ñé         | 10/366 [12:20<7:36:42, 76.97s/it]

{'loss': 1.0660578, 'grad_norm': 0.67294222, 'learning_rate': 0.0002, 'token_acc': 0.74028668, 'epoch': 0.05, 'global_step/max_steps': '10/366', 'percentage': '2.73%', 'elapsed_time': '12m 20s', 'remaining_time': '7h 19m 28s', 'memory(GiB)': 74.27, 'train_speed(iter/s)': 0.013501}


Train:   3%|‚ñé         | 11/366 [14:17<8:47:54, 89.22s/it]

{'loss': 1.12724876, 'grad_norm': 0.87927151, 'learning_rate': 0.0002, 'token_acc': 0.71587938, 'epoch': 0.06, 'global_step/max_steps': '11/366', 'percentage': '3.01%', 'elapsed_time': '14m 17s', 'remaining_time': '7h 41m 19s', 'memory(GiB)': 74.27, 'train_speed(iter/s)': 0.012825}


In [None]:
# Command-line variant of the fine-tuning run: LoRA (rank 8, alpha 16) on
# Qwen/Qwen3-VL-4B-Instruct for 5 epochs with SDPA attention, evaluating and
# saving every 100 steps and reporting to TensorBoard and W&B.
# Unlike the Python configuration in this notebook, the aligner is left
# trainable here (--freeze_aligner false).
# NOTE(review): the captured `run sh:` output for this cell shows different
# arguments (2B model, 1 epoch, rank 16, --max_length 4096), so it appears to
# be from an earlier version of this command.
!PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
IMAGE_MAX_TOKEN_NUM=1024 \
CUDA_VISIBLE_DEVICES=0 \
swift sft \
    --model Qwen/Qwen3-VL-4B-Instruct \
    --dataset '/content/swift_data/train.jsonl' \
    --val_dataset '/content/swift_data/validation.jsonl' \
    --load_from_cache_file true \
    --train_type lora \
    --torch_dtype bfloat16 \
    --num_train_epochs 5 \
    --per_device_train_batch_size 4 \
    --per_device_eval_batch_size 4 \
    --attn_impl sdpa \
    --learning_rate 2e-4 \
    --lr_scheduler_type cosine \
    --lora_rank 8 \
    --lora_alpha 16 \
    --target_modules all-linear \
    --freeze_vit true \
    --freeze_aligner false \
    --gradient_checkpointing true \
    --vit_gradient_checkpointing false \
    --gradient_accumulation_steps 2 \
    --eval_steps 100 \
    --save_steps 100 \
    --save_total_limit 2 \
    --logging_steps 5 \
    --output_dir output \
    --warmup_ratio 0.05 \
    --dataset_num_proc 4 \
    --dataloader_num_workers 4 \
    --report_to tensorboard wandb \
    --use_hf

run sh: `/usr/bin/python3 /usr/local/lib/python3.12/dist-packages/swift/cli/sft.py --model Qwen/Qwen3-VL-2B-Instruct --dataset /content/swift_data/train.jsonl --val_dataset /content/swift_data/validation.jsonl --load_from_cache_file true --train_type lora --torch_dtype bfloat16 --num_train_epochs 1 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --attn_impl sdpa --learning_rate 1e-4 --lora_rank 16 --lora_alpha 32 --target_modules all-linear --freeze_vit true --freeze_aligner true --gradient_checkpointing true --vit_gradient_checkpointing false --gradient_accumulation_steps 2 --eval_steps 100 --save_steps 100 --save_total_limit 2 --logging_steps 5 --max_length 4096 --output_dir output --warmup_ratio 0.05 --dataset_num_proc 4 --dataloader_num_workers 4 --report_to tensorboard wandb --use_hf`
2025-12-05 08:47:35.663359: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors