In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

### 学术加速

如果是在 AutoDL 租的服务器，下载模型前运行下面 cell 以获取学术加速

In [3]:
import subprocess
import os

result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
output = result.stdout
for line in output.splitlines():
    if '=' in line:
        var, value = line.split('=', 1)
        os.environ[var] = value

In [4]:
# 查看 GPU
!nvidia-smi

Thu Mar 20 23:29:09 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.14              Driver Version: 550.54.14      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA L20                     On  |   00000000:71:01.0 Off |                  Off |
| N/A   39C    P8             36W /  350W |       0MiB /  49140MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

## Qwen2.5-VL-finetune

In [5]:
import torch

MAX_LENGTH = 8192
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
DEVICE

'cuda'

### Download model

In [6]:
model_id_3b = 'Qwen/Qwen2.5-VL-3B-Instruct-AWQ'
model_id_7b = 'Qwen/Qwen2.5-VL-7B-Instruct-AWQ'
save_dir_3b = './model/base/vl3b/'  # change to your save path
save_dir_7b = './model/base/vl7b/' 

In [7]:
from huggingface_hub import snapshot_download

# snapshot_download(repo_id='Qwen/Qwen2.5-VL-3B-Instruct-AWQ', local_dir='./vl3b/', local_dir_use_symlinks=False)
snapshot_download(repo_id=model_id_7b, local_dir=save_dir_7b, local_dir_use_symlinks=False)

For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


Fetching 16 files:   0%|          | 0/16 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:  70%|######9   | 2.06G/2.94G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:  59%|#####8    | 2.35G/3.98G [00:00<?, ?B/s]

'/root/autodl-tmp/HOCR/finetune/model/base/vl7b'

In [9]:
import transformers
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, AutoTokenizer
from qwen_vl_utils import process_vision_info


model_7b = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    save_dir_7b, torch_dtype=torch.float16, 
    attn_implementation="flash_attention_2", device_map='auto',
)
model_7b.to(DEVICE)
tokenizer = AutoTokenizer.from_pretrained(save_dir_7b)
processor = AutoProcessor.from_pretrained(save_dir_7b)

model_7b.enable_input_require_grads()   # 开启梯度检查点时(training_args.gradient_checkpointing=True,)要执行该方法

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
print(model_7b)

Qwen2_5_VLForConditionalGeneration(
  (visual): Qwen2_5_VisionTransformerPretrainedModel(
    (patch_embed): Qwen2_5_VisionPatchEmbed(
      (proj): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
    )
    (rotary_pos_emb): Qwen2_5_VisionRotaryEmbedding()
    (blocks): ModuleList(
      (0-31): 32 x Qwen2_5_VLVisionBlock(
        (norm1): Qwen2RMSNorm((1280,), eps=1e-06)
        (norm2): Qwen2RMSNorm((1280,), eps=1e-06)
        (attn): Qwen2_5_VLVisionFlashAttention2(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (mlp): Qwen2_5_VLMLP(
          (gate_proj): Linear(in_features=1280, out_features=3420, bias=True)
          (up_proj): Linear(in_features=1280, out_features=3420, bias=True)
          (down_proj): Linear(in_features=3420, out_features=1280, bias=True)
          (act_fn): SiLU()
        )
      )
    )
    (merger): Qwen2_5_VLPatchMerg

### 数据集加载、划分、映射

In [11]:
def preprocess_func(example):
    # print(f'example.ids: {example,keys()}'
    input_ids, attention_mask, labels = [], [], []
    url = example["message"][0]["conversation"][0]['url']
    caption = example["message"][0]["conversation"][1]['caption']
    
    messages = [
        {
            "role": "system", 
            "content": "You are a helpful assistant in recognize math equations in either handwritten or printed text."
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "text", "text": "Recognize the equation in the image, write its LaTeX code between $$\n and \n$$"
                },
                {
                    "type": "image",
                    "image": url,
                    "resized_height": 280,
                    "resized_width": 280,
                },
            ]
        },
        {
            "role": "assistant",
            "content": [
                {
                    "type": "text",
                    "text": caption
                }
            ]
        }
    ]

    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    img_inputs, _ = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=img_inputs,
        padding=True,
        return_tensors='pt'
    )
    inputs = {key: value.tolist() for key, value in inputs.items()}
    instruction = inputs
    response = tokenizer(f'{caption}', add_special_tokens=False)
    input_ids = (
        instruction["input_ids"][0] + response['input_ids'] + [tokenizer.pad_token_id]
    )
    attention_mask = instruction['attention_mask'][0] + response['attention_mask'] + [1]
    labels = (
        [-100] * len(instruction['input_ids'][0])
        + response['input_ids']
        + [tokenizer.pad_token_id]
    )

    if len(input_ids) > MAX_LENGTH:
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    
    input_ids = torch.tensor(input_ids)
    attention_mask = torch.tensor(attention_mask)
    labels = torch.tensor(labels)

    inputs['pixel_values'] = torch.tensor(inputs['pixel_values'])
    # 由 (1, h, w) 变换为 (h, w)
    inputs['image_grid_thw'] = torch.tensor(inputs['image_grid_thw']).squeeze(0)  
    return {
        "input_ids": input_ids, 
        "attention_mask": attention_mask, 
        "labels": labels,
        "pixel_values": inputs['pixel_values'], 
        "image_grid_thw": inputs['image_grid_thw']
    }

In [12]:
""" 数据集准备 """
import json
import random
from datasets import load_dataset, Dataset

dataset_dir = 'data/ft_data.json'
dataset = load_dataset('json', data_files=dataset_dir)  # load dataset
dataset = dataset['train'].train_test_split(test_size=0.15, shuffle=True, seed=5525)  # split
dataset.save_to_disk('data/ft_dataset')  # save dataset

training_dataset, test_dataset = dataset['train'], dataset['test']
training_dataset = training_dataset.map(preprocess_func)  # mapping training dataset

training_dataset, test_dataset

Saving the dataset (0/1 shards):   0%|          | 0/1019 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/180 [00:00<?, ? examples/s]

(Dataset({
     features: ['message', 'input_ids', 'attention_mask', 'labels', 'pixel_values', 'image_grid_thw'],
     num_rows: 1019
 }),
 Dataset({
     features: ['message'],
     num_rows: 180
 }))

### 配置 swanlab

如果想记录下训练可视化数据，可以用 swanlab，没有在下面 Trainer 里把 `callbacks=[swanlab_callback],  # 没有注释即可` 注释即可，同时设置 `test_results` 函数的 `return_list=False`

In [13]:
import swanlab
from swanlab.integration.transformers import SwanLabCallback

# 设置SwanLab回调
swanlab_callback = SwanLabCallback(
    project="Qwen2.5-VL-7b-finetune-with-test",
    experiment_name="qwen2.5-vl-crohme2019",
    config={
        "model": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct-AWQ",
        "dataset": "https://disk.pku.edu.cn/anyshare/en-us/link/AAF10CCC4D539543F68847A9010C607139?_tb=none&expires_at=1970-01-01T08%3A00%3A00%2B08%3A00&item_type=&password_required=false&title=HMER%20Dataset&type=anonymous",
        "github": "https://github.com/Wooonster/HOCR",
        "prompt": "Recognize the equation in the image, write its LaTeX code bettwen $$\t and \t$$",
        "train_data_number": len(training_dataset),
        "lora_rank": 8,
        "lora_alpha": 16,
        "lora_dropout": 0.1,
    },
)

### 预测、测试函数

In [14]:
from test_funcs import compute_bleu, compute_exprate
from collections import defaultdict

In [15]:
def predict(messages, model):
    # 准备推理
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to(model.device)

    # 生成输出
    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [
        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )

    return output_text[0]

def test_results(dataset, model, return_list=False):
    # test_dataset = dataset['test']
    test_outputs = []
    swan_list = []
    
    for item in dataset:
        url = item["message"][0]["conversation"][0]['url']
        caption = item["message"][0]["conversation"][1]['caption']
        # Create the conversation prompt
        messages = [
            {
                "role": "system", 
                "content": "You are a helpful assistant in recognizing math equations in either handwritten or printed text."
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Recognize the equation in the image, directly write its LaTeX code between `<start_latex>` and `<end_latex>` without other words."
                    },
                    {
                        "type": "image",
                        "image": url,
                        "resized_height": 280,
                        "resized_width": 280,
                    },
                ]
            }
        ]
    
        # Generate a prediction using the model
        response = predict(messages, model)
        # Save the prediction keyed by image URL
        test_outputs.append([url, response, caption])
        swan_list.append(swanlab.Image(url, caption=response))
        
    # Compute evaluation metrics
    compute_bleu(test_outputs)
    compute_exprate(test_outputs)
    
    return swan_list if return_list else None

In [16]:
test_results(test_dataset, model_7b, return_list=False)

  return F.conv3d(


--------------------------------------------------Computing BLEU--------------------------------------------------
['b^xa^{y+n}===b^{x}a^{y+n}', '\\left[\\begin{array}{cc}\n\\overline{a}+i\\frac{\\beta}{2}&b+i\\frac{\\beta}{2}\n\\end{array}\\right]===[a+i\\frac{\\beta}{2},b+i\\frac{\\beta}{2}]', 'g+\\lambda+n=(n-1)+\\lambda+n=2n===g+1+n=(n-1)+1+n=2n', '\\sum_{r=1}^{r_{\\max}}\\frac{1}{r}===\\sum\\limits_{r=1}^{r_{max}}\\frac{1}{r}', '\\lambda=\\Lambda\\div\\lambda_0===l=1\\div10', '\\inte=0===\\intb=0', '\\frac{3}{xP(x)}===\\frac{3}{xP(x)}', 'x_1+ix_2=(x_1+ix_2)+\\lambda===x_{1}+ix_{2}=(x_{1}+ix_{2})+1', '\\sinL_t===\\sinLt', '\\Deltay=q_yx===xy=qyx', '\\frac{6}{\\sqrt{360}}===\\frac{6}{\\sqrt{360}}', '\\frac{b}{a_b}===b_{ab}n^{a}n^{b}', 'y^2=x^2(x+a)===y^{2}=x^{2}(x+a)', '\\frac{M}{\\sqrt{\\epsilon}}===M\\rightarrow\\frac{M}{\\sqrt{c}}', '\\frac{1}{64}(m+2)(3m^2+22m+40)===\\frac{1}{64}(n+2)(3n^{2}+22n+40)', '\\sin^2(t-t_0)===\\sin^{n}(t-t_{0})', '3\\timesn===3\\timesn', 'x^6-x^9===x^{

### 配置 LoRA 参数

In [22]:
from peft import LoraConfig, TaskType, get_peft_model, PeftModel

# 配置LoRA
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    inference_mode=False,  # 训练模式
    r=8,  # Lora 秩
    lora_alpha=16,
    lora_dropout=0.1,  # Dropout 比例
    bias="none",
)

# 获取LoRA模型
peft_model_7b = get_peft_model(model_7b, config)
peft_model_7b.enable_input_require_grads()

### 配置预训练参数

In [18]:
from transformers import (
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq,
    Qwen2VLForConditionalGeneration,
    AutoProcessor,
)

# 配置训练参数
training_args = TrainingArguments(
    output_dir="./model/output/Qwen2.5-VL-7B-ft/",
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=4,  # total batch size = per_device_train_batch_size * gradient_accumulation_steps
    gradient_accumulation_steps=4,
    logging_strategy='steps',
    logging_steps=25,
    logging_first_step=True,
    num_train_epochs=3,
    # eval_strategy='epoch',
    save_strategy='epoch',
    learning_rate=1e-4,
    lr_scheduler_type='cosine',
    save_on_each_node=True,
    gradient_checkpointing=True,
    report_to="none",
    # load_best_model_at_end=True,
)

开始训练，并且稳定后，在终端 `watch -n 1 nvidia-smi` 观测一下 GPU 显存占用情况，可以记录一下

In [19]:
# 配置Trainer
trainer = Trainer(
    model=peft_model_7b,
    args=training_args,
    train_dataset=training_dataset,
    eval_dataset=test_dataset,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
    callbacks=[swanlab_callback],  # 没有注释即可
)

# 开启模型训练
trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


[1m[34mswanlab[0m[0m: Tracking run with swanlab version 0.4.8                                   
[1m[34mswanlab[0m[0m: Run data will be saved locally in [35m[1m/root/autodl-tmp/HOCR/finetune/swanlog/run-20250320_234301-a3b1799d[0m[0m
[1m[34mswanlab[0m[0m: 👋 Hi [1m[39mWonster[0m[0m, welcome to swanlab!
[1m[34mswanlab[0m[0m: Syncing run [33mqwen2.5-vl-crohme2019[0m to the cloud
[1m[34mswanlab[0m[0m: 🌟 Run `[1mswanlab watch /root/autodl-tmp/HOCR/finetune/swanlog[0m` to view SwanLab Experiment Dashboard locally
[1m[34mswanlab[0m[0m: 🏠 View project at [34m[4mhttps://swanlab.cn/@Wonster/Qwen2.5-VL-7b-finetune-with-test[0m[0m
[1m[34mswanlab[0m[0m: 🚀 View run at [34m[4mhttps://swanlab.cn/@Wonster/Qwen2.5-VL-7b-finetune-with-test/runs/oxj92mh6897rqo10054nr[0m[0m


  return F.conv3d(
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
1,1.3795
25,0.1051
50,0.0
75,0.0
100,0.0
125,0.0
150,0.0
175,0.0




TrainOutput(global_step=189, training_loss=0.02066893583005319, metrics={'train_runtime': 1564.9777, 'train_samples_per_second': 1.953, 'train_steps_per_second': 0.121, 'total_flos': 5364319648972800.0, 'train_loss': 0.02066893583005319, 'epoch': 2.9568627450980394})

In [26]:
from peft import LoraConfig, TaskType, get_peft_model, PeftModel

# 配置测试参数
val_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    inference_mode=True,  # 训练模式
    r=8,  # Lora 秩
    lora_alpha=16,
    lora_dropout=0.1,  # Dropout 比例
    bias="none",
)

# 获取测试模型
val_peft_model = PeftModel.from_pretrained(
    model_7b, 
    model_id="./model/output/Qwen2.5-VL-7B-ft/checkpoint-189/",  # 换到自己微调好的模型 ckpt 路径
    config=val_config,
    # local_files_only=True
)

In [27]:
swan_list = test_results(test_dataset, val_peft_model, return_list=True)  # 没有 swanlab, return_list=False
if swan_list is not None:
    swanlab.log({"Prediction": swan_list})
    # 在 Jupyter Notebook 中运行时要停止SwanLab记录 需要调用swanlab.finish()
    swanlab.finish()

--------------------------------------------------Computing BLEU--------------------------------------------------
['$$b^{x}a^{y+n}$$===b^{x}a^{y+n}', '$$\\left[\\begin{array}{l}{-a+i\\frac{\\beta}{2}}\\\\{b+i\\frac{\\beta}{2}}\\end{array}\\right]$$===[a+i\\frac{\\beta}{2},b+i\\frac{\\beta}{2}]', '$$g+\\lambda+n=(n-1)+\\lambda+n=2n$$===g+1+n=(n-1)+1+n=2n', '\\sum_{r=1}^{r_{\\max}}\\frac{1}{r}===\\sum\\limits_{r=1}^{r_{max}}\\frac{1}{r}', '\\lambda=\\Lambda\\div\\lambda_0===l=1\\div10', '$$\\inte=0$$===\\intb=0', '$$\\frac{3}{xP(x)}$$===\\frac{3}{xP(x)}', 'x_{1}+ix_{2}=\\left(x_{1}+ix_{2}\\right)+i===x_{1}+ix_{2}=(x_{1}+ix_{2})+1', '$$\\sinLt$$===\\sinLt', '\\Deltay=q_y\\Deltax===xy=qyx', '$$\\frac{6}{\\sqrt{360}}$$===\\frac{6}{\\sqrt{360}}', '$$b_{ab}^{mn}$$===b_{ab}n^{a}n^{b}', '$$y^{2}=x^{2}(x+a)$$===y^{2}=x^{2}(x+a)', '$$\\vec{M}\\rightarrow\\frac{M}{V}$$===M\\rightarrow\\frac{M}{\\sqrt{c}}', '$$\\frac{1}{64}(m+2)(3m^{2}+22m+40)$$===\\frac{1}{64}(n+2)(3n^{2}+22n+40)', '\\sin^2(t-t_0