In [1]:
import os
import pandas as pd
from operator import index
import polars as pl
import torch
import numpy as np
from PIL import Image
import io
from tqdm import tqdm
import random
import logging
from sklearn.model_selection import train_test_split
from unsloth import FastVisionModel
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig
import pyarrow.parquet as pq

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
# set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

In [3]:
# datsaet path
DATASET_PATH = '/home/hh/math ocr/unified_dataset/unified_math_ocr_dataset.parquet'
OUTPUT_DIR = '/home/hh/math ocr/model_outputs'
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [4]:
# min_size=28
def bytes_to_image(byte_data):
    """Convert bytes to a PIL image object."""
    img = Image.open(io.BytesIO(byte_data))
    # width, height = img.size
    # if width < min_size or height < min_size:
    #     # Maintain aspect ratio while ensuring minimum dimensions
    #     new_width = max(min_size, width)
    #     new_height = max(min_size, height)
    #
    #     # Resize the image
    #     img = img.resize((new_width, new_height), Image.LANCZOS)
    # weight and height should be multiples of 28
    return img.resize((140, 210), Image.Resampling.LANCZOS)

In [5]:
"""Load and prepare the dataset."""
logger.info("load dataset...")
# parquet_file = pq.ParquetFile(DATASET_PATH)
# for i in range(parquet_file.num_row_groups):
#     df = parquet_file.read_row_group(i).to_pandas()
dl = pl.read_parquet(DATASET_PATH)
df= dl.to_pandas()
# df = pd.read_parquet(DATASET_PATH)
logger.info(f"The dataset has been successfully loaded，There are {len(df)} records in total.")

2025-03-22 22:39:57,363 - INFO - load dataset...
2025-03-22 22:40:00,735 - INFO - The dataset has been successfully loaded，There are 197024 records in total.


In [6]:
df.iloc[3]

image    {'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...
text     N H _ { 4 } C l + N a O H = N a C l + H _ { 2 ...
Name: 3, dtype: object

In [7]:
print(df.iloc[3]['text'])

N H _ { 4 } C l + N a O H = N a C l + H _ { 2 } O + N H _ { 3 } \uparrow


In [8]:
# Convert the dataset into the format required for fine-tuning.
converted_dataset = []

for _, row in tqdm(df.iterrows(), total=len(df), desc="convert dataset"):
    img = bytes_to_image(row['image']['bytes'])
    latex_code = row['text']

    # Create the converted data format
    conversation = {
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Write the LaTeX representation for this image."},
                    {"type": "image", "image": img}
                ]
            },
            {
                "role": "assistant",
                "content": [
                    {"type": "text", "text": latex_code}
                ]
            }
        ]
    }

    converted_dataset.append(conversation)

convert dataset: 100%|██████████| 197024/197024 [02:06<00:00, 1561.96it/s]


In [9]:
converted_dataset[3]

{'messages': [{'role': 'user',
   'content': [{'type': 'text',
     'text': 'Write the LaTeX representation for this image.'},
    {'type': 'image', 'image': <PIL.Image.Image image mode=L size=140x210>}]},
  {'role': 'assistant',
   'content': [{'type': 'text',
     'text': 'N H _ { 4 } C l + N a O H = N a C l + H _ { 2 } O + N H _ { 3 } \\uparrow'}]}]}

In [10]:
# split training set and test set (8:2)
train_size = int(len(converted_dataset) * 0.8)
train_dataset = converted_dataset[:train_size]
val_dataset = converted_dataset[train_size:]

logger.info(f"training set size: {len(train_dataset)}")
logger.info(f"validation set size: {len(val_dataset)}")

2025-03-22 22:42:07,025 - INFO - training set size: 157619
2025-03-22 22:42:07,026 - INFO - validation set size: 39405


In [11]:
logger.info("Start fine-tuning the Qwen2.5 Vision model...")
# 检查CUDA是否可用
if not torch.cuda.is_available():
    logger.error("CUDA is not available. Please ensure that your graphics card and drivers are properly configured.")

logger.info(f"Using GPU: {torch.cuda.get_device_name(0)}")
logger.info(f"CUDA Version: {torch.version.cuda}")


2025-03-22 22:42:07,074 - INFO - Start fine-tuning the Qwen2.5 Vision model...
2025-03-22 22:42:07,075 - INFO - Using GPU: NVIDIA GeForce RTX 4090
2025-03-22 22:42:07,075 - INFO - CUDA Version: 12.4


In [12]:
# 加载模型
model, tokenizer = FastVisionModel.from_pretrained(
    "unsloth/Qwen2.5-VL-3B-Instruct",
    load_in_4bit=True,  # Use 4bit to reduce memory use. False for 16bit LoRA.
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for long context
)

==((====))==  Unsloth 2025.3.17: Fast Qwen2 patching. Transformers: 4.49.0.
   \\   /|    NVIDIA GeForce RTX 4090. Num GPUs = 1. Max memory: 23.617 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [13]:
model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers     = True, # False if not finetuning vision layers
    finetune_language_layers   = True, # False if not finetuning language layers
    finetune_attention_modules = True, # False if not finetuning attention layers
    finetune_mlp_modules       = True, # False if not finetuning MLP layers

    r = 32,           # The larger, the higher the accuracy, but might overfit
    lora_alpha = 16,  # Recommended alpha == r at least
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
    # target_modules = "all-linear", # Optional now! Can specify a list if needed
)

In [14]:
from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig

FastVisionModel.for_training(model) # Enable for training!

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    data_collator = UnslothVisionDataCollator(model, tokenizer), # Must use!
    train_dataset = converted_dataset,
    args = SFTConfig(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 100,
        # num_train_epochs = 1, # Set this instead of max_steps for full training runs
        learning_rate = 2e-4,
        fp16 = not is_bf16_supported(),
        bf16 = is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "cosine",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none",     # For Weights and Biases

        # You MUST put the below items for vision finetuning:
        remove_unused_columns = False,
        dataset_text_field = "",
        dataset_kwargs = {"skip_prepare_dataset": True},
        dataset_num_proc = 4,
        max_seq_length = 4096,
        greater_is_better=False,
    ),
)

Unsloth: Model does not have a default image size - using 512


In [15]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 4090. Max memory = 23.617 GB.
3.783 GB of memory reserved.


In [16]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 197,024 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 82,169,856/3,000,000,000 (2.74% trained)


Step,Training Loss
1,2.8988
2,3.9447
3,2.9776
4,3.1669
5,2.7836
6,2.2098
7,2.0534
8,1.9107
9,1.6971
10,1.2806


Unsloth: Will smartly offload gradients to save VRAM!


In [17]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

114.1113 seconds used for training.
1.9 minutes used for training.
Peak reserved memory = 4.396 GB.
Peak reserved memory for training = 0.613 GB.
Peak reserved memory % of max memory = 18.614 %.
Peak reserved memory for training % of max memory = 2.596 %.


In [None]:
FastVisionModel.for_inference(model) # Enable for inference!
image = Image.open('/home/hh/math ocr/MLHME38K/train_images/train_5.jpg').resize((140, 210), Image.Resampling.LANCZOS)
instruction = r'''You are an AI assistant specialized in converting PDF images to Markdown format. Please follow these instructions for the conversion:

        1. Text Processing:
        - Accurately recognize all text content in the PDF image without guessing or inferring.
        - Convert the recognized text into Markdown format.
        - Maintain the original document structure, including headings, paragraphs, lists, etc.

        2. Mathematical Formula Processing:
        - Convert all mathematical formulas to LaTeX format.
        - Enclose inline formulas with \( \). For example: This is an inline formula \( E = mc^2 \)
        - Enclose block formulas with \\[ \\]. For example: \[ \frac{-b \pm \sqrt{b^2 - 4ac}}{2a} \]

        3. Table Processing:
        - Convert tables to HTML format.
        - Wrap the entire table with <table> and </table>.

        4. Figure Handling:
        - Ignore figures content in the PDF image. Do not attempt to describe or convert images.

        5. Output Format:
        - Ensure the output Markdown document has a clear structure with appropriate line breaks between elements.
        - For complex layouts, try to maintain the original document's structure and format as closely as possible.

        Please strictly follow these guidelines to ensure accuracy and consistency in the conversion. Your task is to accurately convert the content of the PDF image into Markdown format without adding any extra explanations or comments.
        '''

messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": instruction}
    ]}
]
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 1024,
                   use_cache = True, temperature = 1.0, min_p = 0.1)

In [19]:
model.save_pretrained("test_model") # Local saving
tokenizer.save_pretrained("test_model")

In [None]:
# doing quantization for the model, then we can use it in ollama platform
model.save_pretrained_gguf("model_path", tokenizer,)