<a href="https://colab.research.google.com/github/rolandtannous/unsloth_scratchpad/blob/main/nb/gemma3-vision-test/gemma3_vision_testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from unsloth import FastVisionModel

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
forward 1 working and being replaced
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 06-18 05:14:05 [__init__.py:244] Automatically detected platform cuda.


# Dataset Preparation

In [2]:
from datasets import load_dataset

# Load the "train" split of the 'en' configuration of the OCR dataset.
dataset = load_dataset("lbourdois/OCR-liboaccn-OPUS-MIT-5M-clean", 'en', split="train")
# Training subset: the first 2000 examples (indices 0-1999).
train_dataset = dataset.select(range(2000))

# Evaluation subset: the next 200 examples (indices 2000-2199), disjoint from the training subset.
eval_dataset = dataset.select(range(2000, 2200))

# Convert dataset to OAI messages       
def format_data(sample):
    """Turn one raw dataset row into a two-turn chat sample.

    The user turn carries the prompt text followed by the RGB-converted
    image; the assistant turn carries the reference answer. The global
    ``system_message`` is prepended to ``sample["question"]`` by plain string
    concatenation (no separator is inserted). It is looked up when the
    function is called, not when it is defined.

    Args:
        sample: Mapping with "question", "answer" and "image" entries; the
            image must expose ``convert``.

    Returns:
        ``{"messages": [user_turn, assistant_turn]}``.
    """
    prompt_part = {"type": "text", "text": system_message + sample["question"]}
    image_part = {"type": "image", "image": sample["image"].convert("RGB")}
    user_turn = {"role": "user", "content": [prompt_part, image_part]}

    answer_part = {"type": "text", "text": sample["answer"]}
    assistant_turn = {"role": "assistant", "content": [answer_part]}

    return {"messages": [user_turn, assistant_turn]}

# Prompt prefix that format_data concatenates in front of every question.
system_message = "You are an expert french ocr system."
# Convert both subsets to OAI-style message dicts.
# A list comprehension is used (rather than the dataset's .map) to keep the PIL.Image type; .map converts the image to bytes.
# NOTE: train_dataset / eval_dataset are rebound to plain lists here, so re-running only this cell would feed
# already-formatted samples back into format_data.
train_dataset = [format_data(sample) for sample in train_dataset]
eval_dataset = [format_data(sample) for sample in eval_dataset]

Resolving data files:   0%|          | 0/50 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/50 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/50 [00:00<?, ?it/s]

In [3]:
import os
import torch
from tqdm import tqdm
import pandas as pd
from jiwer import wer, cer
from typing import Any, List, Dict, Tuple, Optional
#import matplotlib.pyplot as plt
import traceback
from PIL import Image


class OCRModelEvaluator:
    """
    A comprehensive OCR model evaluator that supports Gemma3 and other vision-language models.
    """

    def __init__(self):
        """Initialize the OCR evaluator."""
        self.model_comparison_results = {}

    def evaluate_model(
        self,
        model: Any,
        processor: Any,
        dataset: List[Dict],
        output_dir: str = "ocr_evaluation_results",
        max_new_tokens: int = 256,
        temperature: float = 0.8,
        top_p: float = 1.0,
        top_k: int = 64,
        do_sample: bool = True,
        verbose: bool = True
    ) -> Tuple[Optional[float], Optional[float]]:
        """
        Evaluate a Gemma3 model on an OCR dataset.
        """
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # Initialize results storage
        results = []

        # Process each sample in the dataset
        for i, sample in enumerate(tqdm(dataset, desc="Evaluating OCR performance", disable=not verbose)):
            try:
                # Extract components from sample
                messages = sample['messages']

                # Get ground truth, image, and question, input_messages
                ground_truth, image, question, input_messages = self._extract_sample_components(
                    messages, i, verbose
                )

                if ground_truth is None or image is None or question is None:
                    continue

                # Generate model response
                generated_response = self._generate_response(
                    model, processor, input_messages, max_new_tokens, temperature, top_p, top_k, do_sample
                )

                # Calculate metrics
                word_error = wer(ground_truth, generated_response)
                char_error = cer(ground_truth, generated_response)

                # Save individual result
                self._save_individual_result(
                    output_dir, i, question, generated_response, ground_truth, word_error, char_error
                )

                # Store results for summary
                results.append({
                    'sample_id': i,
                    'wer': word_error,
                    'cer': char_error,
                    'model_output': generated_response.strip(),
                    'ground_truth': ground_truth,
                    'question': question
                })

            except Exception as e:
                if verbose:
                    print(f"Error processing sample {i}: {str(e)}")
                    traceback.print_exc()

        # Generate summary report
        return self._generate_summary_report(results, output_dir, verbose)

    def _extract_sample_components(
        self,
        messages: List[Dict],
        sample_idx: int,
        verbose: bool
    ) -> Tuple[Optional[str], Optional[Any], Optional[str], List[Dict]]:
        """Extract ground truth, image, question, and input messages from sample."""

        # Extract system message (if present)
        system_message = next((msg for msg in messages if msg['role'] == 'system'), None)

        # Extract user message with the image and question
        user_message = next((msg for msg in messages if msg['role'] == 'user'), None)
        if not user_message:
            if verbose:
                print(f"Skipping sample {sample_idx}: No user message found")
            return None, None, None, []

        # Extract assistant message with ground truth
        assistant_message = next((msg for msg in messages if msg['role'] == 'assistant'), None)
        if not assistant_message:
            if verbose:
                print(f"Skipping sample {sample_idx}: No assistant message (ground truth) found")
            return None, None, None, []

        # Extract ground truth text
        ground_truth = None
        for content_item in assistant_message['content']:
            if content_item['type'] == 'text':
                ground_truth = content_item['text']
                break

        if not ground_truth:
            if verbose:
                print(f"Skipping sample {sample_idx}: No text found in assistant message")
            return None, None, None, []

        # Extract image and question from user message
        image = None
        question = None

        for content_item in user_message['content']:
            if content_item['type'] == 'image':
                image = content_item['image']
                # Ensure image is in RGB format
                if hasattr(image, 'convert'):
                    image = image.convert('RGB')
            elif content_item['type'] == 'text':
                question = content_item['text']

        if not image:
            if verbose:
                print(f"Skipping sample {sample_idx}: No image found in user message")
            return None, None, None, []

        if not question:
            if verbose:
                print(f"Skipping sample {sample_idx}: No question found in user message")
            return None, None, None, []

        # Construct messages for the model input (excluding assistant message)
        input_messages = []
        if system_message:
            input_messages.append(system_message)
        input_messages.append(user_message)

        return ground_truth, image, question, input_messages

    def _process_vision_info(self, messages: List[Dict]) -> List[Image.Image]:
        """Extract images from messages in Gemma3 format."""
        image_inputs = []
        # Iterate through each conversation
        for msg in messages:
            # Get content (ensure it's a list)
            content = msg.get("content", [])
            if not isinstance(content, list):
                content = [content]

            # Check each content element for images
            for element in content:
                if isinstance(element, dict) and (
                    "image" in element or element.get("type") == "image"
                ):
                    # Get the image and convert to RGB
                    if "image" in element:
                        image = element["image"]
                    else:
                        image = element
                    if hasattr(image, 'convert'):
                        image_inputs.append(image.convert("RGB"))
                    else:
                        image_inputs.append(image)
        return image_inputs

    def _generate_response(
        self,
        model: Any,
        processor: Any,
        input_messages: List[Dict],
        max_new_tokens: int,
        temperature: float,
        top_p: float,
        top_k: int,
        do_sample: bool,
    ) -> str:
        """Generate response from the Gemma3 model using the official approach."""

        # Apply chat template to convert messages to text
        text = processor.apply_chat_template(
            input_messages, tokenize=False, add_generation_prompt=True
        )

        # Process the images using the official vision processing function
        image_inputs = self._process_vision_info(input_messages)

        # Tokenize the text and process the images
        inputs = processor(
            text=[text],
            images=image_inputs,
            padding=True,
            return_tensors="pt",
        )

        # Move the inputs to the device
        inputs = inputs.to(model.device)

        # Set up stop tokens (following the official implementation)
        stop_token_ids = [
            processor.tokenizer.eos_token_id, 
            processor.tokenizer.convert_tokens_to_ids("<end_of_turn>")
        ]

        # Generate the output with proper parameters
        with torch.inference_mode():
            generated_ids = model.generate(
                **inputs, 
                max_new_tokens=max_new_tokens, 
                top_p=top_p,
                top_k=top_k,
                do_sample=do_sample, 
                temperature=temperature, 
                eos_token_id=stop_token_ids,
                disable_compile=True  # Following official implementation
            )

        # Trim the generation (remove input tokens)
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]

        # Decode the generated text
        output_text = processor.batch_decode(
            generated_ids_trimmed, 
            skip_special_tokens=True, 
            clean_up_tokenization_spaces=False
        )

        return output_text[0] if output_text else ""

    def _save_individual_result(
        self,
        output_dir: str,
        sample_idx: int,
        question: str,
        generated_response: str,
        ground_truth: str,
        word_error: float,
        char_error: float
    ):
        """Save individual sample result to file."""
        output_file = os.path.join(output_dir, f"sample_{sample_idx}.txt")
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(f"Sample {sample_idx}\n")
            f.write(f"Question: {question}\n\n")
            f.write(f"Model output:\n{generated_response.strip()}\n\n")
            f.write(f"Ground truth:\n{ground_truth}\n\n")
            f.write(f"WER: {word_error:.4f}, CER: {char_error:.4f}")

    def _generate_summary_report(
        self,
        results: List[Dict],
        output_dir: str,
        verbose: bool
    ) -> Tuple[Optional[float], Optional[float]]:
        """Generate and save summary report."""
        if not results:
            if verbose:
                print("No results to summarize.")
            return None, None

        df = pd.DataFrame(results)

        # Calculate overall averages
        avg_wer = df['wer'].mean()
        avg_cer = df['cer'].mean()

        # Save average metrics
        with open(os.path.join(output_dir, "avg_metrics.txt"), 'w') as f:
            f.write(f"Average WER: {avg_wer:.4f}\n")
            f.write(f"Average CER: {avg_cer:.4f}\n")

        # Save detailed results
        df.to_csv(os.path.join(output_dir, "detailed_results.csv"), index=False)

        if verbose:
            print("\nResults Summary:")
            print(f"Average WER: {avg_wer:.4f}")
            print(f"Average CER: {avg_cer:.4f}")
            print(f"\nDetailed results saved to {output_dir}/")

        return avg_wer, avg_cer

    def add_to_comparison(self, model_name: str, wer: float, cer: float):
        """Add model results to the comparison tracker."""
        self.model_comparison_results[model_name] = {
            "wer": wer,
            "cer": cer
        }

    def print_model_comparison(self, save_csv: bool = True, save_plot: bool = True) -> Optional[pd.DataFrame]:
        """Print a comparison of all models evaluated so far."""
        if not self.model_comparison_results:
            print("No model results available for comparison")
            return None

        print("\n==== MODEL COMPARISON REPORT ====")

        # Create a comparison dataframe
        comparison_df = pd.DataFrame({
            "Model": list(self.model_comparison_results.keys()),
            "WER": [results["wer"] for results in self.model_comparison_results.values()],
            "CER": [results["cer"] for results in self.model_comparison_results.values()]
        })

        # Sort by WER (best performance first)
        comparison_df = comparison_df.sort_values("WER")

        # Display the comparison table
        print("\nComparison Table (sorted by WER):")
        print(comparison_df.to_string(index=False))

        # Save the comparison table
        if save_csv:
            comparison_file = "model_comparison_results.csv"
            comparison_df.to_csv(comparison_file, index=False)
            print(f"\nComparison table saved to {comparison_file}")

        # Generate a bar chart visualization
        if save_plot:
            self._create_comparison_plot(comparison_df)

        return comparison_df

    def _create_comparison_plot(self, comparison_df: pd.DataFrame):
        """Create and save comparison plot."""
        plt.figure(figsize=(12, 6))

        # Plot WER
        plt.subplot(1, 2, 1)
        plt.bar(comparison_df["Model"], comparison_df["WER"], color='skyblue')
        plt.title('Word Error Rate Comparison')
        plt.ylabel('WER (lower is better)')
        plt.ylim(bottom=0)
        plt.xticks(rotation=45, ha='right')

        # Plot CER
        plt.subplot(1, 2, 2)
        plt.bar(comparison_df["Model"], comparison_df["CER"], color='lightgreen')
        plt.title('Character Error Rate Comparison')
        plt.ylabel('CER (lower is better)')
        plt.ylim(bottom=0)
        plt.xticks(rotation=45, ha='right')

        plt.tight_layout()
        plt.savefig('ocr_model_comparison.png')
        plt.show()

        print(f"\nVisualization saved to ocr_model_comparison.png")

    def get_comparison_results(self) -> Dict[str, Dict[str, float]]:
        """Get the current comparison results."""
        return self.model_comparison_results.copy()

    def clear_comparison_results(self):
        """Clear all comparison results."""
        self.model_comparison_results.clear()


# Convenience functions for backward compatibility
def evaluate_ocr_model(model, processor, dataset, output_dir="ocr_evaluation_results", **kwargs):
    """
    One-shot helper kept for backward compatibility.

    Builds a throwaway ``OCRModelEvaluator`` and returns the result of its
    ``evaluate_model`` call; extra keyword arguments are forwarded unchanged.
    """
    return OCRModelEvaluator().evaluate_model(model, processor, dataset, output_dir, **kwargs)


def create_evaluator():
    """Factory returning a fresh ``OCRModelEvaluator``."""
    evaluator = OCRModelEvaluator()
    return evaluator

# Load and fine-tune the Gemma 3 model

In [4]:
import torch
# Load the Gemma 3 4B instruct checkpoint in 4-bit. The second return value is bound to
# `processor`; later cells use it for chat templating, image preprocessing and decoding.
model, processor = FastVisionModel.from_pretrained(
    model_name = "unsloth/gemma-3-4b-it",
    #model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct",
    max_seq_length = 2048, # Choose any for long context!
    load_in_4bit = True,  # 4 bit quantization to reduce memory
    load_in_8bit = False, # [NEW!] A bit more accurate, uses 2x memory
    full_finetuning = False, # [NEW!] We have full finetuning now!
)

==((====))==  Unsloth 2025.6.2: Fast Gemma3 patching. Transformers: 4.52.4. vLLM: 0.9.1.
   \\   /|    NVIDIA H100 80GB HBM3. Num GPUs = 1. Max memory: 79.179 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 9.0. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = True]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [5]:
# One evaluator instance shared by all benchmark cells, so every add_to_comparison call lands in the same tracker.
ocr_evaluator = OCRModelEvaluator()
# NOTE(review): this dict is not referenced by any cell visible in this notebook; results are tracked
# inside ocr_evaluator.model_comparison_results instead.
model_comparison_results = {}

In [6]:
# Benchmark the base model (before any LoRA fine-tuning) to obtain the baseline WER/CER.
# Generation samples (top_p=0.95, top_k=64, temperature=1.0), capped at 64 new tokens per image.
model_name = "Base model" 
avg_wer, avg_cer = ocr_evaluator.evaluate_model(model=model, processor=processor, dataset=eval_dataset, top_p=0.95, top_k=64, output_dir="base_model_results", max_new_tokens=64, temperature=1.0)
ocr_evaluator.add_to_comparison(model_name, avg_wer, avg_cer)

Evaluating OCR performance: 100%|████████████████████████████████████████████████████████████████████████████████████████| 200/200 [08:01<00:00,  2.41s/it]


Results Summary:
Average WER: 0.8584
Average CER: 0.6946

Detailed results saved to base_model_results/





In [7]:
# Qualitative check: stream the base model's transcription of one dataset image.
FastVisionModel.for_inference(model) # Enable for inference!

# Index 1 lies inside the first 2000 rows, i.e. the range used for the training subset.
sample = dataset[1]
image =  sample["image"].convert('RGB')
# The image content item is only a placeholder here; the actual image is passed to processor(...) below.
# Unlike format_data, no system_message prefix is added to the question.
messages =  [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": sample["question"],
                        },{
                            "type": "image",
                        }
                    ],
                },
            ]
input_text = processor.apply_chat_template(messages, add_generation_prompt = True)
inputs = processor(
    image,
    input_text,
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
# skip_prompt=True: only the newly generated text is streamed to the cell output.
text_streamer = TextStreamer(processor.tokenizer, skip_prompt = True)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128,
                   use_cache = True, temperature = 1.5, min_p = 0.1)

You have set `compile_config`, but we are unable to meet the criteria for compilation. Compilation will be skipped.


Here's the transcription of the text in the image:

“Beaucoup d'entre vous savent à quel point Jimmy était pour nous, surtout sa maman.”<end_of_turn>


In [9]:
# Wrap the loaded model with LoRA adapters; `model` is rebound to the PEFT-wrapped model.
model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers     = True, # False if not finetuning vision layers
    finetune_language_layers   = True, # False if not finetuning language layers
    finetune_attention_modules = True, # False if not finetuning attention layers
    finetune_mlp_modules       = True, # False if not finetuning MLP layers

    r = 16,           # The larger, the higher the accuracy, but might overfit
    lora_alpha = 16,  # Recommended alpha == r at least
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
    target_modules = "all-linear", # Optional now! Can specify a list if needed
    # NOTE(review): presumably meant to make these modules fully trainable and saved with the adapter.
    # The trainable-parameter count logged by trainer.train() (38,497,792) looks too small to include
    # them - confirm that this argument is honoured.
    modules_to_save=[
        "lm_head",
        "embed_tokens",
    ],
)

Unsloth: Making `base_model.model.model.vision_tower.vision_model` require gradients


In [10]:
from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTConfig, SFTTrainer
FastVisionModel.for_training(model) # Enable for training!
# Turn the KV cache off on the model config for training (gradient checkpointing is enabled below).
model.config.use_cache = False


# Training hyper-parameters: effective batch size is 1 x 4 = 4 and the run stops after 60 optimizer steps.
args = SFTConfig(
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 4,
        gradient_checkpointing=True,
        gradient_checkpointing_kwargs = {"use_reentrant": False}, # use non-reentrant checkpointing
        max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
        warmup_ratio=0.03,
        max_steps=60,
        #num_train_epochs = 2, # Set this instead of max_steps for full training runs
        learning_rate = 2e-4,
        fp16 = not is_bf16_supported(),
        bf16 = is_bf16_supported(),
        logging_steps = 5,
        # NOTE(review): with 2,000 examples and an effective batch of 4, one epoch is 500 steps, so a
        # 60-step run presumably never reaches an epoch-end checkpoint - confirm this is intended.
        save_strategy="epoch",
        optim = "adamw_torch_fused",
        weight_decay = 0.01,
        lr_scheduler_type = "cosine",
        seed = 3407,
        output_dir = "gemma3-french-ocr-checkpoints",
        report_to = "none",     # "none" disables experiment trackers such as Weights and Biases

        # You MUST put the below items for vision finetuning:
        remove_unused_columns = False,
        dataset_text_field = "",
        dataset_kwargs = {"skip_prepare_dataset": True},
        dataset_num_proc = 4,
        max_seq_length = 2048,
    )

In [11]:
from trl import SFTTrainer
from unsloth.trainer import UnslothVisionDataCollator
# Build the trainer. train_dataset is the plain Python list of message dicts produced in the
# dataset-preparation cell; no eval_dataset is passed, so nothing is evaluated during training.
trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    processing_class=processor.tokenizer,
    # The collator receives the full processor and turns message dicts (text + image) into batches.
    data_collator=UnslothVisionDataCollator(model,processor),
)

In [12]:
# Run the fine-tuning (bounded by max_steps=60 in the config above); trainer_stats keeps the value returned by train().
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,000 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 4 x 1) = 4
 "-____-"     Trainable parameters = 38,497,792/4,000,000,000 (0.96% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
5,22.4012
10,5.5527
15,1.1274
20,0.7059
25,0.6261
30,0.5196
35,0.5302
40,0.4143
45,0.5128
50,0.401


# save qlora adapter

In [13]:
# Qualitative check after fine-tuning: stream the model's transcription of one dataset image.
# NOTE(review): index 9 lies inside the first 2000 rows used for the training subset, so this is not a held-out example.
# NOTE(review): FastVisionModel.for_inference(model) is not called in this cell although
# for_training(model) was enabled before training - confirm whether that matters here.
sample=dataset[9]
image =  sample["image"].convert('RGB')
# The image content item is only a placeholder; the actual image is passed to processor(...) below.
messages =  [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": sample["question"],
                        },{
                            "type": "image",
                        }
                    ],
                },
            ]
input_text = processor.apply_chat_template(messages, add_generation_prompt = True)
inputs = processor(
    image,
    input_text,
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
# skip_prompt=True: only the newly generated text is streamed to the cell output.
text_streamer = TextStreamer(processor.tokenizer, skip_prompt = True)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128,
                   use_cache = True, temperature = 1.5, min_p = 0.1)


Tu aurais dû voir ces hommes, mère.<end_of_turn>


In [16]:
# Save the PEFT adapter into the adapter directory.
# PeftModel.save_pretrained's second positional parameter is `safe_serialization`, not a
# tokenizer/processor, so the processor is not passed here; it is saved by its own call below.
model.save_pretrained("unsloth-gemma3-ocr-adapter")
processor.save_pretrained("unsloth-gemma3-ocr-adapter")

['unsloth-gemma3-ocr-adapter/processor_config.json']

In [18]:
# Benchmark the fine-tuned LoRA (PEFT) model, adapter not yet merged, with the same
# generation settings as the base-model run so the numbers are comparable.
model_name = "Peft model" 
avg_wer, avg_cer = ocr_evaluator.evaluate_model(model=model, processor=processor, dataset=eval_dataset, top_p=0.95, top_k=64, output_dir="peft_model_results", max_new_tokens=64, temperature=1.0)
ocr_evaluator.add_to_comparison(model_name, avg_wer, avg_cer)

Evaluating OCR performance: 100%|████████████████████████████████████████████████████████████████████████████████████████| 200/200 [08:26<00:00,  2.53s/it]


Results Summary:
Average WER: 0.0451
Average CER: 0.0084

Detailed results saved to peft_model_results/





In [19]:
# Same qualitative check as before, this time with inference mode explicitly enabled.
FastVisionModel.for_inference(model) # Enable for inference!

# Index 9 lies inside the first 2000 rows used for the training subset (not a held-out example).
sample=dataset[9]
image =  sample["image"].convert('RGB')
# The image content item is only a placeholder; the actual image is passed to processor(...) below.
messages =  [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": sample["question"],
                        },{
                            "type": "image",
                        }
                    ],
                },
            ]
input_text = processor.apply_chat_template(messages, add_generation_prompt = True)
inputs = processor(
    image,
    input_text,
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
# skip_prompt=True: only the newly generated text is streamed to the cell output.
text_streamer = TextStreamer(processor.tokenizer, skip_prompt = True)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128,
                   use_cache = True, temperature = 1.5, min_p = 0.1)


Tu aurais dû voir ces hommes, mère.<end_of_turn>


# Merge model

In [21]:
# Merge the LoRA adapter into the base weights and save the merged model (16-bit is the default),
# passing the processor as `tokenizer` so it is handed to the save call as well.
model.save_pretrained_merged(save_directory="gemma3-merged-finetune-merge-16bit", tokenizer=processor)

Found HuggingFace hub cache directory: /mnt/disks/unslothai/.cache/huggingface/hub
Checking cache directory for required files...
Successfully copied all 2 files from cache to gemma3-merged-finetune-merge-16bit.
Downloading safetensors index for unsloth/gemma-3-4b-it...


Unsloth: Merging weights into 16bit: 100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:23<00:00, 11.69s/it]


# Load Merged model and benchmark

In [22]:
# Free GPU memory before reloading the merged checkpoint: drop the references, collect
# garbage so objects kept alive only by reference cycles are really destroyed, and only
# then release the cached CUDA blocks.
import gc

del model
del trainer
gc.collect()
torch.cuda.empty_cache()

In [23]:
# Reload the merged checkpoint from disk with 4-bit quantization.
# NOTE: the second return value is bound to `tokenizer` here, but the next benchmark cell
# passes it to evaluate_model as `processor=`.
model, tokenizer = FastVisionModel.from_pretrained("./gemma3-merged-finetune-merge-16bit",load_in_4bit=True, load_in_8bit=False)

==((====))==  Unsloth 2025.6.2: Fast Gemma3 patching. Transformers: 4.52.4. vLLM: 0.9.1.
   \\   /|    NVIDIA H100 80GB HBM3. Num GPUs = 1. Max memory: 79.179 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 9.0. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = True]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [24]:
# Benchmark the merged model loaded in 4-bit, with the same generation settings as the earlier runs.
model_name = "Merged model 4bits" 
avg_wer, avg_cer = ocr_evaluator.evaluate_model(model=model, processor=tokenizer, dataset=eval_dataset, top_p=0.95, top_k=64, output_dir="merged_model_load4bits_results", max_new_tokens=64, temperature=1.0)
ocr_evaluator.add_to_comparison(model_name, avg_wer, avg_cer)

Evaluating OCR performance: 100%|████████████████████████████████████████████████████████████████████████████████████████| 200/200 [05:39<00:00,  1.70s/it]


Results Summary:
Average WER: 0.0540
Average CER: 0.0103

Detailed results saved to merged_model_load4bits_results/





In [25]:
# Free GPU memory before loading the 16-bit variant: drop the reference, collect garbage
# so objects kept alive only by reference cycles are really destroyed, and only then
# release the cached CUDA blocks.
import gc

del model
gc.collect()
torch.cuda.empty_cache()

In [26]:
# Reload the merged checkpoint from disk without 4-bit/8-bit quantization (16-bit weights).
model, processor = FastVisionModel.from_pretrained("./gemma3-merged-finetune-merge-16bit",load_in_4bit=False, load_in_8bit=False)

==((====))==  Unsloth 2025.6.2: Fast Gemma3 patching. Transformers: 4.52.4. vLLM: 0.9.1.
   \\   /|    NVIDIA H100 80GB HBM3. Num GPUs = 1. Max memory: 79.179 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 9.0. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = True]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [27]:
# Benchmark the merged model loaded in 16-bit, with the same generation settings as the earlier runs.
model_name = "Merged model 16bits" 
avg_wer, avg_cer = ocr_evaluator.evaluate_model(model=model, processor=processor, dataset=eval_dataset, top_p=0.95, top_k=64, output_dir="merged_model_load16bits_results", max_new_tokens=64, temperature=1.0)
ocr_evaluator.add_to_comparison(model_name, avg_wer, avg_cer)

Evaluating OCR performance: 100%|████████████████████████████████████████████████████████████████████████████████████████| 200/200 [03:14<00:00,  1.03it/s]


Results Summary:
Average WER: 0.0496
Average CER: 0.0090

Detailed results saved to merged_model_load16bits_results/



