# Introduction

Calculating the CER by taking the results from the fine-tuned SmolVLM model and comparing them with the Qwen2-VL 2B annotations.

In [1]:
import glob
import jiwer
import torch

from transformers import AutoModelForVision2Seq, AutoProcessor
from tqdm.auto import tqdm
from PIL import Image

## Function to Calculate CER

In [2]:
def calculate_cer(sroie_boxes, ground_truth):
    """
    :param sroie_boxes: List containing the box/text data from the SROIE v2 dataset
        e.g. ['tan woon yann\nbook ta.k', 'are not returnable or']
    :param ground_truth: VLM generated annotations
        e.g. ['tan woon yann\nbook ta.k', 'are not returnable or']
    """

    error = jiwer.cer(sroie_boxes, ground_truth)
    print(f"CER: {error}")

## Calculate CER for Test Data using Trained Model

In [3]:
# Get the text from the Qwen VL annotations.
vlm_data = []

all_vlm_txt_test_paths = glob.glob('../input/qwen2_vl_2b_sroiev2_test_annots/*.txt')
all_vlm_txt_test_paths.sort()

for file_path in all_vlm_txt_test_paths:
    data = open(file_path).read()
    vlm_data.append(data.lower())

Inference using the trained model.

In [4]:
model_id = "HuggingFaceTB/SmolVLM-256M-Instruct"

In [5]:
model = AutoModelForVision2Seq.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    _attn_implementation="flash_attention_2" # Use `flash_attention_2` on Ampere GPUs and above and `eager` on older GPUs.
    # _attn_implementation="eager", # Use `flash_attention_2` on Ampere GPUs and above and `eager` on older GPUs.
)

processor = AutoProcessor.from_pretrained(model_id)

In [6]:
adapter_path = "../notebooks/trained_adapters/smolvlm_receipt_qwengt_ft/"
model.load_adapter(adapter_path)

In [7]:
all_image_paths = glob.glob('../input/sroie_v2/SROIE2019/test/img/*.jpg')
all_image_paths.sort()
print(len(all_image_paths))

347


In [8]:
def test(model, processor, image, max_new_tokens=500, device="cuda"):
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "OCR this image accurately"}
            ]
        },
    ]
    
    # Prepare the text input by applying the chat template
    text_input = processor.apply_chat_template(
        messages,  # Use the sample without the system message
        add_generation_prompt=True
    )

    image_inputs = []
    if image.mode != 'RGB':
        image = image.convert('RGB')
        
    image_inputs.append([image])

    # Prepare the inputs for the model
    model_inputs = processor(
        #text=[text_input],
        text=text_input,
        images=image_inputs,
        return_tensors="pt",
    ).to(device)  # Move inputs to the specified device

    # Generate text with the model
    generated_ids = model.generate(**model_inputs, max_new_tokens=max_new_tokens)

    # Trim the generated ids to remove the input ids
    trimmed_generated_ids = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    # Decode the output text
    output_text = processor.batch_decode(
        trimmed_generated_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )

    return output_text[0]  # Return the first decoded output text

In [9]:
inference_results = []

for i, image_path in tqdm(enumerate(all_image_paths), total=len(all_image_paths)):
    # if i == 2:
    #     break
    image = Image.open(image_path).convert('RGB')
    image = image.resize((512, 768))

    output = test(model, processor, image)

    inference_results.append(output)

  0%|          | 0/347 [00:00<?, ?it/s]

In [10]:
calculate_cer(vlm_data, inference_results)

CER: 0.9499435473283445
