In [21]:
import torch
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import pandas as pd
from tqdm import tqdm
import os

from transformers import (
    VisionEncoderDecoderModel,
    TrOCRProcessor,
)

In [1]:
model_name = "../models/finetuned_trocr_weights"
processor = TrOCRProcessor.from_pretrained(model_name)
model = VisionEncoderDecoderModel.from_pretrained(model_name)

model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.vocab_size = model.config.decoder.vocab_size

  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [51]:
def levenshtein_distance(s1: str, s2: str) -> int:
    """Calculate the Levenshtein distance between two strings."""
    if len(s1) < len(s2):
        return levenshtein_distance(s2, s1)
    if len(s2) == 0:
        return len(s1)
    previous_row = range(len(s2) + 1)
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row
    return previous_row[-1]

def character_accuracy(predicted: str, ground_truth: str) -> float:
    """Calculate character-level accuracy."""
    if len(ground_truth) == 0:
        return 1.0 if len(predicted) == 0 else 0.0
    edit_distance = levenshtein_distance(predicted, ground_truth)
    return 1.0 - (edit_distance / len(ground_truth)) # CER is normalized by GT length

def normalize_text(text: str) -> str:
    """Normalize text for comparison (lowercase, strip whitespace)."""
    return re.sub(r'\s+', ' ', text.strip().lower())

def evaluate_on_test_set(test_df, model, processor, device):
    """Evaluate the model on the test dataframe and return detailed results."""
    results = []
    model.to(device)
    model.eval()

    print(f"\n🚀 Starting evaluation on {len(test_df)} test samples...")

    for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Evaluating on test set"):
        image_path = os.path.join('/content/data/processed/val', row['image_path'])
        ground_truth = str(row['text'])

        try:
            image = Image.open(image_path).convert("RGB")
        except FileNotFoundError:
            print(f"Warning: Could not find image {image_path}. Skipping.")
            continue

        pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(device)

        with torch.no_grad():
            generated_ids = model.generate(pixel_values)

        prediction = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

        pred_norm = normalize_text(prediction)
        gt_norm = normalize_text(ground_truth)

        char_acc = character_accuracy(pred_norm, gt_norm)

        results.append({
            'image_path': image_path,
            'ground_truth': ground_truth,
            'prediction': prediction,
            'char_accuracy': char_acc,
            'edit_distance': levenshtein_distance(pred_norm, gt_norm),
            'gt_length': len(gt_norm)
        })

    return pd.DataFrame(results)

In [52]:
test_df = pd.read_csv(os.path.join('/content/data/processed/val', 'labels.csv'))

test_df.columns = ['image_path', 'text']
evaluation_results_df = evaluate_on_test_set(test_df, model, processor, "cuda")


🚀 Starting evaluation on 20809 test samples...


Evaluating on test set:  12%|█▏        | 2442/20809 [02:54<20:20, 15.05it/s]The channel dimension is ambiguous. Got image shape (3, 8, 3). Assuming channels are the first dimension. Use the [input_data_format](https://huggingface.co/docs/transformers/main/internal/image_processing_utils#transformers.image_transforms.rescale.input_data_format) parameter to assign the channel dimension.
Evaluating on test set:  26%|██▌       | 5456/20809 [06:26<17:25, 14.69it/s]The channel dimension is ambiguous. Got image shape (3, 20, 3). Assuming channels are the first dimension. Use the [input_data_format](https://huggingface.co/docs/transformers/main/internal/image_processing_utils#transformers.image_transforms.rescale.input_data_format) parameter to assign the channel dimension.
Evaluating on test set:  30%|███       | 6324/20809 [07:28<20:06, 12.01it/s]The channel dimension is ambiguous. Got image shape (3, 5, 3). Assuming channels are the first dimension. Use the [input_data_format](https://huggi

In [53]:
print("📊 Generating Evaluation Summary...")
print("===================================")

# Calculate Character Error Rate (CER)
# CER = Edit Distance / Length of Ground Truth
evaluation_results_df['cer'] = evaluation_results_df['edit_distance'] / evaluation_results_df['gt_length']
evaluation_results_df['cer'] = evaluation_results_df['cer'].fillna(0)


mean_char_accuracy = evaluation_results_df['char_accuracy'].mean()
mean_cer = evaluation_results_df['cer'].mean()

# Calculate Exact Match Rate (where edit distance is 0)
exact_matches = (evaluation_results_df['edit_distance'] == 0).sum()
total_samples = len(evaluation_results_df)
exact_match_rate = exact_matches / total_samples

print("\n--- Top-Level Metrics ---")
print(f"Total Samples Evaluated: {total_samples}")
print(f"✅ Mean Character Accuracy: {mean_char_accuracy:.4f}  (Higher is better)")
print(f"❌ Mean Character Error Rate (CER): {mean_cer:.4f}  (Lower is better)")
print(f"🎯 Exact Match Rate: {exact_match_rate:.4f} ({exact_matches}/{total_samples} perfect predictions)")


# --- 2. Distribution of Scores ---
print("\n--- Distribution of Character Accuracy ---")
print(evaluation_results_df['char_accuracy'].describe())


# --- 3. Qualitative Analysis (Worst & Best Cases) ---

results_sorted = evaluation_results_df.sort_values(by='char_accuracy', ascending=True)

print("\n--- Worst Performing Examples (Lowest Accuracy) ---")
pd.set_option('display.max_colwidth', None)
print(results_sorted[['ground_truth', 'prediction', 'char_accuracy', 'cer']].head(5))


print("\n--- Best Performing Examples (Highest Accuracy) ---")
print(results_sorted[['ground_truth', 'prediction', 'char_accuracy', 'cer']].tail(5))

📊 Generating Evaluation Summary...

--- Top-Level Metrics ---
Total Samples Evaluated: 20809
✅ Mean Character Accuracy: 0.6717  (Higher is better)
❌ Mean Character Error Rate (CER): 0.3283  (Lower is better)
🎯 Exact Match Rate: 0.4726 (9835/20809 perfect predictions)

--- Distribution of Character Accuracy ---
count    20809.000000
mean         0.671731
std          0.442648
min        -11.400000
25%          0.400000
50%          0.857143
75%          1.000000
max          1.000000
Name: char_accuracy, dtype: float64

--- Worst Performing Examples (Lowest Accuracy) ---
      ground_truth  \
13590        kuchy   
18849       OWDOWN   
7213      MOTOROLA   
2765     FOKEIGNES   
20048          JGS   

                                                                        prediction  \
13590             EXCLUDING ON ONLY ON ONLY ONCLUSED ON FACE NOT RECE ON RECE REIN   
18849       AMPROVING ON ONLY ON ONLY ON ONLY ON ONLY ON ON ON ON ON ONLY ON ON ON   
7213      CASHIER ONCLOSED ON ON