# TrOCR Bentham Dataset Evaluation
## Character Error Rate (CER) Evaluation of Fine-tuned TrOCR Model

This notebook loads a TrOCR model that has already been fine-tuned on the Bentham dataset and evaluates its performance using Character Error Rate (CER) and Word Error Rate (WER) metrics.

**Key Features:**
- Skips the training phase and directly loads the trained model
- Comprehensive CER and WER evaluation on the validation split (used here as the test set)
- Sample predictions and analysis
- Detailed performance metrics


In [1]:
!pip install -q transformers datasets jiwer evaluate torch torchvision tqdm

In [2]:
import os
import pandas as pd
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from tqdm.notebook import tqdm
import evaluate
import json
import numpy as np

def load_iam_format_dataset(dataset_dir, split='train'):
    """
    Load an IAM-format dataset (original IAM or converted Bentham) into a DataFrame.

    Every non-empty line of the ground-truth file is split at its first space
    into ``image_id`` and ``text``. Lines without a space are skipped. The
    image is looked up in ``<dataset_dir>/images`` by trying the extensions
    .png, .jpg, .jpeg, .tiff and .bmp in that order; rows whose image cannot
    be found are reported with a warning and dropped.

    Args:
        dataset_dir: Path to dataset directory containing images/ and gt files
        split: 'train' reads gt_train.txt, 'val' reads gt_val.txt, any other
            value (e.g. 'all') reads gt.txt. If the split file is missing,
            gt.txt is used instead and a message is printed.

    Returns:
        pandas.DataFrame with the columns image_id, file_name, text and
        image_path. The columns are present even when no row was loaded, so
        callers can access them without special-casing an empty result.

    Raises:
        FileNotFoundError: if neither the split file nor gt.txt exists.
    """
    columns = ['image_id', 'file_name', 'text', 'image_path']
    image_extensions = ('.png', '.jpg', '.jpeg', '.tiff', '.bmp')
    split_files = {'train': 'gt_train.txt', 'val': 'gt_val.txt'}

    images_dir = os.path.join(dataset_dir, 'images')
    fallback_file = os.path.join(dataset_dir, 'gt.txt')

    # Choose the appropriate ground truth file ('all' or unknown -> gt.txt)
    gt_file = os.path.join(dataset_dir, split_files.get(split, 'gt.txt'))

    # Fallback to main gt.txt if the split file doesn't exist.
    # NOTE(review): after this fallback the returned rows are whatever gt.txt
    # lists, not an isolated split -- callers comparing 'train' and 'val'
    # should watch for the message below.
    if gt_file != fallback_file and not os.path.exists(gt_file):
        gt_file = fallback_file
        print(f"Using fallback gt.txt for {split} split")

    if not os.path.exists(gt_file):
        raise FileNotFoundError(f"Ground truth file not found: {gt_file}")

    data = []

    with open(gt_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            # Split at first space: image_id text
            parts = line.split(' ', 1)
            if len(parts) != 2:
                continue  # an id without any transcription is ignored
            image_id, text = parts

            # Find image file (try different extensions, first hit wins)
            image_path = None
            found_ext = None
            for ext in image_extensions:
                potential_path = os.path.join(images_dir, f"{image_id}{ext}")
                if os.path.exists(potential_path):
                    image_path = potential_path
                    found_ext = ext
                    break

            if image_path is None:
                print(f"Warning: Image not found for {image_id}")
                continue

            data.append({
                'image_id': image_id,
                'file_name': f"{image_id}{found_ext}",
                'text': text,
                'image_path': image_path
            })

    # Passing `columns` keeps the schema stable for an empty result.
    return pd.DataFrame(data, columns=columns)

# Confirmation message: reaching this line means the imports and the loader
# definition above executed without raising.
print('Libraries imported successfully')

Libraries imported successfully


In [6]:
# Load the dataset
# The location can be overridden with the BENTHAM_DATASET_DIR environment
# variable so the notebook is not tied to a single machine; the default is
# the original local path.
dataset_dir = os.environ.get(
    'BENTHAM_DATASET_DIR',
    'B:\\SaravanaVel\\Documents\\Academics MCA\\Project\\Final Code\\Final Code\\Dataset\\bentham_iam_format',
)

# Load training and test data
print('Loading datasets...')
train_df = load_iam_format_dataset(dataset_dir, split='train')
test_df = load_iam_format_dataset(dataset_dir, split='val')  # Using val as test

print(f'Training samples: {len(train_df)}')
print(f'Test samples: {len(test_df)}')

# Fail early with a clear message instead of an IndexError from .iloc[0] below
if len(test_df) == 0:
    raise ValueError(f'No test samples could be loaded from {dataset_dir}')

# Display sample data
print('\nSample from test dataset:')
print(test_df.head())

# Check if images exist (spot-check on the first row only)
first_sample = test_df.iloc[0]
print(f'\nFirst test image exists: {os.path.exists(first_sample["image_path"])}')
print(f'Sample text: {first_sample["text"]}')
print(f'Sample image path: {first_sample["image_path"]}')

Loading datasets...
Training samples: 18356
Test samples: 4590

Sample from test dataset:
         image_id           file_name  \
0  bentham_004262  bentham_004262.png   
1  bentham_008658  bentham_008658.png   
2  bentham_018238  bentham_018238.png   
3  bentham_009145  bentham_009145.png   
4  bentham_014041  bentham_014041.png   

                                                text  \
0       in general , or to the party in particular .   
1  upon specific acts  :  upon the wounding  , st...   
2  it necessary , and thereby preserve them from be-   
3  it remained a doubt whether it could be accomp...   
4                                         envelopped   

                                          image_path  
0  B:\SaravanaVel\Documents\Academics MCA\Project...  
1  B:\SaravanaVel\Documents\Academics MCA\Project...  
2  B:\SaravanaVel\Documents\Academics MCA\Project...  
3  B:\SaravanaVel\Documents\Academics MCA\Project...  
4  B:\SaravanaVel\Documents\Academics MCA\Project..

In [7]:
class BenthamDataset(Dataset):
    """Torch dataset that pairs line images with tokenized transcriptions.

    Args:
        df: DataFrame with one row per sample and at least the columns
            'image_path' and 'text'. Its index may be arbitrary; samples are
            addressed by position.
        processor: Callable applied to a PIL image with ``return_tensors="pt"``
            whose result exposes ``.pixel_values``; it must also expose a
            ``.tokenizer`` with ``pad_token_id`` (presumably a TrOCRProcessor
            -- confirm against the caller).
        max_target_length: Length the label sequence is padded/truncated to.
    """

    def __init__(self, df, processor, max_target_length=128):
        # Re-index positionally so that the integer indices handed out by a
        # DataLoader (0..len-1) always resolve, even if `df` was filtered or
        # sliced beforehand. reset_index returns a new frame; the caller's
        # DataFrame is left untouched.
        self.df = df.reset_index(drop=True)
        self.processor = processor
        self.max_target_length = max_target_length

    def __len__(self):
        """Number of samples (rows of the DataFrame)."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return one sample as a dict.

        Keys: 'pixel_values' (processed image tensor), 'labels' (token ids
        with padding replaced by -100), 'text' (original transcription) and
        'image_path'.
        """
        # Get image path and text (positional lookup)
        row = self.df.iloc[idx]
        image_path = row['image_path']
        text = row['text']

        # Load and process image; the context manager releases the file
        # handle as soon as the RGB copy has been made.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
        pixel_values = self.processor(image, return_tensors="pt").pixel_values

        # Process text (tokenize)
        labels = self.processor.tokenizer(text,
                                        padding="max_length",
                                        max_length=self.max_target_length,
                                        truncation=True,
                                        return_tensors="pt").input_ids

        # Important: make sure PAD tokens are ignored by loss function
        labels[labels == self.processor.tokenizer.pad_token_id] = -100

        # Drop only the leading dimension (assumed to be the batch dimension
        # of size 1 produced for a single image / single string). A bare
        # squeeze() would also remove any other size-1 dimension.
        return {
            "pixel_values": pixel_values.squeeze(0),
            "labels": labels.squeeze(0),
            "text": text,  # Keep original text for evaluation
            "image_path": image_path
        }

# Confirmation message: the class definition above executed without raising.
print('Dataset class defined successfully')

Dataset class defined successfully


In [14]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the fine-tuned model and processor
# Update this path (or set the TROCR_MODEL_PATH environment variable) to point
# to your trained model directory. The raw string uses single backslashes:
# inside r"..." a doubled backslash would stay doubled in the resulting path.
MODEL_PATH = os.environ.get(
    "TROCR_MODEL_PATH",
    r"B:\SaravanaVel\Documents\Academics MCA\Project\Final Code\Final Code\Models\fine_tuned",
)


def _load_trocr(path):
    """Load processor and model from `path`, move the model to `device`, set eval mode."""
    loaded_processor = TrOCRProcessor.from_pretrained(path)
    loaded_model = VisionEncoderDecoderModel.from_pretrained(path)
    loaded_model.to(device)
    loaded_model.eval()  # Set to evaluation mode
    return loaded_processor, loaded_model


print(f'Loading model from: {MODEL_PATH}')

try:
    # Load processor and model
    processor, model = _load_trocr(MODEL_PATH)

    print(' Model and processor loaded successfully')
    print(f'Model config: {model.config}')

except Exception as e:
    print(f' Error loading model: {e}')
    print('\nTrying to load from alternative paths...')

    # Try alternative paths
    alternative_paths = [
        "bentham-trocr-epoch-3",
        "bentham-trocr-epoch-2",
        "bentham-trocr-epoch-1"
    ]

    for alt_path in alternative_paths:
        try:
            print(f'Trying: {alt_path}')
            processor, model = _load_trocr(alt_path)
        except Exception as alt_error:
            # Catch Exception (not a bare except) so Ctrl-C still interrupts,
            # and show why this candidate was rejected.
            print(f'  Failed: {alt_error}')
            continue
        MODEL_PATH = alt_path
        print(f'✓ Successfully loaded from: {alt_path}')
        break
    else:
        # Stop here: without `model` and `processor` every later cell would
        # fail with a far less helpful NameError.
        raise RuntimeError(
            ' Could not find any trained model. Please check the model path.'
        ) from e

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Using device: cpu
Loading model from: B:\\SaravanaVel\\Documents\\Academics MCA\\Project\\Final Code\\Final Code\\Models\\fine_tuned
 Model and processor loaded successfully
Model config: VisionEncoderDecoderConfig {
  "architectures": [
    "VisionEncoderDecoderModel"
  ],
  "decoder": {
    "activation_dropout": 0.0,
    "activation_function": "gelu",
    "add_cross_attention": true,
    "attention_dropout": 0.0,
    "classifier_dropout": 0.0,
    "cross_attention_hidden_size": 768,
    "d_model": 1024,
    "decoder_attention_heads": 16,
    "decoder_ffn_dim": 4096,
    "decoder_layerdrop": 0.0,
    "decoder_layers": 12,
    "dropout": 0.1,
    "dtype": "float32",
    "init_std": 0.02,
    "is_decoder": true,
    "layernorm_embedding": true,
    "max_position_embeddings": 512,
    "model_type": "trocr",
    "scale_embedding": false,
    "use_cache": false,
    "use_learned_position_embeddings": true,
    "vocab_size": 50265
  },
  "dtype": "float32",
  "encoder": {
    "attention_pro

In [15]:
# Wrap the evaluation dataframe in the torch Dataset (default max_target_length)
test_dataset = BenthamDataset(test_df, processor)

print(f"Created test dataset with {len(test_dataset)} samples")

# Batches of 8 in dataframe order (no shuffling) for evaluation
test_dataloader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=8)

print(f"Test batches: {len(test_dataloader)}")

Created test dataset with 4590 samples
Test batches: 574


In [16]:
# Load the character- and word-level error-rate metrics (CER first, then WER)
cer_metric, wer_metric = (evaluate.load(metric_name) for metric_name in ("cer", "wer"))

print('Evaluation metrics loaded successfully')

Downloading builder script: 0.00B [00:00, ?B/s]

Evaluation metrics loaded successfully


In [17]:
def compute_cer(pred_ids, label_ids):
    """Decode predictions and labels, then score them with the CER metric.

    Uses the notebook-level `processor` for decoding and `cer_metric` for
    scoring.

    Args:
        pred_ids: Generated token ids for a batch.
        label_ids: Label token ids for the same batch, with -100 marking
            ignored positions. The tensor passed in is not modified.

    Returns:
        Tuple ``(cer, pred_str, label_str)``: the metric value followed by the
        decoded prediction strings and decoded reference strings.
    """
    # -100 is not a valid token id; swap it back to the pad id before
    # decoding. masked_fill builds a new tensor, leaving the input untouched.
    pad_id = processor.tokenizer.pad_token_id
    restored_labels = label_ids.masked_fill(label_ids == -100, pad_id)

    label_str = processor.batch_decode(restored_labels, skip_special_tokens=True)
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)

    score = cer_metric.compute(predictions=pred_str, references=label_str)
    return score, pred_str, label_str

def compute_wer(pred_str, label_str):
    """Return the word error rate of `pred_str` against `label_str`.

    Both arguments are sequences of already-decoded strings; scoring is
    delegated to the notebook-level `wer_metric`.
    """
    return wer_metric.compute(references=label_str, predictions=pred_str)

def predict_single_image(image_path, model, processor, device):
    """Run the model on one image file and return the decoded text.

    Args:
        image_path: Path of the image to transcribe.
        model: Object providing ``generate(pixel_values)``.
        processor: Callable producing ``.pixel_values`` from an image and
            providing ``batch_decode``.
        device: Device the pixel tensor is moved to before generation.

    Returns:
        The first decoded string, with special tokens removed.
    """
    # Read the file as RGB and turn it into the model's input tensor
    rgb_image = Image.open(image_path).convert("RGB")
    encoded = processor(rgb_image, return_tensors="pt")
    model_input = encoded.pixel_values.to(device)

    # Inference only: no gradients needed
    with torch.no_grad():
        generated_ids = model.generate(model_input)

    decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return decoded[0]

# Confirmation message: the metric and prediction helpers above were defined.
print('CER computation functions defined successfully')

CER computation functions defined successfully


In [18]:
# Test the model on a few sample images first
print("Testing model on sample images...\n")

num_samples = 5
sample_results = []

# Evaluate at most `num_samples` rows; fewer if the test set is smaller
for i in range(min(num_samples, len(test_df))):
    sample_row = test_df.iloc[i]
    image_path = sample_row['image_path']
    ground_truth = sample_row['text']

    # Make prediction
    predicted_text = predict_single_image(image_path, model, processor, device)

    # Compute individual CER / WER against the raw ground-truth text
    individual_cer = cer_metric.compute(predictions=[predicted_text], references=[ground_truth])
    individual_wer = wer_metric.compute(predictions=[predicted_text], references=[ground_truth])

    sample_results.append({
        'image_path': image_path,
        'ground_truth': ground_truth,
        'prediction': predicted_text,
        'cer': individual_cer,
        'wer': individual_wer
    })

    print(f"Sample {i+1}:")
    print(f"Image: {os.path.basename(image_path)}")
    print(f"Ground Truth: '{ground_truth}'")
    print(f"Prediction:   '{predicted_text}'")
    print(f"CER: {individual_cer:.4f}")
    print(f"WER: {individual_wer:.4f}")
    print("-" * 80)

# Calculate average for samples.
# Report the number of samples actually evaluated (it can be smaller than
# num_samples) and skip the mean on an empty list, which would only yield
# nan plus a RuntimeWarning.
evaluated_samples = len(sample_results)

print(f"\nSample Results Summary:")
if evaluated_samples:
    avg_sample_cer = np.mean([r['cer'] for r in sample_results])
    avg_sample_wer = np.mean([r['wer'] for r in sample_results])
    print(f"Average CER on {evaluated_samples} samples: {avg_sample_cer:.4f}")
    print(f"Average WER on {evaluated_samples} samples: {avg_sample_wer:.4f}")
else:
    avg_sample_cer = float('nan')
    avg_sample_wer = float('nan')
    print("No samples were evaluated.")

Testing model on sample images...

Sample 1:
Image: bentham_004262.png
Ground Truth: 'in general , or to the party in particular .'
Prediction:   'in general , or to the party in particular .'
CER: 0.0000
WER: 0.0000
--------------------------------------------------------------------------------
Sample 2:
Image: bentham_008658.png
Ground Truth: 'upon specific acts  :  upon the wounding  , starving or poi'
Prediction:   'upon specific acts , upon the wounding , starving or poor-'
CER: 0.1186
WER: 0.1818
--------------------------------------------------------------------------------
Sample 3:
Image: bentham_018238.png
Ground Truth: 'it necessary , and thereby preserve them from be-'
Prediction:   'it necessary , and thereby pressure them from her'
CER: 0.1020
WER: 0.2222
--------------------------------------------------------------------------------
Sample 4:
Image: bentham_009145.png
Ground Truth: 'it remained a doubt whether it could be accomplished to any considerable extent .'
Pre

In [None]:
# Comprehensive evaluation on the full test dataset
print("Starting comprehensive evaluation on full test dataset...")
print(f"Total test samples: {len(test_dataset)}")
print(f"Total batches: {len(test_dataloader)}")
print("=" * 60)

model.eval()
total_cer = 0.0
total_wer = 0.0
num_batches = 0
all_predictions = []
all_ground_truths = []
batch_results = []

# Keep a handle on the progress bar so the running metrics can be shown on
# the bar itself instead of printing an extra line every few batches.
progress_bar = tqdm(test_dataloader, desc="Evaluating")

with torch.no_grad():
    for batch_idx, batch in enumerate(progress_bar):
        # Move pixel values to device
        pixel_values = batch["pixel_values"].to(device)

        # Generate predictions
        outputs = model.generate(pixel_values)

        # Compute CER / WER for this batch (references are decoded from the labels)
        cer, pred_str, label_str = compute_cer(pred_ids=outputs, label_ids=batch["labels"])
        wer = compute_wer(pred_str, label_str)

        # Accumulate per-batch scores for the batch-averaged figures
        total_cer += cer
        total_wer += wer
        num_batches += 1

        # Collect all predictions and ground truths
        all_predictions.extend(pred_str)
        all_ground_truths.extend(label_str)

        # Store batch results
        batch_results.append({
            'batch_idx': batch_idx,
            'batch_size': len(pred_str),
            'cer': cer,
            'wer': wer,
            'predictions': pred_str,
            'ground_truths': label_str
        })

        # Running averages, displayed next to the progress bar
        progress_bar.set_postfix(
            cer=f"{total_cer / num_batches:.4f}",
            wer=f"{total_wer / num_batches:.4f}",
        )

# An empty loader would otherwise surface as a ZeroDivisionError below
if num_batches == 0:
    raise RuntimeError("No batches were evaluated: the test dataloader is empty.")

# Calculate final metrics.
# These are plain means of the per-batch scores, i.e. every batch counts
# equally; they generally differ from the pooled figures computed next.
final_cer = total_cer / num_batches
final_wer = total_wer / num_batches

# Also compute overall CER/WER on all predictions at once
overall_cer = cer_metric.compute(predictions=all_predictions, references=all_ground_truths)
overall_wer = wer_metric.compute(predictions=all_predictions, references=all_ground_truths)

print("\n" + "=" * 60)
print("EVALUATION RESULTS")
print("=" * 60)
print(f"Model Path: {MODEL_PATH}")
print(f"Test Dataset Size: {len(test_dataset)} samples")
print(f"Number of Batches: {num_batches}")
print("\nBatch-averaged Metrics:")
print(f"Final CER: {final_cer:.4f} ({final_cer*100:.2f}%)")
print(f"Final WER: {final_wer:.4f} ({final_wer*100:.2f}%)")
print("\nOverall Metrics (all predictions):")
print(f"Overall CER: {overall_cer:.4f} ({overall_cer*100:.2f}%)")
print(f"Overall WER: {overall_wer:.4f} ({overall_wer*100:.2f}%)")
print(f"Character Accuracy: {(1 - overall_cer) * 100:.2f}%")
print(f"Word Accuracy: {(1 - overall_wer) * 100:.2f}%")

Starting comprehensive evaluation on full test dataset...
Total test samples: 4590
Total batches: 574


Evaluating:   0%|          | 0/574 [00:00<?, ?it/s]

In [None]:
# Detailed analysis of results
print("DETAILED ANALYSIS")
print("=" * 60)

# Score every prediction on its own to obtain per-sample CER / WER values
individual_cers = []
individual_wers = []

for pred, gt in zip(all_predictions, all_ground_truths):
    single_pair = {"predictions": [pred], "references": [gt]}
    cer = cer_metric.compute(**single_pair)
    wer = wer_metric.compute(**single_pair)
    individual_cers.append(cer)
    individual_wers.append(wer)

individual_cers = np.array(individual_cers)
individual_wers = np.array(individual_wers)

# Same five summary statistics for both metrics, printed from one table
summary_stats = (
    ("Mean", np.mean),
    ("Median", np.median),
    ("Std", np.std),
    ("Min", np.min),
    ("Max", np.max),
)
for metric_label, metric_values in (("CER", individual_cers), ("WER", individual_wers)):
    print(f"\n{metric_label} Statistics:")
    for stat_label, stat_fn in summary_stats:
        print(f"{stat_label} {metric_label}: {stat_fn(metric_values):.4f}")

# Perfect predictions analysis: samples whose error rate is exactly zero
perfect_predictions = int(np.count_nonzero(individual_cers == 0.0))
perfect_word_predictions = int(np.count_nonzero(individual_wers == 0.0))

print(f"\nPerfect Predictions:")
print(f"Perfect character predictions: {perfect_predictions}/{len(individual_cers)} ({perfect_predictions/len(individual_cers)*100:.2f}%)")
print(f"Perfect word predictions: {perfect_word_predictions}/{len(individual_wers)} ({perfect_word_predictions/len(individual_wers)*100:.2f}%)")

# High error predictions analysis
high_cer_threshold = 0.5  # 50% character error
high_cer_count = int(np.count_nonzero(individual_cers > high_cer_threshold))

print(f"\nHigh Error Analysis:")
print(f"Predictions with CER > {high_cer_threshold}: {high_cer_count}/{len(individual_cers)} ({high_cer_count/len(individual_cers)*100:.2f}%)")

In [None]:
# Show best and worst predictions
print("BEST AND WORST PREDICTIONS")
print("=" * 60)

# Sample positions ordered from lowest to highest CER
cer_indices = np.argsort(individual_cers)
shown = min(5, len(cer_indices))

print("\n BEST PREDICTIONS (Lowest CER):")
print("-" * 40)
for rank, idx in enumerate(cer_indices[:shown], start=1):
    print(f"\nRank {rank} (CER: {individual_cers[idx]:.4f}):")
    print(f"Ground Truth: '{all_ground_truths[idx]}'")
    print(f"Prediction:   '{all_predictions[idx]}'")

print("\n\n WORST PREDICTIONS (Highest CER):")
print("-" * 40)
# Walk the ordering backwards so rank 1 is the highest CER
for rank, idx in enumerate(cer_indices[::-1][:shown], start=1):
    print(f"\nRank {rank} (CER: {individual_cers[idx]:.4f}):")
    print(f"Ground Truth: '{all_ground_truths[idx]}'")
    print(f"Prediction:   '{all_predictions[idx]}'")

    # Length comparison as a first hint at the kind of error
    gt_len = len(all_ground_truths[idx])
    pred_len = len(all_predictions[idx])
    print(f"Length: GT={gt_len}, Pred={pred_len}, Diff={pred_len-gt_len}")

In [None]:
# Performance comparison with benchmarks
print("PERFORMANCE COMPARISON")
print("=" * 60)

# Benchmark comparisons based on literature
# NOTE(review): the two reference CER values are taken as given here; verify
# them against the cited sources before quoting them.
benchmarks = {
    "TrOCR Base (IAM)": {"cer": 0.03, "source": "Original TrOCR paper"},
    "TrOCR Base (handwritten)": {"cer": 0.062, "source": "Fine-tuning example"},
    "Our Fine-tuned Model": {"cer": overall_cer, "source": "Current evaluation"}
}

print("\nCharacter Error Rate Comparison:")
print("-" * 50)
# The loop variable must not be called `model`: at notebook top level that
# would overwrite the loaded TrOCR model with a string and break any later
# (re-)run of the prediction cells.
for model_name, metrics in benchmarks.items():
    print(f"{model_name:<25}: {metrics['cer']:.4f} ({metrics['cer']*100:.2f}%) - {metrics['source']}")

print("\n Model Performance Summary:")
print(f"- Dataset: Bentham historical handwriting dataset")
print(f"- Test samples: {len(test_dataset)}")
print(f"- Character Error Rate: {overall_cer:.4f} ({overall_cer*100:.2f}%)")
print(f"- Word Error Rate: {overall_wer:.4f} ({overall_wer*100:.2f}%)")
print(f"- Character Accuracy: {(1-overall_cer)*100:.2f}%")
print(f"- Word Accuracy: {(1-overall_wer)*100:.2f}%")
print(f"- Perfect character predictions: {perfect_predictions/len(individual_cers)*100:.2f}%")
print(f"- Perfect word predictions: {perfect_word_predictions/len(individual_wers)*100:.2f}%")

In [None]:
# Save detailed results to files
print("SAVING RESULTS")
print("=" * 60)

# Prepare results dictionary
results = {
    "model_path": MODEL_PATH,
    "dataset_size": len(test_dataset),
    "batch_averaged_metrics": {
        "cer": float(final_cer),
        "wer": float(final_wer)
    },
    "overall_metrics": {
        "cer": float(overall_cer),
        "wer": float(overall_wer),
        "character_accuracy": float((1 - overall_cer) * 100),
        "word_accuracy": float((1 - overall_wer) * 100)
    },
    "statistics": {
        "cer_mean": float(np.mean(individual_cers)),
        "cer_median": float(np.median(individual_cers)),
        "cer_std": float(np.std(individual_cers)),
        "cer_min": float(np.min(individual_cers)),
        "cer_max": float(np.max(individual_cers)),
        "perfect_char_predictions": int(perfect_predictions),
        "perfect_word_predictions": int(perfect_word_predictions),
        "perfect_char_percentage": float(perfect_predictions/len(individual_cers)*100),
        "perfect_word_percentage": float(perfect_word_predictions/len(individual_wers)*100)
    }
}

# Turn the model path into a filesystem-safe tag. Replacing only '/' is not
# enough: a Windows path also contains '\', ':' and spaces, which would make
# the output filename invalid. Letters, digits, '-', '_' and '.' are kept
# (so a plain name such as "bentham-trocr-epoch-3" is unchanged); every
# other character becomes '_'.
model_tag = "".join(
    ch if (ch.isalnum() or ch in "-_.") else "_"
    for ch in MODEL_PATH
)

# Save results as JSON
results_filename = f"evaluation_results_{model_tag}.json"
with open(results_filename, 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2)

print(f"✓ Results saved to: {results_filename}")

# Save detailed predictions as CSV (one row per evaluated sample)
detailed_results = []
for i, (pred, gt, cer, wer) in enumerate(zip(all_predictions, all_ground_truths, individual_cers, individual_wers)):
    detailed_results.append({
        'sample_id': i,
        'ground_truth': gt,
        'prediction': pred,
        'cer': cer,
        'wer': wer,
        'ground_truth_length': len(gt),
        'prediction_length': len(pred)
    })

detailed_df = pd.DataFrame(detailed_results)
detailed_filename = f"detailed_predictions_{model_tag}.csv"
detailed_df.to_csv(detailed_filename, index=False)

print(f"✓ Detailed predictions saved to: {detailed_filename}")
print(f"✓ Total files saved: 2")

# Display final summary
print("\n" + "=" * 60)
print("EVALUATION COMPLETE!")
print("=" * 60)
print(f"Final CER: {overall_cer:.4f} ({overall_cer*100:.2f}%)")
print(f"Final WER: {overall_wer:.4f} ({overall_wer*100:.2f}%)")
print(f"Character Accuracy: {(1-overall_cer)*100:.2f}%")
print(f"Word Accuracy: {(1-overall_wer)*100:.2f}%")