# Complete TrOCR Fine-tuning + Model Calibration on Bentham Dataset



In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Setup Environment

In [1]:
!pip install -q transformers torch torchvision

In [2]:
!pip install -q datasets jiwer evaluate tqdm

In [3]:
import os
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from transformers import VisionEncoderDecoderModel, TrOCRProcessor
from torch.optim import AdamW
import evaluate
from tqdm.notebook import tqdm
import numpy as np
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


##  Load Bentham Dataset (IAM Format)

Make sure your dataset is converted to IAM format with structure:
```
bentham_iam_format/
├── images/           # Line images
├── gt.txt           # Ground truth file
├── gt_train.txt     # Training split (optional)
└── gt_val.txt       # Validation split (optional)
```

In [4]:
cd /content/drive/MyDrive/My progenAI

/content/drive/MyDrive/My progenAI


In [None]:
!unzip -q bentham_iam_format.zip -d dataset/

In [5]:
cd /content/drive/MyDrive/My progenAI/dataset/

/content/drive/MyDrive/My progenAI/dataset


In [6]:
def load_iam_format_dataset(dataset_dir, split='train'):
    """Load IAM format dataset (works with converted Bentham)"""

    images_dir = os.path.join(dataset_dir, 'images')

    # Choose ground truth file
    if split == 'train':
        gt_file = os.path.join(dataset_dir, 'gt_train.txt')
    elif split == 'val':
        gt_file = os.path.join(dataset_dir, 'gt_val.txt')
    else:
        gt_file = os.path.join(dataset_dir, 'gt.txt')

    # Fallback to main gt.txt
    if not os.path.exists(gt_file):
        gt_file = os.path.join(dataset_dir, 'gt.txt')
        print(f"Using fallback gt.txt for {split} split")

    data = []

    with open(gt_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                parts = line.split(' ', 1)
                if len(parts) == 2:
                    image_id, text = parts

                    # Find image file
                    image_path = None
                    for ext in ['.png', '.jpg', '.jpeg', '.tiff', '.bmp']:
                        potential_path = os.path.join(images_dir, f"{image_id}{ext}")
                        if os.path.exists(potential_path):
                            image_path = potential_path
                            break

                    if image_path:
                        data.append({
                            'image_id': image_id,
                            'file_name': f"{image_id}{os.path.splitext(image_path)[1]}",
                            'text': text,
                            'image_path': image_path
                        })
                    else:
                        print(f"Warning: Image not found for {image_id}")

    return pd.DataFrame(data)

In [7]:
#  UPDATE THIS PATH TO YOUR DATASET LOCATION
DATASET_PATH = './bentham_iam_format'  # Change this to your actual path

# Load datasets
print("Loading Bentham dataset...")

# Try to load train/val splits first
train_df = load_iam_format_dataset(DATASET_PATH, split='train')
test_df = load_iam_format_dataset(DATASET_PATH, split='val')

# If no splits exist, create them from main dataset
if len(train_df) == 0 or len(test_df) == 0:
    print("No pre-made splits found. Creating train/val split from gt.txt...")
    df = load_iam_format_dataset(DATASET_PATH, split='all')
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

print(f"Dataset loaded successfully!")
print(f"Training samples: {len(train_df)}")
print(f"Validation samples: {len(test_df)}")

# Show sample data
print(f"\nSample training data:")
print(train_df.head())

Loading Bentham dataset...
Dataset loaded successfully!
Training samples: 18356
Validation samples: 4590

Sample training data:
         image_id           file_name  \
0  bentham_019702  bentham_019702.png   
1  bentham_016498  bentham_016498.png   
2  bentham_013987  bentham_013987.png   
3  bentham_016775  bentham_016775.png   
4  bentham_017994  bentham_017994.png   

                                                text  \
0  therefore it is further agreed that in pursuan...   
1                                        comparative   
2                                                 29   
3             +  v . Hale ' s Hist . P.C le Common .   
4  guilty , and from which none but the guilty ev...   

                                       image_path  
0  ./bentham_iam_format/images/bentham_019702.png  
1  ./bentham_iam_format/images/bentham_016498.png  
2  ./bentham_iam_format/images/bentham_013987.png  
3  ./bentham_iam_format/images/bentham_016775.png  
4  ./bentham_iam_format/imag

##  Create Dataset and DataLoaders

In [8]:
class BenthamDataset(Dataset):
    def __init__(self, df, processor, max_target_length=128):
        self.df = df
        self.processor = processor
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        image_path = self.df.loc[idx, 'image_path']
        text = self.df.loc[idx, 'text']

        # Process image
        image = Image.open(image_path).convert("RGB")
        pixel_values = self.processor(image, return_tensors="pt").pixel_values

        # Process text
        labels = self.processor.tokenizer(text,
                                        padding="max_length",
                                        max_length=self.max_target_length,
                                        truncation=True,
                                        return_tensors="pt").input_ids

        # Replace PAD tokens with -100 for loss computation
        labels[labels == self.processor.tokenizer.pad_token_id] = -100

        return {
            "pixel_values": pixel_values.squeeze(),
            "labels": labels.squeeze(),
            "text": text  # Keep original text for calibration
        }

In [9]:
# Initialize processor
#processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")

#Added to increase training speed
# Resize images smaller during processing to save compute
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")

# Create datasets
train_dataset = BenthamDataset(df=train_df, processor=processor)
eval_dataset = BenthamDataset(df=test_df, processor=processor)

# Create dataloaders
BATCH_SIZE = 8  # Adjust based on your GPU memory # Use smaller batch size if memory constrained
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
eval_dataloader = DataLoader(eval_dataset, batch_size=BATCH_SIZE, shuffle=False)

#Added to increase training speed
train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=15,  # Increase if possible
    pin_memory=True
)

eval_dataloader = DataLoader(
    eval_dataset,
    batch_size=30,
    shuffle=False,
    num_workers=8,
    pin_memory=True
)


print(f"Created datasets:")
print(f"Batch Size:", BATCH_SIZE)
print(f"Training batches: {len(train_dataloader)}")
print(f"Validation batches: {len(eval_dataloader)}")

# Test dataset
sample = train_dataset[0]
print(f"\nSample shapes:")
for k, v in sample.items():
    if k != 'text':
        print(f"{k}: {v.shape}")
    else:
        print(f"{k}: '{v[:50]}...'")

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

preprocessor_config.json:   0%|          | 0.00/224 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Created datasets:
Batch Size: 8
Training batches: 2295
Validation batches: 153

Sample shapes:
pixel_values: torch.Size([3, 384, 384])
labels: torch.Size([128])
text: 'therefore it is further agreed that in pursuance o...'


## Initialize TrOCR Model

In [10]:
# Load and configure model
print("Loading TrOCR model...")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-stage1")
model.to(device)

# Configure model for Bentham dataset
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.vocab_size = model.config.decoder.vocab_size

# Beam search parameters
model.config.eos_token_id = processor.tokenizer.sep_token_id
model.config.max_length = 64
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 3
model.config.length_penalty = 2.0
model.config.num_beams = 4

print("Model configured for Bentham dataset")

Loading TrOCR model...


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-stage1 and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Model configured for Bentham dataset


## Setup Evaluation Metrics

In [11]:
# Load CER metric
cer_metric = evaluate.load("cer")

def compute_cer(pred_ids, label_ids):
    """Compute Character Error Rate for batches"""
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)

    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)

    cer = cer_metric.compute(predictions=pred_str, references=label_str)
    return cer

def compute_cer_single(pred_str, label_str):
    """Compute CER for single strings"""
    return cer_metric.compute(predictions=[pred_str], references=[label_str])

print("Evaluation metrics ready")

Downloading builder script: 0.00B [00:00, ?B/s]

Evaluation metrics ready


# Phase 1: Standard Fine-tuning

First, we perform standard fine-tuning on the Bentham dataset.

In [None]:
# Training parameters
LEARNING_RATE = 5e-5
NUM_EPOCHS = 2  # Standard fine-tuning epochs

print(f" Starting Standard Fine-tuning")
print(f"Learning rate: {LEARNING_RATE}")
print(f"Epochs: {NUM_EPOCHS}")
print(f"Training on {len(train_dataset)} samples")
print(f"Validating on {len(eval_dataset)} samples")
print(f"Device: {device}")

# Setup optimizer
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

 Starting Standard Fine-tuning
Learning rate: 5e-05
Epochs: 2
Training on 18356 samples
Validating on 4590 samples
Device: cuda


In [None]:
# Standard Fine-tuning Loop
training_history = []

for epoch in range(NUM_EPOCHS):
    print(f"\n{'='*60}")
    print(f"EPOCH {epoch + 1}/{NUM_EPOCHS} - Standard Fine-tuning")
    print(f"{'='*60}")

    # Training phase
    model.train()
    train_loss = 0.0

    from torch.cuda.amp import GradScaler, autocast
    scaler = GradScaler()

    print("Training...")
    for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch + 1}"):
        optimizer.zero_grad()
        pixel_values = batch['pixel_values'].to(device)
        labels = batch['labels'].to(device)

        with autocast():
            outputs = model(pixel_values=pixel_values, labels=labels)
            loss = outputs.loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_dataloader)

    # Validation phase
    model.eval()
    valid_cer = 0.0

    print("Validating...")
    with torch.no_grad():
        for batch in tqdm(eval_dataloader, desc=f"Validation Epoch {epoch + 1}"):
            pixel_values = batch['pixel_values'].to(device)
            labels = batch['labels']

            # Generate predictions
            outputs = model.generate(pixel_values, num_beams=1)

            # Compute CER
            cer = compute_cer(pred_ids=outputs, label_ids=labels)
            valid_cer += cer

    avg_valid_cer = valid_cer / len(eval_dataloader)

    # Log results
    print(f"\n Epoch {epoch + 1} Results:")
    print(f"   Training Loss: {avg_train_loss:.4f}")
    print(f"   Validation CER: {avg_valid_cer:.4f}")

    # Store history
    training_history.append({
        'epoch': epoch + 1,
        'train_loss': avg_train_loss,
        'val_cer': avg_valid_cer
    })

    # Save checkpoint every 2 epochs
    if (epoch + 1) % 2 == 0:
        checkpoint_dir = f"bentham-trocr-epoch-{epoch + 1}"
        model.save_pretrained(checkpoint_dir)
        processor.save_pretrained(checkpoint_dir)
        print(f"Checkpoint saved: {checkpoint_dir}")

print("\n Standard Fine-tuning Completed!")

# Save final fine-tuned model
model.save_pretrained("bentham-trocr-finetuned")
processor.save_pretrained("bentham-trocr-finetuned")
print("Fine-tuned model saved: bentham-trocr-finetuned/")


EPOCH 1/2 - Standard Fine-tuning
Training...


Training Epoch 1:   0%|          | 0/2295 [00:00<?, ?it/s]

Validating...


Validation Epoch 1:   0%|          | 0/153 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['early_stopping', 'length_penalty']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



 Epoch 1 Results:
   Training Loss: 3.9705
   Validation CER: 0.8010

EPOCH 2/2 - Standard Fine-tuning
Training...


Training Epoch 2:   0%|          | 0/2295 [00:00<?, ?it/s]

Validating...


Validation Epoch 2:   0%|          | 0/153 [00:00<?, ?it/s]


 Epoch 2 Results:
   Training Loss: 3.0545
   Validation CER: 0.7389
Checkpoint saved: bentham-trocr-epoch-2

 Standard Fine-tuning Completed!
Fine-tuned model saved: bentham-trocr-finetuned/


In [None]:
# Show training history
print("Training History:")
print("Epoch | Train Loss | Val CER")
print("-" * 30)
for record in training_history:
    print(f"{record['epoch']:5d} | {record['train_loss']:10.4f} | {record['val_cer']:7.4f}")

Training History:
Epoch | Train Loss | Val CER
------------------------------
    1 |     3.9705 |  0.8010
    2 |     3.0545 |  0.7389


#  Phase 2: Model Calibration

Now we calibrate the fine-tuned model to improve confidence predictions and overall performance.

## Load the model

In [12]:
cd /content/drive/MyDrive/My progenAI/dataset/

/content/drive/MyDrive/My progenAI/dataset


In [None]:
mkdir bentham_model

In [13]:
!pwd

/content/drive/MyDrive/My progenAI/dataset


In [None]:
!unzip -q bentham-trocr-finetuned-001.zip -d bentham_model/

In [14]:
#load the model
from transformers import VisionEncoderDecoderModel, TrOCRProcessor

# Paths to your saved fine-tuned model and processor
MODEL_DIR = "/content/drive/MyDrive/My progenAI/dataset/bentham_model/bentham-trocr-finetuned"

# Load the saved fine-tuned model and processor
print("Loading fine-tuned model and processor...")
model = VisionEncoderDecoderModel.from_pretrained(MODEL_DIR)
processor = TrOCRProcessor.from_pretrained(MODEL_DIR)

model.to(device)
model.eval()
print("Bentham Model and processor loaded and ready.")


Loading fine-tuned model and processor...
Bentham Model and processor loaded and ready.


In [15]:
print("MODEL CALIBRATION PHASE")
print("=" * 50)
print("Calibration will:")
print("• Align model confidence with actual performance")
print("• Use beam search to generate multiple candidates")
print("• Apply margin loss to maximize probability gaps")
print("• Improve overall CER and reliability")
print("=" * 50)

MODEL CALIBRATION PHASE
Calibration will:
• Align model confidence with actual performance
• Use beam search to generate multiple candidates
• Apply margin loss to maximize probability gaps
• Improve overall CER and reliability


In [16]:
class TrOCRCalibrator:
    """TrOCR Model Calibrator using Margin Loss"""

    def __init__(self, model, processor, device, num_beams=4):
        self.model = model
        self.processor = processor
        self.device = device
        self.num_beams = num_beams

    def generate_candidates(self, pixel_values, num_return_sequences=4):
        """Generate multiple candidate outputs using beam search"""
        with torch.no_grad():
            outputs = self.model.generate(
                pixel_values,
                num_beams=self.num_beams,
                num_return_sequences=min(num_return_sequences, self.num_beams),
                return_dict_in_generate=True,
                output_scores=True,
                early_stopping=True
            )

        sequences = outputs.sequences
        scores = outputs.sequences_scores

        # Decode sequences
        decoded_sequences = self.processor.batch_decode(sequences, skip_special_tokens=True)

        # Convert scores to probabilities
        probabilities = F.softmax(scores, dim=0)

        return decoded_sequences, probabilities.cpu().numpy()

    def compute_margin_loss(self, pixel_values, ground_truth, margin=0.1):
        """Compute margin loss for calibration (best performing method)"""
        try:
            candidates, probs = self.generate_candidates(pixel_values)

            if len(candidates) < 2:
                return torch.tensor(0.0, device=self.device)

            # Compute CER for each candidate
            cers = []
            for candidate in candidates:
                cer = compute_cer_single(candidate, ground_truth)
                cers.append(cer)

            cers = np.array(cers)
            probs_tensor = torch.tensor(probs, device=self.device, requires_grad=True)

            # Find best and worst candidates
            best_idx = np.argmin(cers)
            worst_idx = np.argmax(cers)

            if best_idx != worst_idx:
                prob_best = probs_tensor[best_idx]
                prob_worst = probs_tensor[worst_idx]

                # Margin loss: maximize gap between best and worst
                loss = torch.clamp(margin - (prob_best - prob_worst), min=0)
                return loss
            else:
                return torch.tensor(0.0, device=self.device)

        except Exception as e:
            return torch.tensor(0.0, device=self.device)

In [17]:
def calibrate_model(model, train_dataloader, processor, device, num_epochs=2, lr=1e-5, max_batches=100):
    """Calibrate the fine-tuned model using margin loss"""

    print(f"Starting Model Calibration...")
    print(f"Calibration epochs: {num_epochs}")
    print(f"Learning rate: {lr}")
    print(f"Max batches per epoch: {max_batches}")

    # Create calibrator
    calibrator = TrOCRCalibrator(model, processor, device)

    # Setup optimizer with lower learning rate
    optimizer = AdamW(model.parameters(), lr=lr)

    model.train()
    calibration_history = []

    for epoch in range(num_epochs):
        total_loss = 0.0
        valid_batches = 0

        print(f"\n=== Calibration Epoch {epoch + 1}/{num_epochs} ===")

        progress_bar = tqdm(train_dataloader, desc=f"Calibration Epoch {epoch + 1}")

        for batch_idx, batch in enumerate(progress_bar):
            if batch_idx >= max_batches:
                break

            optimizer.zero_grad()

            pixel_values = batch['pixel_values'].to(device)
            texts = batch['text']

            batch_loss = 0.0
            valid_samples = 0

            # Process each sample in the batch
            for i in range(len(pixel_values)):
                sample_pixel_values = pixel_values[i:i+1]
                ground_truth = texts[i]

                try:
                    loss = calibrator.compute_margin_loss(sample_pixel_values, ground_truth)
                    if loss.item() > 0:
                        batch_loss += loss
                        valid_samples += 1
                except Exception as e:
                    continue

            if valid_samples > 0:
                batch_loss = batch_loss / valid_samples  # Average over valid samples
                batch_loss.backward()
                optimizer.step()

                total_loss += batch_loss.item()
                valid_batches += 1

                # Update progress bar
                progress_bar.set_postfix({'loss': f'{batch_loss.item():.6f}'})

        avg_loss = total_loss / valid_batches if valid_batches > 0 else 0
        print(f"Average Calibration Loss: {avg_loss:.6f}")
        print(f"Valid batches processed: {valid_batches}")

        calibration_history.append({
            'epoch': epoch + 1,
            'calibration_loss': avg_loss,
            'valid_batches': valid_batches
        })

    print("\nModel Calibration Completed!")
    return model, calibration_history

In [21]:
# Run Calibration
CALIBRATION_EPOCHS = 2
CALIBRATION_LR = 1e-5
MAX_CALIBRATION_BATCHES = 150 # Limit for faster calibration


# Optional subset
import random
subset_size = 500
subset_indices = random.sample(range(len(train_dataset)), subset_size)
subset_dataset = torch.utils.data.Subset(train_dataset, subset_indices)
calibration_dataloader = DataLoader(subset_dataset, batch_size=4, shuffle=True, num_workers=8, pin_memory=True)



print("Running Model Calibration...")
calibrated_model, cal_history = calibrate_model(
    model=model,
    train_dataloader=calibration_dataloader,
    processor=processor,
    device=device,
    num_epochs=CALIBRATION_EPOCHS,
    lr=CALIBRATION_LR,
    max_batches=MAX_CALIBRATION_BATCHES
)

# Save calibrated model
calibrated_model.save_pretrained("bentham-trocr-calibrated")
processor.save_pretrained("bentham-trocr-calibrated")
print("Calibrated model saved: bentham-trocr-calibrated/")

Running Model Calibration...
Starting Model Calibration...
Calibration epochs: 2
Learning rate: 1e-05
Max batches per epoch: 150

=== Calibration Epoch 1/2 ===


Calibration Epoch 1:   0%|          | 0/125 [00:00<?, ?it/s]

Average Calibration Loss: 0.097953
Valid batches processed: 125

=== Calibration Epoch 2/2 ===


Calibration Epoch 2:   0%|          | 0/125 [00:00<?, ?it/s]

Average Calibration Loss: 0.098712
Valid batches processed: 125

Model Calibration Completed!
Calibrated model saved: bentham-trocr-calibrated/


In [22]:
# Show calibration history
print("Calibration History:")
print("Epoch | Calibration Loss | Valid Batches")
print("-" * 40)
for record in cal_history:
    print(f"{record['epoch']:5d} | {record['calibration_loss']:16.6f} | {record['valid_batches']:12d}")

Calibration History:
Epoch | Calibration Loss | Valid Batches
----------------------------------------
    1 |         0.097953 |          125
    2 |         0.098712 |          125


# Phase 3: Model Evaluation & Comparison

Now let's evaluate and compare the performance of both models.

In [50]:
def evaluate_model_detailed(model, eval_dataloader, model_name, confidence_threshold=0.5):
    """Detailed evaluation with confidence scoring"""

    model.eval()

    total_cer = 0.0
    total_prob = 0.0
    correct_count = 0
    incorrect_count = 0
    skipped_count = 0
    num_samples = 0

    all_results = []

    print(f"Evaluating {model_name}...")

    with torch.no_grad():
        for batch in tqdm(eval_dataloader, desc=f"Evaluating {model_name}"):
            pixel_values = batch['pixel_values'].to(device)
            texts = batch['text']

            outputs = model.generate(
                pixel_values,
                return_dict_in_generate=True,
                output_scores=True,
                num_beams=1
            )

            sequences = outputs.sequences

            # sequences_scores not always available - use ones
            try:
                scores = outputs.sequences_scores
                probabilities = torch.sigmoid(scores).cpu().numpy()
            except AttributeError:
                probabilities = np.ones(len(sequences))



            # Decode predictions
            predictions = processor.batch_decode(sequences, skip_special_tokens=True)

            # Process each sample
            for i in range(len(predictions)):
                pred = predictions[i]
                ground_truth = texts[i]
                prob = probabilities[i]

                # Compute CER
                cer = compute_cer_single(pred, ground_truth)

                # Classification based on confidence threshold
                if prob < confidence_threshold:
                    skipped_count += 1
                    classification = "skipped"
                else:
                    if cer == 0:
                        correct_count += 1
                        classification = "correct"
                    else:
                        incorrect_count += 1
                        classification = "incorrect"

                # Store results
                all_results.append({
                    'prediction': pred,
                    'ground_truth': ground_truth,
                    'cer': cer,
                    'confidence': prob,
                    'classification': classification
                })

                total_cer += cer
                total_prob += prob
                num_samples += 1

    # Calculate averages
    avg_cer = total_cer / num_samples if num_samples > 0 else 0
    avg_prob = total_prob / num_samples if num_samples > 0 else 0

    # Print results
    print(f"\n=== {model_name} Results ===")
    print(f"Average CER: {avg_cer:.4f}")
    print(f"Average confidence: {avg_prob:.4f}")
    print(f"Correct: {correct_count}")
    print(f"Incorrect: {incorrect_count}")
    print(f"Skipped (low confidence): {skipped_count}")
    print(f"Total samples: {num_samples}")

    return {
        'avg_cer': avg_cer,
        'avg_confidence': avg_prob,
        'correct': correct_count,
        'incorrect': incorrect_count,
        'skipped': skipped_count,
        'total': num_samples,
        'all_results': all_results
    }

In [51]:
import random
from torch.utils.data import Subset, DataLoader

VALID_SUBSET_SIZE = 50 # Pick 50-100 for very fast turnaround

# Randomly choose subset indices
subset_indices = random.sample(range(len(eval_dataset)), min(VALID_SUBSET_SIZE, len(eval_dataset)))
eval_subset = Subset(eval_dataset, subset_indices)

# Use a small batch size for quick inference per batch
eval_subset_loader = DataLoader(
    eval_subset,
    batch_size=2,  # 2–4 recommended for autoregressive evaluation speed/memory
    shuffle=False,
    num_workers=4,
    pin_memory=True
)

In [52]:
# Load original fine-tuned model for fair comparison
print("Loading original fine-tuned model for comparison...")
BENTHAM_MODEL_DIR = "/content/drive/MyDrive/My progenAI/dataset/bentham_model/bentham-trocr-finetuned"
original_model = VisionEncoderDecoderModel.from_pretrained(BENTHAM_MODEL_DIR)
original_model.to(device)

# Evaluate both models
print("\n COMPREHENSIVE MODEL EVALUATION")
print("=" * 50)


original_results = evaluate_model_detailed(original_model, eval_subset_loader, "Original Fine-tuned Model")
calibrated_results = evaluate_model_detailed(calibrated_model, eval_subset_loader, "Calibrated Model")

Loading original fine-tuned model for comparison...

 COMPREHENSIVE MODEL EVALUATION
Evaluating Original Fine-tuned Model...


Evaluating Original Fine-tuned Model:   0%|          | 0/25 [00:00<?, ?it/s]


=== Original Fine-tuned Model Results ===
Average CER: 0.7327
Average confidence: 1.0000
Correct: 5
Incorrect: 45
Skipped (low confidence): 0
Total samples: 50
Evaluating Calibrated Model...


Evaluating Calibrated Model:   0%|          | 0/25 [00:00<?, ?it/s]


=== Calibrated Model Results ===
Average CER: 0.7327
Average confidence: 1.0000
Correct: 5
Incorrect: 45
Skipped (low confidence): 0
Total samples: 50


In [53]:
# Comprehensive Results Comparison
def compare_models(original_results, calibrated_results):
    """Compare original vs calibrated model performance"""

    print("\n" + "="*70)
    print(" FINAL RESULTS: ORIGINAL vs CALIBRATED MODEL")
    print("="*70)

    # CER improvement
    if original_results['avg_cer'] > 0:
        cer_improvement = ((original_results['avg_cer'] - calibrated_results['avg_cer'])
                          / original_results['avg_cer']) * 100
        print(f" CHARACTER ERROR RATE (CER):")
        print(f"   Original CER:    {original_results['avg_cer']:.4f}")
        print(f"   Calibrated CER:  {calibrated_results['avg_cer']:.4f}")
        print(f"   Improvement:     {cer_improvement:+.1f}%")

        if cer_improvement > 0:
            print(f"   CER IMPROVED by {cer_improvement:.1f}%!")
        else:
            print(f"   CER worsened by {abs(cer_improvement):.1f}%")

    # Confidence improvement
    if original_results['avg_confidence'] > 0:
        conf_improvement = ((calibrated_results['avg_confidence'] - original_results['avg_confidence'])
                           / original_results['avg_confidence']) * 100
        print(f"\n CONFIDENCE SCORES:")
        print(f"   Original Confidence:  {original_results['avg_confidence']:.4f}")
        print(f"   Calibrated Confidence: {calibrated_results['avg_confidence']:.4f}")
        print(f"   Change:               {conf_improvement:+.1f}%")

    # Accuracy comparison
    orig_total_classified = original_results['correct'] + original_results['incorrect']
    cal_total_classified = calibrated_results['correct'] + calibrated_results['incorrect']

    if orig_total_classified > 0 and cal_total_classified > 0:
        orig_accuracy = original_results['correct'] / orig_total_classified * 100
        cal_accuracy = calibrated_results['correct'] / cal_total_classified * 100

        print(f"\n ACCURACY (Correct Predictions):")
        print(f"   Original Accuracy:   {orig_accuracy:.1f}% ({original_results['correct']}/{orig_total_classified})")
        print(f"   Calibrated Accuracy: {cal_accuracy:.1f}% ({calibrated_results['correct']}/{cal_total_classified})")
        print(f"   Change:              {cal_accuracy - orig_accuracy:+.1f} percentage points")

    # Skipped examples comparison
    print(f"\n LOW CONFIDENCE SAMPLES (Skipped):")
    print(f"   Original Skipped:   {original_results['skipped']}")
    print(f"   Calibrated Skipped: {calibrated_results['skipped']}")

    if original_results['skipped'] > 0:
        skip_change = ((original_results['skipped'] - calibrated_results['skipped'])
                      / original_results['skipped']) * 100
        print(f"   Change:             {skip_change:+.1f}%")

        if skip_change > 0:
            print(f"    Fewer samples skipped (more confident predictions)")
        elif skip_change < 0:
            print(f"     More samples skipped (more conservative)")

    print("\n" + "="*70)
    print(" CALIBRATION SUMMARY:")

    if original_results['avg_cer'] > calibrated_results['avg_cer']:
        print(" Model calibration was SUCCESSFUL!")
        print("   • Better Character Error Rate")
        print("   • More reliable confidence scores")
        print("   • Ready for production use")
    else:
        print(" Calibration results mixed - consider different parameters")
        print("   • May need more calibration epochs")
        print("   • Try different calibration methods")
        print("   • Check dataset quality")

    print("="*70)

# Run comparison
compare_models(original_results, calibrated_results)


 FINAL RESULTS: ORIGINAL vs CALIBRATED MODEL
 CHARACTER ERROR RATE (CER):
   Original CER:    0.7327
   Calibrated CER:  0.7327
   Improvement:     +0.0%
   CER worsened by 0.0%

 CONFIDENCE SCORES:
   Original Confidence:  1.0000
   Calibrated Confidence: 1.0000
   Change:               +0.0%

 ACCURACY (Correct Predictions):
   Original Accuracy:   10.0% (5/50)
   Calibrated Accuracy: 10.0% (5/50)
   Change:              +0.0 percentage points

 LOW CONFIDENCE SAMPLES (Skipped):
   Original Skipped:   0
   Calibrated Skipped: 0

 CALIBRATION SUMMARY:
 Calibration results mixed - consider different parameters
   • May need more calibration epochs
   • Try different calibration methods
   • Check dataset quality


## Sample Predictions Comparison

In [54]:
def show_sample_predictions(original_model, calibrated_model, eval_dataset, num_samples=5):
    """Show side-by-side predictions from both models"""

    original_model.eval()
    calibrated_model.eval()

    print(f" SAMPLE PREDICTIONS COMPARISON")
    print("=" * 60)

    for i in range(min(num_samples, len(eval_dataset))):
        sample = eval_dataset[i]
        pixel_values = sample['pixel_values'].unsqueeze(0).to(device)
        ground_truth = sample['text']

        print(f"\n--- Sample {i + 1} ---")
        print(f"Ground Truth: '{ground_truth}'")

        # Original model prediction
        with torch.no_grad():
            orig_outputs = original_model.generate(
                pixel_values,
                return_dict_in_generate=True,
                output_scores=True
            )
            orig_pred = processor.batch_decode(orig_outputs.sequences, skip_special_tokens=True)[0]
            orig_conf = torch.sigmoid(orig_outputs.sequences_scores[0]).item()

        # Calibrated model prediction
        with torch.no_grad():
            cal_outputs = calibrated_model.generate(
                pixel_values,
                return_dict_in_generate=True,
                output_scores=True
            )
            cal_pred = processor.batch_decode(cal_outputs.sequences, skip_special_tokens=True)[0]
            cal_conf = torch.sigmoid(cal_outputs.sequences_scores[0]).item()

        # Compute CERs
        orig_cer = compute_cer_single(orig_pred, ground_truth)
        cal_cer = compute_cer_single(cal_pred, ground_truth)

        # Display results
        print(f"\nOriginal Model:")
        print(f"  Prediction: '{orig_pred}'")
        print(f"  Confidence: {orig_conf:.4f}")
        print(f"  CER:        {orig_cer:.4f}")

        print(f"Calibrated Model:")
        print(f"  Prediction: '{cal_pred}'")
        print(f"  Confidence: {cal_conf:.4f}")
        print(f"  CER:        {cal_cer:.4f}")

        # Show improvement/change
        if cal_cer < orig_cer:
            print(f"   CER Improved by {(orig_cer - cal_cer):.4f}")
        elif cal_cer > orig_cer:
            print(f"   CER Worsened by {(cal_cer - orig_cer):.4f}")
        else:
            print(f"  ➖ CER Unchanged")

        if abs(cal_conf - orig_conf) > 0.01:
            conf_change = cal_conf - orig_conf
            print(f"   Confidence changed by {conf_change:+.4f}")

        print("-" * 60)

# Show sample comparisons
show_sample_predictions(original_model, calibrated_model, eval_dataset, num_samples=3)

 SAMPLE PREDICTIONS COMPARISON

--- Sample 1 ---
Ground Truth: 'in general , or to the party in particular .'

Original Model:
  Prediction: ' general or in in in particular . .'
  Confidence: 0.4794
  CER:        0.4091
Calibrated Model:
  Prediction: ' general or in in in particular . .'
  Confidence: 0.4794
  CER:        0.4091
  ➖ CER Unchanged
------------------------------------------------------------

--- Sample 2 ---
Ground Truth: 'upon specific acts  :  upon the wounding  , starving or poi'

Original Model:
  Prediction: 'upon acts    upon , ; , , , or ,- , or'
  Confidence: 0.4845
  CER:        0.5593
Calibrated Model:
  Prediction: 'upon acts    upon , ; , , , or ,- , or'
  Confidence: 0.4845
  CER:        0.5593
  ➖ CER Unchanged
------------------------------------------------------------

--- Sample 3 ---
Ground Truth: 'it necessary , and thereby preserve them from be-'

Original Model:
  Prediction: 'it necessary and presence be--'
  Confidence: 0.4685
  CER:        0.4