# ✍️ Handwriting Text Recognition (HTR) Model

This notebook covers:
1. **Architecture**: CNN + BiLSTM + CTC Loss
2. **Datasets**: Synthetic text images for now; IAM and EMNIST integration is planned
3. **Training**: PyTorch implementation
4. **Inference**: Text extraction from images

---

## 1. Setup & Imports

In [4]:
# Install dependencies if needed
# !pip install torch torchvision torchaudio
# !pip install matplotlib numpy pillow

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from pathlib import Path
from typing import List, Tuple, Dict, Optional
import warnings
warnings.filterwarnings('ignore')

# Check device
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"PyTorch version: {torch.__version__}")
print(f"Device: {device}")

PyTorch version: 2.9.1
Device: cpu


---
## 2. Character Set & Encoding

In [6]:
class CharacterSet:
    """
    Character vocabulary for HTR.

    Includes:
    - Lowercase letters (a-z)
    - Uppercase letters (A-Z)
    - Digits (0-9)
    - Common punctuation and math symbols
    - Special token: <blank> (index 0, reserved for CTC)

    Indices 1..len(chars) map to the characters of ``chars`` in order.
    ``unk_token`` is kept as an attribute, but it has no index of its own:
    characters outside the vocabulary are dropped by ``encode`` rather than
    being mapped to the blank index, which is not a valid CTC target.
    """

    def __init__(self):
        # Character vocabulary
        self.chars = (
            'abcdefghijklmnopqrstuvwxyz'
            'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
            '0123456789'
            ' .,:;!?\'-+=*/()[]{}@#$%&'
        )

        # Special tokens
        self.blank_token = '<blank>'  # CTC blank
        self.unk_token = '<unk>'      # Unknown character (not part of the vocabulary)

        # Create mappings; index 0 is reserved for the CTC blank
        self.char_to_idx = {c: i + 1 for i, c in enumerate(self.chars)}
        self.char_to_idx[self.blank_token] = 0
        self.idx_to_char = {v: k for k, v in self.char_to_idx.items()}

        self.vocab_size = len(self.char_to_idx)

    def encode(self, text: str) -> List[int]:
        """
        Convert text to index sequence.

        Characters that are not in the vocabulary are skipped, so the result
        never contains the blank index 0 and may be shorter than ``text``.
        """
        lookup = self.char_to_idx
        # Iterating a str yields single characters, so the multi-character
        # '<blank>' key can never match here.
        return [lookup[c] for c in text if c in lookup]

    def decode(self, indices: List[int], remove_blanks: bool = True) -> str:
        """
        Convert index sequence to text.

        Args:
            indices: Sequence of vocabulary indices.
            remove_blanks: When True, apply CTC collapsing (merge consecutive
                duplicates, then drop blanks). When False, map every index to
                its symbol verbatim; index 0 is rendered as '<blank>'.

        Returns:
            The decoded string. Indices outside the vocabulary contribute
            nothing.
        """
        if remove_blanks:
            # CTC decoding: remove consecutive duplicates and blanks
            prev = None
            chars = []
            for idx in indices:
                if idx != prev and idx != 0:  # Not blank and not duplicate
                    chars.append(self.idx_to_char.get(idx, ''))
                # A blank also resets `prev`, so "a, blank, a" decodes to "aa"
                prev = idx
            return ''.join(chars)
        else:
            return ''.join(self.idx_to_char.get(idx, '') for idx in indices)


# Test
# Build the shared vocabulary, then encode a sample string and decode a
# sample index sequence that contains blanks (index 0).
charset = CharacterSet()
print(f"Vocabulary size: {charset.vocab_size}")
print(f"Encode 'F = ma': {charset.encode('F = ma')}")
print(f"Decode [6, 0, 62, 0, 13, 1]: {charset.decode([6, 0, 62, 0, 13, 1])}")

Vocabulary size: 87
Encode 'F = ma': [32, 63, 73, 63, 13, 1]
Decode [6, 0, 62, 0, 13, 1]: f9ma


---
## 3. HTR Model Architecture

**CNN + BiLSTM + CTC**

```
Image → CNN (feature extraction) → BiLSTM (sequence modeling) → Linear → CTC Loss
```

| Component | Purpose |
|-----------|--------|
| CNN | Extract visual features from image regions |
| BiLSTM | Model left-to-right and right-to-left context |
| CTC | Alignment-free sequence loss |


In [None]:
class CNNBackbone(nn.Module):
    """
    Convolutional feature extractor for HTR.

    Seven Conv2d -> BatchNorm2d -> ReLU stages:
    - stages 1 and 2 are followed by 2x2 max pooling (height and width halved)
    - stages 4 and 6 are followed by (2, 1) max pooling (height halved only)
    - stage 7 is an unpadded 2x2 convolution producing 512 channels

    Output: (batch, 512, H', W') feature map.
    """

    def __init__(self, input_channels: int = 1):
        super().__init__()

        # One row per stage: (out_channels, kernel_size, padding, pool kernel or None)
        stage_specs = [
            (32, 3, 1, 2),
            (64, 3, 1, 2),
            (128, 3, 1, None),
            (128, 3, 1, (2, 1)),
            (256, 3, 1, None),
            (256, 3, 1, (2, 1)),
            (512, 2, 0, None),
        ]

        layers = []
        in_channels = input_channels
        for out_channels, kernel, pad, pool in stage_specs:
            layers.append(
                nn.Conv2d(in_channels, out_channels, kernel_size=kernel, stride=1, padding=pad)
            )
            layers.append(nn.BatchNorm2d(out_channels))
            layers.append(nn.ReLU(inplace=True))
            if pool is not None:
                # Stride equals the pooling kernel in every pooled stage
                layers.append(nn.MaxPool2d(kernel_size=pool, stride=pool))
            in_channels = out_channels

        # Modules are appended in a fixed order so Sequential indices
        # (and therefore state_dict keys) stay stable.
        self.cnn = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        feature_map = self.cnn(x)
        return feature_map


class HTRModel(nn.Module):
    """
    Complete HTR Model: CNN + BiLSTM + CTC

    Input: (batch, 1, height, width) grayscale image
    Output: (seq_len, batch, vocab_size) log probabilities

    Args:
        vocab_size: Number of classes predicted per time step.
        input_height: Currently unused; kept so existing callers that pass it
            keep working. The forward pass averages over whatever feature-map
            height the CNN produces.
        hidden_size: Hidden size of each LSTM direction.
        num_lstm_layers: Number of stacked BiLSTM layers.
        dropout: Dropout between LSTM layers; only applied when
            ``num_lstm_layers > 1``.
    """

    def __init__(
        self,
        vocab_size: int,
        input_height: int = 64,
        hidden_size: int = 256,
        num_lstm_layers: int = 2,
        dropout: float = 0.3
    ):
        super().__init__()

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size

        # CNN backbone
        self.cnn = CNNBackbone(input_channels=1)

        # Features per time step: the 512 channels of the last conv stage.
        # Any remaining feature-map height is averaged away in forward().
        self.cnn_output_size = 512

        # BiLSTM
        self.lstm = nn.LSTM(
            input_size=self.cnn_output_size,
            hidden_size=hidden_size,
            num_layers=num_lstm_layers,
            batch_first=False,
            bidirectional=True,
            dropout=dropout if num_lstm_layers > 1 else 0
        )

        # Output projection
        self.fc = nn.Linear(hidden_size * 2, vocab_size)  # *2 for bidirectional

        # Log softmax over the class axis, as expected by CTC loss
        self.log_softmax = nn.LogSoftmax(dim=2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Forward pass.

        Args:
            x: (batch, 1, H, W) input images

        Returns:
            (seq_len, batch, vocab_size) log probabilities
        """
        # CNN features: (batch, 512, H', W')
        features = self.cnn(x)

        # Collapse the height axis by averaging so the model works whether
        # H' is 1 or larger, then put the width axis first as the time axis.
        features = features.mean(dim=2)  # (batch, 512, W')
        features = features.permute(2, 0, 1)  # (W', batch, 512)

        # BiLSTM: (seq, batch, hidden*2)
        lstm_out, _ = self.lstm(features)

        # Projection: (seq, batch, vocab)
        output = self.fc(lstm_out)

        # Log softmax
        return self.log_softmax(output)


# Test model
# Instantiate the network with the shared vocabulary and report its size.
model = HTRModel(vocab_size=charset.vocab_size)
print(f"\nModel architecture:")
print(f"  Vocab size: {charset.vocab_size}")
print(f"  Parameters: {sum(p.numel() for p in model.parameters()):,}")

# Test forward pass
# Push a random batch through the model to check the output layout.
test_input = torch.randn(2, 1, 64, 256)  # (batch=2, channels=1, H=64, W=256)
test_output = model(test_input)
print(f"\nTest:")
print(f"  Input shape: {test_input.shape}")
print(f"  Output shape: {test_output.shape}  # (seq_len, batch, vocab)")


Model architecture:
  Vocab size: 87
  Parameters: 4,851,607


RuntimeError: permute(sparse_coo): number of dimensions in the tensor input does not match the length of the desired ordering of dimensions i.e. input.dim() = 4 is not equal to len(dims) = 3

---
## 4. Synthetic Dataset for Training

For initial development, we create synthetic handwriting-like data: text is rendered with PIL and then augmented with NumPy noise and OpenCV blur/erosion/dilation.

In [None]:
import random
import cv2
from PIL import Image, ImageDraw, ImageFont

class SyntheticHTRDataset(Dataset):
    """
    Generate synthetic handwriting-like images for training.

    Text is rendered with PIL using DejaVu Sans at a random size (falling
    back to PIL's default font when that file cannot be loaded) and then
    randomly augmented. Samples are generated on the fly: every
    ``__getitem__`` call draws a new random text, so the index is ignored.
    """

    def __init__(
        self,
        charset: CharacterSet,
        num_samples: int = 10000,
        img_height: int = 64,
        img_width: int = 256,
        max_text_len: int = 20
    ):
        self.charset = charset
        self.num_samples = num_samples
        self.img_height = img_height
        self.img_width = img_width
        self.max_text_len = max_text_len

        # Sample texts (formulas, words, numbers)
        self.sample_texts = [
            "F = ma", "E = mc2", "a + b = c", "x = 5",
            "Answer", "Calculate", "Solve", "Find x",
            "Step 1", "Step 2", "Therefore", "Hence",
            "123", "456", "789", "100",
            "y = mx + b", "Area = lw", "V = IR",
            "sin x", "cos x", "tan x", "log x",
        ]

    def __len__(self):
        # Nominal epoch size; samples themselves are generated on demand
        return self.num_samples

    def generate_text(self) -> str:
        """Return a predefined sample or a random string, each with probability 0.5."""
        if random.random() < 0.5:
            # Use predefined sample
            return random.choice(self.sample_texts)
        else:
            # Generate random string of 3..max_text_len vocabulary characters
            length = random.randint(3, self.max_text_len)
            chars = [random.choice(self.charset.chars) for _ in range(length)]
            return ''.join(chars)

    def render_text(self, text: str) -> np.ndarray:
        """Render text onto a white grayscale canvas and apply augmentations."""
        # Create blank image
        img = Image.new('L', (self.img_width, self.img_height), color=255)
        draw = ImageDraw.Draw(img)

        # Use default font (for simplicity - in production use handwriting fonts)
        try:
            font_size = random.randint(20, 36)
            font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", font_size)
        except OSError:
            # Font file missing or unreadable: fall back to PIL's built-in
            # font. Only OSError is caught so unrelated errors (and
            # KeyboardInterrupt) are no longer swallowed.
            font = ImageFont.load_default()

        # Calculate text position
        bbox = draw.textbbox((0, 0), text, font=font)
        text_width = bbox[2] - bbox[0]
        text_height = bbox[3] - bbox[1]

        # Random horizontal offset with a 5 px margin; when the text is wider
        # than the canvas the offset is pinned to 5 and the text is clipped.
        x = random.randint(5, max(5, self.img_width - text_width - 5))
        # Vertically centered with up to 5 px of jitter
        y = (self.img_height - text_height) // 2 + random.randint(-5, 5)

        # Draw text in a random dark gray
        draw.text((x, y), text, font=font, fill=random.randint(0, 50))

        # Convert to numpy
        img_array = np.array(img)

        # Add augmentations
        img_array = self.augment(img_array)

        return img_array

    def augment(self, img: np.ndarray) -> np.ndarray:
        """Apply random augmentations (noise, blur, erosion/dilation)."""
        # Random noise; int16 arithmetic avoids uint8 wrap-around before clipping
        if random.random() < 0.3:
            noise = np.random.normal(0, 10, img.shape).astype(np.int16)
            img = np.clip(img.astype(np.int16) + noise, 0, 255).astype(np.uint8)

        # Random blur
        if random.random() < 0.2:
            kernel_size = random.choice([3, 5])
            img = cv2.GaussianBlur(img, (kernel_size, kernel_size), 0)

        # Random erosion/dilation
        if random.random() < 0.2:
            kernel = np.ones((2, 2), np.uint8)
            if random.random() < 0.5:
                img = cv2.erode(img, kernel, iterations=1)
            else:
                img = cv2.dilate(img, kernel, iterations=1)

        return img

    def __getitem__(self, idx) -> Tuple[torch.Tensor, torch.Tensor, int]:
        """
        Get a freshly generated sample.

        ``idx`` is ignored; each call produces new random text.

        Returns:
            (image, label, label_length): image is a (1, H, W) float tensor
            scaled to [0, 1], label is a LongTensor of character indices.
        """
        text = self.generate_text()
        img = self.render_text(text)

        # Normalize image
        img = img.astype(np.float32) / 255.0

        # Add channel dimension
        img = torch.FloatTensor(img).unsqueeze(0)  # (1, H, W)

        # Encode text
        label = torch.LongTensor(self.charset.encode(text))
        label_length = len(label)

        return img, label, label_length


# Test dataset
dataset = SyntheticHTRDataset(charset, num_samples=100)
print(f"Dataset size: {len(dataset)}")

# Visualize samples
# Show a 2x4 grid of samples, each titled with its ground-truth text.
fig, axes = plt.subplots(2, 4, figsize=(16, 6))
for ax in axes.flatten():
    idx = random.randint(0, len(dataset) - 1)
    img, label, length = dataset[idx]
    # remove_blanks=False: the label is a plain index list, not a CTC path,
    # so repeated characters must not be collapsed.
    text = charset.decode(label.tolist(), remove_blanks=False)
    ax.imshow(img.squeeze(), cmap='gray')
    ax.set_title(f'"{text}"')
    ax.axis('off')
plt.tight_layout()
plt.show()

---
## 5. Training Loop with CTC Loss

In [None]:
def collate_fn(batch):
    """
    Collate (image, label, label_length) samples into batch tensors.

    Images are stacked as-is; labels are right-padded with zeros up to the
    longest label in the batch, and the true lengths are returned alongside.
    """
    image_batch = torch.stack([sample[0] for sample in batch])
    targets = [sample[1] for sample in batch]
    target_lengths = torch.LongTensor([sample[2] for sample in batch])

    # Zero-filled (batch, longest_label) buffer, filled row by row
    widest = max(map(len, targets))
    padded = torch.zeros(len(batch), widest, dtype=torch.long)
    for row, target in zip(padded, targets):
        # `row` is a view into `padded`, so this writes in place
        row[:len(target)] = target

    return image_batch, padded, target_lengths


def train_htr_model(
    model: nn.Module,
    train_dataset: Dataset,
    epochs: int = 10,
    batch_size: int = 32,
    learning_rate: float = 0.001,
    device: str = 'cpu'
):
    """
    Train HTR model with CTC loss.

    Args:
        model: Network returning (seq_len, batch, vocab) log probabilities.
        train_dataset: Dataset yielding (image, label, label_length) samples.
        epochs: Number of passes over the dataset.
        batch_size: Samples per batch.
        learning_rate: Initial Adam learning rate (halved every 5 epochs).
        device: Device the model and batches are moved to.

    Returns:
        Dict with key 'loss': the mean batch loss of each epoch. An epoch
        without any batches (empty dataset) is recorded as NaN.
    """
    model = model.to(device)
    model.train()

    # DataLoader
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate_fn,
        num_workers=0
    )

    # Loss and optimizer
    # zero_infinity=True zeroes out infinite losses instead of propagating them
    criterion = nn.CTCLoss(blank=0, reduction='mean', zero_infinity=True)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

    history = {'loss': []}

    for epoch in range(epochs):
        epoch_loss = 0.0
        num_batches = 0

        for images, labels, label_lengths in train_loader:
            images = images.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()

            # Forward pass: (seq_len, batch, vocab)
            outputs = model(images)

            # CTC loss
            # input_lengths: all sequences have same length (width of feature map)
            input_lengths = torch.full(
                (images.size(0),), outputs.size(0), dtype=torch.long, device=device
            )

            # The padded (batch, max_len) labels are passed directly; the
            # per-sample label_lengths tell the loss where each target ends.
            loss = criterion(outputs, labels, input_lengths, label_lengths)

            # Backward pass
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
            optimizer.step()

            epoch_loss += loss.item()
            num_batches += 1

        # An empty loader yields no batches; report NaN instead of dividing by zero
        avg_loss = epoch_loss / num_batches if num_batches else float('nan')
        history['loss'].append(avg_loss)
        scheduler.step()

        print(f"Epoch [{epoch+1}/{epochs}] Loss: {avg_loss:.4f}")

    return history


# Quick training demo
# Train a fresh model for a few epochs on a small synthetic dataset;
# `device` is not passed, so train_htr_model uses its default.
print("Training HTR model (demo with small dataset)...")
small_dataset = SyntheticHTRDataset(charset, num_samples=200)
model = HTRModel(vocab_size=charset.vocab_size)
history = train_htr_model(model, small_dataset, epochs=3, batch_size=16)

# Plot loss
# One point per epoch (mean batch loss).
plt.figure(figsize=(8, 4))
plt.plot(history['loss'])
plt.xlabel('Epoch')
plt.ylabel('CTC Loss')
plt.title('Training Loss')
plt.grid(True)
plt.show()

---
## 6. Inference: Image to Text

In [None]:
def recognize_text(model: nn.Module, image: np.ndarray, charset: CharacterSet) -> str:
    """
    Recognize text from image.

    Puts the model in eval mode, runs one forward pass on the model's own
    device and greedily decodes the result.

    Args:
        model: Trained HTR model
        image: Grayscale image (H, W) or (1, H, W)
        charset: Character set for decoding

    Returns:
        Recognized text string
    """
    model.eval()

    # Preprocess
    if len(image.shape) == 2:
        image = image[np.newaxis, ...]  # (H, W) -> (1, H, W)
    if image.shape[0] != 1:
        # Leading axis is not a singleton: prepend one as the batch axis
        image = image[np.newaxis, ...]

    # Normalize: values above 1 are treated as 0-255 intensities
    if image.max() > 1:
        image = image.astype(np.float32) / 255.0

    # To tensor
    image_tensor = torch.FloatTensor(image)
    if image_tensor.dim() == 3:
        image_tensor = image_tensor.unsqueeze(0)  # Add batch dim

    # Run on the device the model lives on; a model that was moved to the GPU
    # would otherwise fail on a CPU input tensor.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        image_tensor = image_tensor.to(first_param.device)

    # Forward pass
    with torch.no_grad():
        outputs = model(image_tensor)  # (seq, batch, vocab)

    # Argmax decoding (greedy)
    _, predictions = outputs.max(2)  # (seq, batch)
    predictions = predictions.squeeze(1).tolist()  # (seq,)

    # Decode: collapse repeats and drop blanks
    text = charset.decode(predictions, remove_blanks=True)

    return text


# Test inference
# Draw one sample from the demo dataset and compare the model's prediction
# with the ground-truth text.
print("Testing inference...")
test_img, test_label, _ = small_dataset[0]
test_img_np = test_img.squeeze().numpy()
# remove_blanks=False keeps repeated characters of the ground-truth label
true_text = charset.decode(test_label.tolist(), remove_blanks=False)

predicted_text = recognize_text(model, test_img_np, charset)

plt.figure(figsize=(10, 3))
plt.imshow(test_img_np, cmap='gray')
plt.title(f'True: "{true_text}"\nPredicted: "{predicted_text}"')
plt.axis('off')
plt.show()

---
## 7. Export Model

Export the model code for production use.

In [None]:
# Save model weights
model_path = Path("../models/htr_model.pth")
model_path.parent.mkdir(parents=True, exist_ok=True)
torch.save(model.state_dict(), model_path)
print(f"✅ Model saved to: {model_path.resolve()}")

# Export model class to Python file
# A raw string is used so the exported source is written exactly as it should
# appear in the file. The earlier escaped version un-escaped the punctuation
# line into an invalid string literal, so the exported module did not parse.
HTR_MODEL_CODE = r'''
"""
HTR Model: CNN + BiLSTM + CTC

Handwriting Text Recognition model for answer sheet evaluation.
"""
import torch
import torch.nn as nn
import numpy as np
from typing import List


class CharacterSet:
    """Character vocabulary for HTR."""
    
    def __init__(self):
        self.chars = (
            'abcdefghijklmnopqrstuvwxyz'
            'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
            '0123456789'
            ' .,:;!?\'-+=*/()[]{}@#$%&'
        )
        self.blank_token = '<blank>'
        self.char_to_idx = {c: i + 1 for i, c in enumerate(self.chars)}
        self.char_to_idx[self.blank_token] = 0
        self.idx_to_char = {v: k for k, v in self.char_to_idx.items()}
        self.vocab_size = len(self.char_to_idx)
    
    def encode(self, text: str) -> List[int]:
        return [self.char_to_idx.get(c, 0) for c in text]
    
    def decode(self, indices: List[int], remove_blanks: bool = True) -> str:
        if remove_blanks:
            prev = None
            chars = []
            for idx in indices:
                if idx != prev and idx != 0:
                    chars.append(self.idx_to_char.get(idx, ''))
                prev = idx
            return ''.join(chars)
        return ''.join(self.idx_to_char.get(idx, '') for idx in indices)


class CNNBackbone(nn.Module):
    """CNN feature extractor."""
    
    def __init__(self, input_channels: int = 1):
        super().__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(input_channels, 32, 3, 1, 1), nn.BatchNorm2d(32), nn.ReLU(), nn.MaxPool2d(2, 2),
            nn.Conv2d(32, 64, 3, 1, 1), nn.BatchNorm2d(64), nn.ReLU(), nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, 3, 1, 1), nn.BatchNorm2d(128), nn.ReLU(),
            nn.Conv2d(128, 128, 3, 1, 1), nn.BatchNorm2d(128), nn.ReLU(), nn.MaxPool2d((2, 1)),
            nn.Conv2d(128, 256, 3, 1, 1), nn.BatchNorm2d(256), nn.ReLU(),
            nn.Conv2d(256, 256, 3, 1, 1), nn.BatchNorm2d(256), nn.ReLU(), nn.MaxPool2d((2, 1)),
            nn.Conv2d(256, 512, 2, 1, 0), nn.BatchNorm2d(512), nn.ReLU(),
        )
    
    def forward(self, x): return self.cnn(x)


class HTRModel(nn.Module):
    """CNN + BiLSTM + CTC model."""
    
    def __init__(self, vocab_size: int, hidden_size: int = 256, num_layers: int = 2):
        super().__init__()
        self.cnn = CNNBackbone()
        self.lstm = nn.LSTM(512, hidden_size, num_layers, batch_first=False, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, vocab_size)
        self.log_softmax = nn.LogSoftmax(dim=2)
    
    def forward(self, x):
        # Average over the feature-map height (same as the training model) so
        # inputs whose CNN output height is not 1 are handled correctly.
        features = self.cnn(x).mean(dim=2).permute(2, 0, 1)
        lstm_out, _ = self.lstm(features)
        return self.log_softmax(self.fc(lstm_out))


def recognize_text(model: nn.Module, image: np.ndarray, charset: CharacterSet) -> str:
    """Recognize text from image."""
    model.eval()
    if len(image.shape) == 2:
        image = image[np.newaxis, np.newaxis, ...]
    elif len(image.shape) == 3:
        image = image[np.newaxis, ...]
    
    if image.max() > 1:
        image = image.astype(np.float32) / 255.0
    
    with torch.no_grad():
        outputs = model(torch.FloatTensor(image))
    
    _, preds = outputs.max(2)
    return charset.decode(preds.squeeze(1).tolist())
'''

model_code_path = Path("../models/htr_model.py")
model_code_path.write_text(HTR_MODEL_CODE)
print(f"✅ Model code exported to: {model_code_path.resolve()}")

---
## Summary

This notebook implemented:

| Component | Description |
|-----------|-------------|
| `CharacterSet` | Char-index mapping for 80+ characters |
| `CNNBackbone` | 7-layer CNN for visual features |
| `HTRModel` | CNN + BiLSTM + CTC architecture |
| `SyntheticHTRDataset` | Generate training data |
| `train_htr_model()` | Training loop with CTC loss |
| `recognize_text()` | Inference function |

**Next steps:**
1. Train on real IAM dataset
2. Add beam search decoding
3. Integrate with evaluation pipeline

**Next notebook:** Answer Scoring