# Handwritten Text Recognition (HTR) — Notebook

This notebook contains a practical pipeline for converting handwritten note images to editable text using OpenCV preprocessing and a pretrained TrOCR model (Hugging Face). Run the cells sequentially. Where long installations or downloads are required, follow the comments in the first cell.

----

In [2]:
# 1 — Imports
import os
import glob
from PIL import Image
import numpy as np
import cv2
import matplotlib.pyplot as plt
from tqdm import tqdm

from transformers import TrOCRProcessor, VisionEncoderDecoderModel
import torch

from jiwer import wer, cer

print('imports ready')

  from .autonotebook import tqdm as notebook_tqdm
The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.
0it [00:00, ?it/s]


imports ready


## Device Configuration
### Description

This block detects whether a GPU is available and selects the appropriate computation device. Using a GPU significantly improves inference speed for transformer-based OCR models.

In [None]:
import torch
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_NAME = "microsoft/trocr-small-handwritten"

print("Loading processor (slow tokenizer)...")
processor = TrOCRProcessor.from_pretrained(
    MODEL_NAME,
    use_fast=False
)

print("Loading model...")
model = VisionEncoderDecoderModel.from_pretrained(
    MODEL_NAME
).to(DEVICE)

GEN_KWARGS = {
    "max_length": 512,
    "num_beams": 4,
    "early_stopping": True
}

print("Model ready ")


Loading processor (slow tokenizer)...




Loading model...


Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-small-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model ready ✅


## Model Configuration
### Description

Defines the pretrained TrOCR model to be used for handwritten text recognition. Generation parameters such as maximum output length and beam search settings are also initialized here.

In [21]:
# 3 — Image preprocessing utilities

def show_image(img, title=None):
    if isinstance(img, np.ndarray):
        if img.ndim == 2:
            plt.imshow(img, cmap='gray')
        else:
            plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    else:
        plt.imshow(img)
    if title:
        plt.title(title)
    plt.axis('off')
    plt.show()


def load_image(path):
    return Image.open(path).convert('RGB')


def preprocess_image_cv(img_pil, show=False):
    img = np.array(img_pil.convert("RGB"))

    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)

    # Light denoising only
    gray = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)

    # Normalize contrast
    gray = cv2.normalize(gray, None, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX)

    processed = Image.fromarray(gray).convert("RGB")

    if show:
        plt.imshow(processed)
        plt.axis("off")

    return processed

print('preprocessing utils ready')

preprocessing utils ready


In [22]:
def segment_lines(img_pil):
    img = np.array(img_pil.convert("L"))

    # Invert for projection
    _, th = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Horizontal projection
    projection = np.sum(th, axis=1)

    lines = []
    start = None

    for i, val in enumerate(projection):
        if val > 0 and start is None:
            start = i
        elif val == 0 and start is not None:
            end = i
            if end - start > 15:  # minimum line height
                lines.append((start, end))
            start = None

    if start is not None:
        lines.append((start, len(projection)))

    line_images = []
    for (y1, y2) in lines:
        line = img[y1:y2, :]
        line_images.append(Image.fromarray(line).convert("RGB"))

    return line_images


In [23]:
def predict_image(img_pil, model, processor, device=DEVICE):
    if img_pil.mode != "RGB":
        img_pil = img_pil.convert("RGB")

    gen_kwargs = {
        "max_length": 512,
        "num_beams": 4,
        "early_stopping": True,
    }

    pixel_values = processor(images=img_pil, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)

    generated_ids = model.generate(pixel_values, **gen_kwargs)
    preds = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return preds[0]


In [24]:
# 5 — Postprocessing & normalization
import re

def normalize_text(text):
    text = text.replace('\r','')
    text = re.sub(r"\s+", ' ', text)
    return text.strip()

print('postprocessing ready')

postprocessing ready


In [25]:
from symspellpy import SymSpell, Verbosity

sym_spell = SymSpell(max_dictionary_edit_distance=2)
sym_spell.load_dictionary(
    "frequency_dictionary_en_82_765.txt",
    term_index=0,
    count_index=1
)

def correct_text(text):
    corrected = []
    for word in text.split():
        suggestions = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2)
        corrected.append(suggestions[0].term if suggestions else word)
    return " ".join(corrected)


2026-01-02 18:57:18,742: E symspellpy.symspellpy] Dictionary file not found at frequency_dictionary_en_82_765.txt.


In [26]:
# 6 — Quick demo (add images in ./data/samples/)
SAMPLES_DIR = "./data/samples"

if not os.path.exists(SAMPLES_DIR):
    print("No sample folder found. Create", SAMPLES_DIR, "and add some images to run the demo.")
else:
    img_paths = sorted(
        glob.glob(os.path.join(SAMPLES_DIR, "*.png")) +
        glob.glob(os.path.join(SAMPLES_DIR, "*.jpg")) +
        glob.glob(os.path.join(SAMPLES_DIR, "*.jpeg"))
    )

    if not img_paths:
        print("Sample folder is empty. Add some handwritten images.")

    for p in img_paths:
        pil = load_image(p)

        # preprocess page
        proc = preprocess_image_cv(pil, show=False)

        # segment page into lines
        lines = segment_lines(proc)

        print("\n" + "=" * 60)
        print(f"FILE: {os.path.basename(p)}")
        print("=" * 60)

        for line in lines:
            # skip very small regions
            if line.size[1] < 20:
                continue

            text = predict_image(line, model, processor)
            text = normalize_text(text)
            text = correct_text(text)

            if len(text) > 2:
                print(text)



FILE: test.png
Circle Advocate
' That Automatic involves state and transition among
skilled in response to inputs. Frick Automala is a
mathematical model of a system with discrete inputs
and outputs. The system can be in any one of finite
member of stores and the state summarises the
childreny of past impetus and determines the behavior
of the system for subsequent input.


In [27]:
from pdf2image import convert_from_path

def pdf_to_images(pdf_path, dpi=300):
    """
    Converts a PDF into a list of PIL Images
    """
    pages = convert_from_path(pdf_path, dpi=dpi)
    return pages


In [28]:
import numpy as np

def is_valid_text_line(line_img):
    """
    Heuristic to reject non-text lines like:
    - separators
    - underline-only
    - tables
    - rows of zeros
    """
    gray = np.array(line_img.convert("L"))

    # Reject very thin lines
    if gray.shape[0] < 25:
        return False

    # Compute ink density
    ink_pixels = np.sum(gray < 200)
    total_pixels = gray.size
    ink_ratio = ink_pixels / total_pixels

    # Too little ink → separators
    if ink_ratio < 0.01:
        return False

    # Too much ink → solid bars / tables
    if ink_ratio > 0.6:
        return False

    return True


In [29]:
def ocr_pdf(pdf_path, max_pages=5):
    pages = pdf_to_images(pdf_path)
    full_text = []

    for page_num, page in enumerate(pages[:max_pages], start=1):
        print(f"\n--- Processing Page {page_num} ---")

        proc = preprocess_image_cv(page)
        lines = segment_lines(proc)

        page_lines = []
        for line in lines:
            if not is_valid_text_line(line):
                continue

            text = predict_image(line, model, processor)
            text = normalize_text(text)
            text = correct_text(text)

            # reject garbage outputs
            if len(text) < 3:
                continue
            if sum(c.isdigit() for c in text) / len(text) > 0.4:
                continue

            print(text)

            

            if len(text) > 2:
                page_lines.append(text)

        full_text.append("\n".join(page_lines))

    return "\n\n".join(full_text)


In [None]:

pdf_path = "./data/samples/sample.pdf"
final_text = ocr_pdf(pdf_path, max_pages=5)

print(final_text)



--- Processing Page 1 ---
1 Mechanisms _____________________________________
#______
1 References
implied string is acceptable or not.
A finite automation has a mechanism to
it will
I read impact, which is a strong over a given
", alphabetical. This impul is actually written on an
if fundamental strategies are used as an increase in

--- Processing Page 2 ---
1 Mechanisms _____________________________________
#______
a b c
displaystyle _ 0
2 Legal files
a man and wait
References

--- Processing Page 3 ---
1 Mechanisms _____________________________________
#______
# Other qualifications of the IFPI accepts the
" Language! of strings that have a subslying


In [None]:
from docx import Document

def export_to_docx(text, output_path):
    doc = Document()
    for line in text.split("\n"):
        doc.add_paragraph(line)
    doc.save(output_path)

export_to_docx(final_text, "output_first_5_pages.docx")


In [None]:
import pandas as pd
from jiwer import wer, cer
from tqdm import tqdm

def evaluate_predictions(csv_path):
    df = pd.read_csv(csv_path)

    preds = []
    gts = []

    for _, row in tqdm(df.iterrows(), total=len(df)):
        img = load_image(row["image_path"])
        proc = preprocess_image_cv(img)

        # segment page into lines
        lines = segment_lines(proc)

        # match GT line index
        gt_text = normalize_text(row["transcription"])

        # find best matching predicted line
        line_preds = []
        for line in lines:
            text = predict_image(line, model, processor)
            text = normalize_text(text)
            if len(text) > 2:
                line_preds.append(text)

        if not line_preds:
            continue

        # choose longest predicted line (simple heuristic)
        pred_text = max(line_preds, key=len)

        preds.append(pred_text)
        gts.append(gt_text)

    return {
        "CER": cer(gts, preds),
        "WER": wer(gts, preds),
        "samples": len(gts)
    }

print("✅ evaluation helper ready")


✅ evaluation helper ready


In [None]:
# 8 — Export functions (DOCX / PDF)
from docx import Document
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas

def save_as_docx(text, out_path):
    doc = Document()
    for line in text.split('\n'):
        doc.add_paragraph(line)
    doc.save(out_path)

def save_as_pdf(text, out_path):
    c = canvas.Canvas(out_path, pagesize=letter)
    width, height = letter
    y = height - 72
    for line in text.split('\n'):
        c.drawString(72, y, line)
        y -= 14
        if y < 72:
            c.showPage()
            y = height - 72
    c.save()

print('export functions ready')

export functions ready
