Author: Nathan Wegmann

In [5]:
%pip install jupyterlab ipykernel pillow numpy lxml tqdm svgpathtools

Collecting svgpathtools
  Downloading svgpathtools-1.6.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting svgwrite (from svgpathtools)
  Downloading svgwrite-1.4.3-py3-none-any.whl.metadata (8.8 kB)
Collecting scipy (from svgpathtools)
  Using cached scipy-1.15.2-cp313-cp313-macosx_14_0_arm64.whl.metadata (61 kB)
Downloading svgpathtools-1.6.1-py2.py3-none-any.whl (67 kB)
Using cached scipy-1.15.2-cp313-cp313-macosx_14_0_arm64.whl (22.4 MB)
Downloading svgwrite-1.4.3-py3-none-any.whl (67 kB)
Installing collected packages: svgwrite, scipy, svgpathtools
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [svgpathtools][0m [svgpathtools]
[1A[2KSuccessfully installed scipy-1.15.2 svgpathtools-1.6.1 svgwrite-1.4.3
Note: you may need to restart the kernel to use updated packages.


# First Step: Create Distinct Images for Each Word

In [2]:
from pathlib import Path
from PIL import Image
import numpy as np
import lxml.etree as ET
import csv, re, os
from tqdm import tqdm

# --- dataset layout (edit these if your folders are arranged differently) ---

ROOT = Path("dataset")                          # root of the project
PAGES_DIR = ROOT.joinpath("images")             # page scans: 271.jpg ...
SVG_DIR = ROOT.joinpath("locations")            # per-page SVGs: 271.svg ...
INDEX_TSV = ROOT.joinpath("transcription.tsv")  # tab-separated locator / word index
DEST_DIR = ROOT.joinpath("words")               # word crops are written here

# Make sure the output folder exists before any crop is saved.
DEST_DIR.mkdir(exist_ok=True, parents=True)

In [3]:
# Load the transcription index: one tab-separated "<locator>\t<word>" row per
# word image.  Hyphens are stripped from the transcription so that the keyword
# can be used as a folder name later on.
#
# newline="" is what the csv module expects from a text file handle, and the
# explicit encoding keeps the read independent of the platform default.
index = []
with INDEX_TSV.open(newline="", encoding="utf-8") as f:
    for row in csv.reader(f, delimiter="\t"):
        if not row:
            # csv.reader yields an empty list for a blank line (e.g. a
            # trailing newline at the end of the file); skip it instead of
            # failing on the tuple unpack below.
            continue
        locator, word = row
        index.append({
            "id": locator,
            "keyword" : word.replace("-", "")
        })

print("Total entries:", len(index))
index[:3]

Total entries: 3726


[{'id': '270-01-01', 'keyword': 's_2s_7s_0s_pt'},
 {'id': '270-01-02', 'keyword': 'Letterss_cm'},
 {'id': '270-01-03', 'keyword': 'Orders'}]

In [6]:
from svgpathtools import parse_path
def bbox_from_svg(svg_path: Path, locator):
    """
    Return the bounding box of one word outline in SVG coordinate space.

    Parameters
    ----------
    svg_path : Path
        SVG file that contains the word outlines of one page.
    locator : str
        Value of the ``id`` attribute to look up, e.g. '270-01-03'
        (page-line-word).  It is matched exactly.

    Returns
    -------
    tuple
        ``(x, y, w, h)`` of the path's bounding box, in SVG units.

    Raises
    ------
    ValueError
        If no element carries that id, or if the element has no path
        data (``d`` attribute) to compute a box from.
    """
    target_id = locator
    # '*' matches any tag, so the lookup works whether or not the SVG
    # elements are namespaced.
    elem = ET.parse(str(svg_path)).find(f".//*[@id='{target_id}']")
    if elem is None:
        raise ValueError(f"id {target_id} not found in {svg_path.name}")

    # Fail with a clear message instead of passing None / "" to parse_path.
    d_attr = elem.get("d")
    if not d_attr:
        raise ValueError(
            f"element {target_id} in {svg_path.name} has no path data ('d')"
        )

    # compute bbox from the path's 'd' data; bbox() is ordered
    # (xmin, xmax, ymin, ymax), hence the re-ordering below
    path = parse_path(d_attr)
    xmin, xmax, ymin, ymax = path.bbox()
    return xmin, ymin, xmax - xmin, ymax - ymin

In [7]:
def svg_to_pixel_coords(svg_path: Path, jpg_path: Path, xywh):
    """
    Map a box from SVG coordinates to JPEG pixel coordinates.

    The horizontal and vertical scale factors are derived independently from
    the ``width`` / ``height`` attributes of the SVG root element and the
    pixel size of the page image.

    Parameters
    ----------
    svg_path : Path
        SVG file whose root element declares ``width`` and ``height``
        (a trailing "px" is accepted).
    jpg_path : Path
        Page image the box should be mapped onto.
    xywh : tuple
        ``(x, y, w, h)`` in SVG units.

    Returns
    -------
    tuple of int
        ``(x, y, w, h)`` in pixels, each value truncated towards zero.

    Raises
    ------
    ValueError
        If the SVG root has no ``width`` or ``height`` attribute, or the
        value is not a plain number (optionally followed by "px").
    """
    # Parse the SVG once and read both dimensions from the same root element.
    svg_root = ET.parse(str(svg_path)).getroot()
    dims = {}
    for attr in ("width", "height"):
        raw = svg_root.get(attr)
        if raw is None:
            raise ValueError(f"<svg> root of {svg_path} has no '{attr}' attribute")
        dims[attr] = float(raw.rstrip("px"))
    svg_width, svg_height = dims["width"], dims["height"]

    # Only the size is needed; the context manager releases the file handle
    # straight away instead of leaving it to the garbage collector.
    with Image.open(jpg_path) as img:
        img_w, img_h = img.size

    scale_x = img_w / svg_width
    scale_y = img_h / svg_height

    x, y, w, h = xywh
    return int(x*scale_x), int(y*scale_y), int(w*scale_x), int(h*scale_y)

# Crop every indexed word out of its page scan and store it as
# DEST_DIR/<keyword>/<locator>.png.
#
# A page holds many words, so the decoded grayscale scan is kept until a
# record from a different page shows up instead of being re-opened and
# re-converted for every single word.
cached_page = None      # page id of the scan currently held in `img`
img = None              # grayscale page scan

for rec in tqdm(index):
    locator   = rec["id"]
    word   = rec["keyword"]
    page = locator.split("-", 1)[0]     # '270-01-03' -> '270'

    svg_path = SVG_DIR / f"{page}.svg"
    jpg_path = PAGES_DIR / f"{page}.jpg"

    # 1. locate bbox in SVG units
    xywh_svg = bbox_from_svg(svg_path, locator)

    # 2. map to pixel coords
    x, y, w, h = svg_to_pixel_coords(svg_path, jpg_path, xywh_svg)

    # 3. crop (decode the page only when it changes; convert() returns an
    #    independent in-memory image, so the file can be closed right away)
    if page != cached_page:
        with Image.open(jpg_path) as page_file:
            img = page_file.convert("L")        # grayscale
        cached_page = page
    crop = img.crop((x, y, x+w, y+h))

    # 4. save
    dest_dir = DEST_DIR / word
    dest_dir.mkdir(exist_ok=True)
    out_path = dest_dir / f"{locator}.png"
    crop.save(out_path)

100%|██████████| 3726/3726 [00:49<00:00, 75.30it/s]


# Pre‑process each crop into a 1‑D feature sequence

In [None]:
%pip install opencv-python scikit-image numpy



In [11]:
import cv2, numpy as np
from skimage.feature import hog

def word_image_to_seq(png_path, strip_w=3, h_out=64):
    """
    Turn a word image into a sequence of per-strip feature vectors.

    The image is binarised (Otsu, inverted so ink is non-zero), rescaled to a
    height of ``h_out`` pixels, and scanned left to right with a window that
    is ``strip_w`` pixels wide and advances one pixel per step.

    Each window yields one row made of:

    * 8 HOG values (``orientations=8``, one cell covering the whole strip),
    * upper contour  - row index of the top-most ink pixel (row 0 = top),
    * lower contour  - row index of the bottom-most ink pixel,
    * ink density    - fraction of ink pixels in the strip, 0.0 - 1.0.

    A strip without any ink reports upper = 0 and lower = ``h_out - 1``
    (``argmax`` of an all-False vector is 0).

    Parameters
    ----------
    png_path : str or path-like
        Image file to read.
    strip_w : int
        Width of the sliding window in pixels.
    h_out : int
        Height the binarised image is rescaled to.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(T, 11)`` with ``T = rescaled_width - strip_w + 1``.

    Raises
    ------
    OSError
        If the image cannot be read.
    ValueError
        If the rescaled image is narrower than ``strip_w``.
    """
    # str() so that pathlib.Path objects are accepted as well; imread signals
    # failure by returning None rather than raising.
    img = cv2.imread(str(png_path), cv2.IMREAD_GRAYSCALE)
    if img is None:
        raise OSError(f"could not read image: {png_path}")

    # 1. binarise (Otsu threshold, inverted: ink -> 255, paper -> 0)
    _, bw = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY_INV+cv2.THRESH_OTSU)
    # TODO: add deskew if lines are slanted

    # 2. resize height (nearest neighbour keeps the image strictly 0 / 255)
    h, w = bw.shape
    scale = h_out / h
    new_w = int(w*scale)
    if new_w < strip_w:
        raise ValueError(
            f"{png_path}: rescaled width {new_w} px is narrower than "
            f"strip_w={strip_w}"
        )
    bw = cv2.resize(bw, (new_w, h_out), interpolation=cv2.INTER_NEAREST)
    ink = bw > 0                 # boolean ink mask, True = ink

    # 3. sliding window, one pixel per step
    seq = []
    for x in range(0, new_w-strip_w+1):
        strip = bw[:, x:x+strip_w]
        hfeat = hog(strip, orientations=8, pixels_per_cell=(h_out, strip_w),
                    cells_per_block=(1,1), feature_vector=True)

        ink_rows = ink[:, x:x+strip_w].any(axis=1)       # per row: any ink?
        upper = np.argmax(ink_rows)                      # first ink row from the top
        lower = h_out - 1 - np.argmax(ink_rows[::-1])    # last ink row from the top
        density = ink[:, x:x+strip_w].mean()             # fraction of ink pixels

        seq.append(np.r_[hfeat, upper, lower, density])
    return np.vstack(seq)        # shape (T, n_feat)

### Hand‑written‑word strip feature vector (length = 11)

| Index (s) | Dim | Name / Formula | Value Range | Intuition |
|-----------|-----|----------------|-------------|-----------|
| 0 – 7     | 8   | **HOG bins** – normalized histogram of unsigned gradient orientations (8 bins of 22.5° covering 0° – 180°) inside the 3 px‑wide strip. | 0 – 1 (L2‑Hys norm) | Encodes local stroke direction & curvature. |
| 8         | 1   | **Upper contour** – `row index of highest ink pixel` | 0 – 63 (px) | Row 0 is the top of the image, so tall letters like “l”, “k” pull this **up** (smaller number). |
| 9         | 1   | **Lower contour** – `row index of lowest ink pixel` | 0 – 63 (px) | Descenders (“g”, “y”) push this **down** (larger number). |
| 10        | 1   | **Ink density** – `mean(strip)` after binarisation (`1 = ink`, `0 = paper`). | 0.0 – 1.0 | Distinguishes thick blobs (dots, loops) from airy strokes. |

*Image height is rescaled to 64 px before these metrics are computed.*

---

**Shape per word**

* The 3‑px‑wide window slides 1 px at a time; each position emits one 11‑D vector.  
* A word `W` pixels wide (after rescaling to 64 px height) → sequence length **T = W − 2**.  
* Example sizes: <br>“Virginia” ≈ 138×11, “and” ≈ 68×11.
