# First Step: Create Distinct Images for Each Word

In [4]:
from pathlib import Path
from PIL import Image
import numpy as np
import lxml.etree as ET
import csv, re, os
from tqdm import tqdm

# --- project layout: edit these if your data lives somewhere else ---

ROOT = Path("data/KWS")
# Page scans (271.jpg ...) and the matching word outlines (271.svg ...).
PAGES_DIR = ROOT.joinpath("images")
SVG_DIR = ROOT.joinpath("locations")
# Table that pairs each word locator with its transcription.
INDEX_TSV = ROOT.joinpath("transcription.tsv")
# Output folder for the word crops; created up front if it is missing.
DEST_DIR = ROOT.joinpath("words")
DEST_DIR.mkdir(parents=True, exist_ok=True)

In [5]:
# Load the transcription table into a list of {"id", "keyword"} records.
# newline="" is what the csv module asks for on files it reads, and the
# explicit encoding keeps the result independent of the platform default.
index = []
with INDEX_TSV.open(newline="", encoding="utf-8") as f:
    for row in csv.reader(f, delimiter="\t"):
        if not row:
            continue  # blank line (e.g. at end of file): nothing to index
        locator, word = row  # still fails loudly on malformed rows
        index.append({
            "id": locator,
            # hyphens are dropped from the transcription to form the keyword
            "keyword": word.replace("-", ""),
        })

print("Total entries:", len(index))
index[:3]

Total entries: 3726


[{'id': '270-01-01', 'keyword': 's_2s_7s_0s_pt'},
 {'id': '270-01-02', 'keyword': 'Letterss_cm'},
 {'id': '270-01-03', 'keyword': 'Orders'}]

In [6]:
from svgpathtools import parse_path
def bbox_from_svg(svg_path: Path, locator):
    """
    Look up one word outline in a page SVG and return its bounding box.

    The element is selected by its ``id`` attribute, which must equal
    *locator* exactly (format '{page:03d}-{line:02d}-{word:02d}', e.g.
    '270-01-03'). The box is returned as (x, y, w, h) in SVG coordinate
    space. Raises ValueError when no element carries that id.
    """
    tree = ET.parse(str(svg_path))
    node = tree.find(f".//*[@id='{locator}']")
    if node is None:
        raise ValueError(f"id {locator} not found in {svg_path.name}")

    # the outline lives in the element's 'd' attribute; svgpathtools
    # reports its box as (xmin, xmax, ymin, ymax)
    left, right, top, bottom = parse_path(node.get("d")).bbox()
    width = right - left
    height = bottom - top
    return left, top, width, height

In [7]:
def svg_to_pixel_coords(svg_path: Path, jpg_path: Path, xywh):
    """Map an (x, y, w, h) box from SVG units to JPEG pixel coordinates.

    Each axis is scaled independently by ``image size / SVG size``, where
    the SVG size is read from the root element's ``width`` and ``height``
    attributes (trailing 'p'/'x' characters such as a "px" suffix are
    stripped before conversion to float).

    Returns a tuple of four ints (x, y, w, h); values are truncated
    towards zero by ``int()``.
    """
    # Parse the SVG once and read both dimensions from the same root.
    svg_root = ET.parse(str(svg_path)).getroot()
    svg_width = float(svg_root.get("width").rstrip("px"))
    svg_height = float(svg_root.get("height").rstrip("px"))

    # Only the image size is needed here; the context manager makes sure
    # the underlying file handle is released right away.
    with Image.open(jpg_path) as img:
        img_w, img_h = img.size

    scale_x = img_w / svg_width
    scale_y = img_h / svg_height

    x, y, w, h = xywh
    return int(x * scale_x), int(y * scale_y), int(w * scale_x), int(h * scale_y)

# Crop every indexed word out of its page scan and save it as
# DEST_DIR/<keyword>/<locator>.png.
loaded_page = None   # page id whose grayscale scan is currently held in `img`
img = None
for rec in tqdm(index):
    locator = rec["id"]
    word = rec["keyword"]
    page = locator.split("-", 1)[0]     # text before the first '-' is the page id

    svg_path = SVG_DIR / f"{page}.svg"
    jpg_path = PAGES_DIR / f"{page}.jpg"

    # 1. locate bbox in SVG units
    xywh_svg = bbox_from_svg(svg_path, locator)

    # 2. map to pixel coords
    x, y, w, h = svg_to_pixel_coords(svg_path, jpg_path, xywh_svg)

    # 3. crop -- the page scan is decoded only when the page id changes,
    #    instead of once per word; convert() returns an in-memory copy,
    #    so the file handle can be closed immediately.
    if page != loaded_page:
        with Image.open(jpg_path) as page_file:
            img = page_file.convert("L")        # grayscale
        loaded_page = page
    crop = img.crop((x, y, x + w, y + h))

    # 4. save, one sub-folder per keyword
    dest_dir = DEST_DIR / word
    dest_dir.mkdir(exist_ok=True)
    out_path = dest_dir / f"{locator}.png"
    crop.save(out_path)

100%|██████████| 3726/3726 [00:50<00:00, 74.19it/s]


# Pre-process each crop into a sequence of feature vectors (one per sliding-window position)

In [8]:
import cv2, numpy as np
from skimage.feature import hog

def word_image_to_seq(png_path, strip_w=3, h_out=64):
    """Convert one word crop into a sequence of sliding-window features.

    The image is read as grayscale, binarised with an inverted Otsu
    threshold (pixels at or below the threshold become 255, the rest 0 --
    presumably dark ink on a light page, so ink ends up non-zero), and
    rescaled to a height of ``h_out`` rows. A window ``strip_w`` columns
    wide then slides over it from left to right with a stride of one.

    Each window contributes one row made of:
      * the HOG descriptor of the window (8 orientations, the whole
        window as a single cell),
      * ``h_out`` minus the offset, counted from the bottom row, of the
        first row containing a non-zero pixel,
      * the index, counted from the top row, of the first row containing
        a non-zero pixel,
      * the mean pixel value of the window.
    NOTE: a window with no non-zero pixel yields ``h_out`` and ``0`` for
    the two profile features, the same as a window whose first and last
    rows both contain non-zero pixels.

    Parameters:
        png_path: path of the word image (str or path-like).
        strip_w: window width in columns.
        h_out: height the image is rescaled to before scanning.

    Returns:
        Array of shape (T, n_feat), T being the number of window positions.

    Raises:
        OSError: if the image cannot be read.
        ValueError: if the rescaled image is narrower than ``strip_w``.
    """
    # str() so that pathlib.Path works with every OpenCV version;
    # imread signals failure by returning None instead of raising.
    img = cv2.imread(str(png_path), cv2.IMREAD_GRAYSCALE)
    if img is None:
        raise OSError(f"cannot read image file: {png_path}")

    # 1. binarise (inverted OTSU)
    _, bw = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    # TODO: add deskew if lines are slanted

    # 2. resize height (width scaled by the same factor, at least 1 px so
    #    that cv2.resize never receives an empty target size)
    h, w = bw.shape
    scale = h_out / h
    new_w = max(1, int(w * scale))
    bw = cv2.resize(bw, (new_w, h_out), interpolation=cv2.INTER_NEAREST)

    # 3. sliding window
    n_windows = bw.shape[1] - strip_w + 1
    if n_windows < 1:
        # np.vstack([]) would fail with an unhelpful message otherwise
        raise ValueError(
            f"{png_path}: rescaled width {bw.shape[1]} is smaller than "
            f"strip_w={strip_w}"
        )

    seq = []
    for x in range(n_windows):
        strip = bw[:, x:x + strip_w]
        hfeat = hog(strip, orientations=8, pixels_per_cell=(h_out, strip_w),
                    cells_per_block=(1, 1), feature_vector=True)
        row_has_ink = strip.any(axis=1)        # computed once, used twice
        upper = h_out - np.argmax(row_has_ink[::-1])
        lower = np.argmax(row_has_ink)
        density = strip.mean()
        seq.append(np.r_[hfeat, upper, lower, density])
    return np.vstack(seq)        # shape (T, n_feat)

In [9]:
import pickle, glob, tqdm, collections

# Build the template library: keyword -> list of feature sequences, one
# per word crop found under DEST_DIR/<keyword>/, then pickle it.
LIB_PATH = ROOT / "templates.pkl"      # kept next to the other project paths
library  = collections.defaultdict(list)

# sorted() makes the template order (and thus the pickle) reproducible;
# directory iteration order is otherwise filesystem dependent.
for kw_dir in sorted(DEST_DIR.iterdir()):
    if not kw_dir.is_dir():
        continue                       # ignore stray files next to the keyword folders
    for png in sorted(kw_dir.glob("*.png")):
        # str(): the loader hands the path to OpenCV, which is not
        # guaranteed to accept pathlib.Path objects
        seq = word_image_to_seq(str(png))
        library[kw_dir.name].append(seq)

with open(LIB_PATH, "wb") as f:
    pickle.dump(library, f)

print({k: len(v) for k, v in library.items()})   # how many templates per keyword

{'Firelock': 1, 'weight': 1, 'Willis_mi': 1, 'your': 32, 'Cumberland': 4, 'attendance': 1, 'goes': 1, 'alreadys_cm': 1, 'rer': 1, 'Commis_ssary': 4, 'Honor': 4, 'Kets_mi': 1, 'inform': 1, 'hear': 2, 'eight': 7, 'Majors_cm': 1, 'Forts_cm': 1, 'paper': 2, 'arms_cm': 1, 'disobedience': 1, 'man': 2, 'ence': 1, 'Dispositionss_cm': 1, 'Fergusons_cm': 1, 'returned': 1, 'Busines_sss_cm': 1, 'Regiments_qo': 2, 'Hoggs_cm': 1, 'Winters_pt': 1, 'safe': 1, 'opportunity': 3, 'bys_cm': 1, 'ors_mi': 2, 'pains': 1, 'silvers_qo': 1, 'Honourable': 1, 'Williamsburghs_cm': 1, 'thems_cm': 3, 'given': 6, 'Bronaughs_cm': 1, 'keep': 1, 'Plantation': 1, 'Order': 5, 'Shirleys_cm': 1, 'deserteds_cm': 1, 's_GW': 10, 'Livings_mi': 1, 'month': 1, 'apply': 2, 'cans_pt': 1, 'berlands_cm': 1, 'most': 3, 'Instructions': 1, 'fore': 1, 'Peter': 1, 'hundred': 3, 'cases_cm': 2, 'confidents_cm': 1, 'Cumberlands_pt': 2, 'fastening': 1, 'Laws_cm': 1, 'gonss_cm': 1, 'hastened': 1, 'Recruit': 2, 'peremptorilys_cm': 1, 'requires_