# Lokale OCR-Pipeline: Tesseract + Calamari (Fraktur)

Dieses Notebook führt OCR auf `testbild.png` aus – einmal mit Tesseract, einmal mit Calamari.

In [None]:
from pathlib import Path
import subprocess
import shutil
from PIL import Image
import pytesseract
from pytesseract import Output

# Works both from the repo root and from ./notebooks: the first candidate
# directory that contains the test image becomes the project root; if none
# does, fall back to the current working directory.
CANDIDATE_ROOTS = [Path.cwd().resolve(), Path.cwd().resolve().parent]
PROJECT_ROOT = next((p for p in CANDIDATE_ROOTS if (p / 'testbild.png').exists()), CANDIDATE_ROOTS[0])
TEST_IMAGE = PROJECT_ROOT / 'testbild.png'

# Prefer the calamari-predict binary from the project-local .venv311; otherwise
# rely on whatever 'calamari-predict' resolves to on PATH at call time.
VENV311_BIN = PROJECT_ROOT / '.venv311' / 'bin'
CALAMARI_PREDICT_BIN = VENV311_BIN / 'calamari-predict'
CALAMARI_PREDICT_EXE = str(CALAMARI_PREDICT_BIN) if CALAMARI_PREDICT_BIN.exists() else 'calamari-predict'
OUT_DIR = PROJECT_ROOT / 'outputs'

# Validate the input BEFORE touching the filesystem: when the image is missing,
# PROJECT_ROOT is only the cwd fallback, and creating 'outputs' there would
# leave a stray directory behind. An explicit raise (instead of assert) also
# keeps the check active under `python -O`.
if not TEST_IMAGE.exists():
    raise FileNotFoundError(f'Missing test image: {TEST_IMAGE}')

OUT_DIR.mkdir(exist_ok=True)

# Report the resolved environment so path problems are visible right away.
print('Using image:', TEST_IMAGE)
print('Outputs:', OUT_DIR)
print('tesseract:', shutil.which('tesseract'))
print('calamari-predict (resolved):', CALAMARI_PREDICT_EXE)
print('calamari-predict (PATH):', shutil.which('calamari-predict'))


## Bild laden und Vorschau (erforderlich: definiert `img` für die folgenden Zellen)

In [None]:
# Open the test image with Pillow. `img` is reused by the later Tesseract OCR
# and line-segmentation cells, so this cell must run before them.
img = Image.open(TEST_IMAGE)
# Bare trailing expression: the notebook renders the image as the cell output.
img


## 1) Tesseract-OCR

In [None]:
# Languages handed to Tesseract, e.g. 'deu', 'frk', or combined: 'deu+frk'.
TESS_LANG = 'deu+frk'
# Page segmentation mode; 6 = assume a uniform block of text.
TESS_PSM = 6

# The same config string is reused by the line-segmentation cell.
tess_config = f'--psm {TESS_PSM}'
tess_text = pytesseract.image_to_string(
    img,
    config=tess_config,
    lang=TESS_LANG,
)

# Persist the full recognition result as UTF-8 next to the other outputs.
tess_out = OUT_DIR / 'tesseract_testbild.txt'
with tess_out.open('w', encoding='utf-8') as tess_file:
    tess_file.write(tess_text)

# Only the first 2000 characters are echoed to keep the cell output short.
print(tess_text[:2000])
print('Saved:', tess_out)


## 2) Zeilensegmentierung (für Calamari)

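Calamari ist ein zeilenbasiertes OCR-System: Das Fraktur-Modell erwartet **Zeilenbilder** als Eingabe. Tesseract dient in diesem Schritt nur der Layout-/Zeilensegmentierung; der von Tesseract erkannte Text wird hier nicht verwendet.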

In [None]:
# Directory that receives one cropped PNG per detected text line.
lines_dir = OUT_DIR / 'lines'
lines_dir.mkdir(exist_ok=True)

# Layout analysis only: the column-oriented dict holds one entry per layout
# element (parallel lists keyed by 'level', 'left', 'top', ...).
data = pytesseract.image_to_data(img, lang=TESS_LANG, config=tess_config, output_type=Output.DICT)

# Keep only rows with level == 4 ('line' in the Tesseract TSV hierarchy) that
# have a non-degenerate bounding box. Each record is
# ((block_num, par_num, line_num), x, y, w, h).
line_records = [
    ((block_no, par_no, line_no), left, top, width, height)
    for level, left, top, width, height, block_no, par_no, line_no in zip(
        data['level'],
        data['left'],
        data['top'],
        data['width'],
        data['height'],
        data['block_num'],
        data['par_num'],
        data['line_num'],
    )
    if level == 4 and width > 0 and height > 0
]

# Order the lines by their (block, paragraph, line) identifier.
line_records.sort(key=lambda record: record[0])

# Crop every line box out of the page image and save it under a running index.
line_image_paths = []
for position, (_, left, top, width, height) in enumerate(line_records):
    target = lines_dir / f'line_{position:03d}.png'
    img.crop((left, top, left + width, top + height)).save(target)
    line_image_paths.append(target)

print(f'Extracted {len(line_image_paths)} line images to {lines_dir}')
# Trailing expression: show the first few paths as the cell output.
line_image_paths[:5]


## 3) Calamari-OCR (Frakturmodell, zeilenweise)

Calamari wird hier über die CLI aufgerufen und auf die segmentierten Zeilenbilder angewendet.

In [None]:
# Adjust the model path(s) to your local Fraktur checkpoints.
# Default: the cloned ensemble model fraktur_19th_century (checkpoints 0-4).
MODEL_DIR = (PROJECT_ROOT / 'models/calamari_models/fraktur_19th_century').resolve()
CALAMARI_MODEL_PATHS = [
    MODEL_DIR.joinpath(checkpoint_name)
    for checkpoint_name in (
        '0.ckpt.json',
        '1.ckpt.json',
        '2.ckpt.json',
        '3.ckpt.json',
        '4.ckpt.json',
    )
]

# Report which checkpoint files are actually present on disk.
for checkpoint_path in CALAMARI_MODEL_PATHS:
    print(checkpoint_path, 'exists:', checkpoint_path.exists())


In [None]:
# Run calamari-predict on the segmented line images; the prediction files it
# writes into calamari_out_dir are read back in the next cell.
calamari_out_dir = OUT_DIR / 'calamari'
calamari_out_dir.mkdir(exist_ok=True)

assert line_image_paths, 'No line images found; run the segmentation cell first.'

# Argument list (no shell): all checkpoints are passed after --checkpoint,
# all line images after --data.images.
cmd = [
    CALAMARI_PREDICT_EXE,
    '--checkpoint', *[str(p) for p in CALAMARI_MODEL_PATHS],
    '--data.images', *[str(p) for p in line_image_paths],
    '--output_dir', str(calamari_out_dir),
]

# Only the first few arguments are echoed; the full image list would be noisy.
print('Running:', ' '.join(cmd[:8]), '...')
result = subprocess.run(cmd, capture_output=True, text=True)
print('Return code:', result.returncode)
print(result.stdout[:2000])
if result.returncode != 0:
    # Show the END of stderr: a Python traceback puts the decisive exception
    # line last, so the head of a long stderr stream can hide the real cause.
    print(result.stderr[-2000:])
    raise RuntimeError('calamari-predict failed; see stderr above.')


In [None]:
# Calamari writes one <stem>.pred.txt per line image; collect them in line order.
pred_texts = []
missing = []
for line_image in line_image_paths:
    candidate = calamari_out_dir / f'{line_image.stem}.pred.txt'
    if not candidate.exists():
        # Remember absent predictions so they can be reported below.
        missing.append(candidate)
        continue
    pred_texts.append(candidate.read_text(encoding='utf-8').strip())

if missing:
    print(f'Missing {len(missing)} prediction files (showing up to 5):')
    for absent_path in missing[:5]:
        print(' -', absent_path)

# Join the non-empty predictions, one recognised line per output line.
calamari_text = '\n'.join(filter(None, pred_texts))
calamari_txt_out = OUT_DIR / 'calamari_testbild.txt'
with calamari_txt_out.open('w', encoding='utf-8') as calamari_file:
    calamari_file.write(calamari_text)

# Only the first 2000 characters are echoed to keep the cell output short.
print(calamari_text[:2000])
print('Saved:', calamari_txt_out)


## 4) Mini-Vergleich

In [None]:
def preview(label, text, n=400):
    """Print a labelled excerpt of *text* followed by a blank line.

    Args:
        label: Name shown in the header line.
        text: Text to excerpt; only ``text[:n]`` is printed.
        n: Maximum number of leading characters to show.
    """
    header = f'--- {label} (first {n} chars) ---'
    print(header)
    excerpt = text[:n]
    # end='\n\n' terminates the excerpt and adds the separating blank line.
    print(excerpt, end='\n\n')

# Show the beginning of both OCR results one after the other for a quick comparison.
for engine_label, engine_text in (('Tesseract', tess_text), ('Calamari', calamari_text)):
    preview(engine_label, engine_text)
