# 01. Environment & Repo Setup
# 02. Data Loader, Alignment Checks, Metrics

Run each cell in order. If a step fails, re-run the cell after fixing the issue.

In [28]:
# Attach Google Drive; the project folders used below live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Environment report: interpreter build, preinstalled PyTorch version, GPU visibility.
import platform
import sys

import torch

# One (label, value) pair per reported line; printed in this order.
for label, value in (
    ("Python:", sys.version),
    ("PyTorch version (pre):", torch.__version__),
    ("CUDA available:", torch.cuda.is_available()),
):
    print(label, value)

Python: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]
PyTorch version (pre): 2.9.0+cu128
CUDA available: False


In [3]:
# Install core packages (HF + metrics). Re-run if Colab restarts.
!pip -q install -U pip
!pip -q install -U transformers accelerate datasets evaluate jiwer Pillow regex editdistance sentencepiece timm

In [4]:
# Upgrade torchvision to a compatible version
!pip install -U torchvision



In [5]:
# (Optional) Colab normally ships with PyTorch preinstalled. If it is missing,
# uncomment the line below to install a stable build:
# !pip -q install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
import torch

# Report the version after the install/upgrade cells above have run.
print(f"PyTorch version (post): {torch.__version__}")

PyTorch version (post): 2.9.0+cu128


In [6]:
# Smoke test: every package installed above must be importable in this kernel.
import transformers
import datasets
import jiwer
import PIL
import regex
import editdistance
import sentencepiece
import timm
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

print(f"Transformers: {transformers.__version__}")
print("Ready.")

Transformers: 4.57.1
Ready.


In [7]:
# Lay out the project tree on Drive. makedirs(exist_ok=True) makes the cell safe to re-run.
import os

base = "/content/drive/MyDrive/GothiRead"
subdirs = [
    "src/data",
    "src/eval",
    "src/models",
    "scripts",
    "data/train",
    "data/val",
    "data/test_public",
    "exp",
]

# Each entry is a path relative to `base`; intermediate directories are created as needed.
for rel_path in subdirs:
    target_dir = os.path.join(base, rel_path)
    os.makedirs(target_dir, exist_ok=True)

print("Project folders ready at", base)

Project folders ready at /content/drive/MyDrive/GothiRead


In [8]:
# Copy scaffold from the uploaded zip (if you uploaded it) or from Drive.
# If you downloaded the provided zip, upload it to Colab and run:
# from google.colab import files
# files.upload()  # then select 'icdar24-multifont.zip'
# import zipfile, os
# with zipfile.ZipFile('icdar24-multifont.zip', 'r') as z:
#     z.extractall('/content')

In [9]:
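# (Optional) Mount Google Drive to persist data/experiments.
# Drive is already mounted by the first cell of this notebook, so this is kept for reference only.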
# from google.colab import drive
# drive.mount('/content/drive')

## Optional: Install PaddleOCR (CPU) — safer on Day 1
GPU wheels can be tricky; switch to GPU later.

In [10]:
# CPU PaddleOCR install (safe default)
!pip -q install paddlepaddle==3.0.0 paddleocr
import paddle, paddleocr
print("Paddle:", paddle.__version__)

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gradio 5.49.1 requires pillow<12.0,>=8.0, but you have pillow 12.0.0 which is incompatible.
fastai 2.8.4 requires torch<2.9,>=1.10, but you have torch 2.9.0 which is incompatible.[0m[31m
[0m



Paddle: 3.0.0


## Quick zero-shot sanity check for TrOCR

In [11]:
# # Download a tiny demo image or place your line image at /content/line.jpg
# # For now, we generate a white placeholder image with the text "Demo Line" drawn on it.
# from PIL import Image, ImageDraw, ImageFont
# img = Image.new('RGB', (640, 96), color='white')
# draw = ImageDraw.Draw(img)
# draw.text((10, 30), "Demo Line", fill='black')
# img.save('/content/line.jpg')
# print("Saved /content/line.jpg")

In [12]:
# # Zero-shot TrOCR test (printed)
# from PIL import Image
# from transformers import TrOCRProcessor, VisionEncoderDecoderModel
# proc = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
# model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed").eval()
# img = Image.open("/content/line.jpg").convert("RGB")
# inputs = proc(images=img, return_tensors="pt")
# out_ids = model.generate(**inputs, max_length=64)
# text = proc.batch_decode(out_ids, skip_special_tokens=True)[0]
# print("Prediction:", text)

## Data Loader, Alignment Checks, and Metrics

In [13]:
# Standard library
import glob
import json
import os
import sys
from pathlib import Path

# Third-party
from PIL import Image
import regex as re

# Project root on Drive; adjust if the repo lives somewhere else.
BASE = "/content/drive/MyDrive/GothiRead"
print("Base:", BASE)

# The project imports below resolve the `src` package from BASE, so BASE must be
# on sys.path first. It is added at most once, which keeps the cell re-runnable.
if sys.path.count(BASE) == 0:
    sys.path.insert(0, BASE)

# Project-local helpers (import order kept as in the original cell).
from src.data.icdar24 import LineDataset, split_into_chars
from src.eval.metrics import compute_ocr_metrics, compute_font_cer
from src.data.build_vocab import build_char_vocab, save_vocab

Base: /content/drive/MyDrive/GothiRead


In [14]:
# # === Download dataset ===
# # !wget -O /content/drive/MyDrive/GothiRead/data/icdar2024-comp-ocr-font.zip "https://faubox.rrze.uni-erlangen.de/dl/fiSDupUxNJWYgBkHtwDjZx/icdar2024-comp-ocr-font.zip"

# !unzip -q /content/drive/MyDrive/GothiRead/data/icdar2024-comp-ocr-font.zip -d /content/drive/MyDrive/GothiRead/data/full
# print("Extracted dataset to /content/drive/MyDrive/GothiRead/data/full ")
# !ls /content/drive/MyDrive/GothiRead/data/full | head


In [15]:
!unzip "/content/drive/MyDrive/GothiRead/data/Dataset.zip" -d "/content/dataset"


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 extracting: /content/dataset/valid/single/rotunda/18989.txt  
  inflating: /content/dataset/valid/single/rotunda/138351.font  
  inflating: /content/dataset/valid/single/rotunda/138497.jpg  
  inflating: /content/dataset/valid/single/rotunda/19024.jpg  
 extracting: /content/dataset/valid/single/rotunda/138313.txt  
 extracting: /content/dataset/valid/single/rotunda/138121.txt  
 extracting: /content/dataset/valid/single/rotunda/138014.txt  
  inflating: /content/dataset/valid/single/rotunda/138413.jpg  
 extracting: /content/dataset/valid/single/rotunda/194127.txt  
 extracting: /content/dataset/valid/single/rotunda/138338.txt  
  inflating: /content/dataset/valid/single/rotunda/194110.jpg  
  inflating: /content/dataset/valid/single/rotunda/18983.txt  
  inflating: /content/dataset/valid/single/rotunda/138486.font  
  inflating: /content/dataset/valid/single/rotunda/05410.txt  
  inflating: /content/dataset/valid/singl

In [16]:
# from src.data.icdar24 import FlexibleLineDataset

# BASE = "/content"  # or wherever you extracted
# DATASET_ROOT = BASE  # since your zip has /dataset/... under the repo root

# # Train/valid in nested FAUBox layout:
# ds_train = FlexibleLineDataset(
#     root=DATASET_ROOT,
#     split="train",            # "train" or "valid"
#     base_dirname="dataset",   # force nested mode
#     load_fonts=True,
#     include_single=True,
#     include_multiple=True,    # set False if you want only single-font lines
#     include_unknown_classes=False,  # skip greek/hebrew etc. in 'valid'
# )

# ds_val = FlexibleLineDataset(
#     root=DATASET_ROOT,
#     split="valid",
#     base_dirname="dataset",
#     load_fonts=True,
#     include_single=True,
#     include_multiple=True,
#     include_unknown_classes=False,
# )

# print("Train items:", len(ds_train), "Valid items:", len(ds_val))


In [17]:
# # Inspect a few samples and ensure alignment
# for ds, name in [(ds_train, "train"), (ds_val, "valid")]:
#     ok = bad = 0
#     for i in range(min(200, len(ds))):
#         _, text, font_seq, fn = ds[i]
#         if font_seq is None:
#             continue
#         if len(split_into_chars(text)) == len(font_seq):
#             ok += 1
#         else:
#             bad += 1
#     print(f"[{name}] aligned={ok} mismatched={bad}")


In [18]:
# # Build character-level vocab from train texts
# from pathlib import Path
# text_files = sorted([str(p) for p in Path(DATASET_ROOT, 'dataset', 'train').rglob('*.txt')])
# print('Train text files:', len(text_files))
# vocab = build_char_vocab(text_files, min_freq=1)
# save_vocab(vocab, str(Path(BASE)/'exp'/'vocab_day2'))
# print('Vocab size:', len(vocab['vocab']))

In [19]:
# # Metrics sanity check
# refs=['Demo Line 123','hello world']
# preds=['Demo Line 12','hello wurld']
# print('OCR:', compute_ocr_metrics(preds, refs))
# ref_font=['afafafafafafaf']
# pred_font=['afafafffafafaf']
# print('Font CER:', compute_font_cer(pred_font, ref_font))

In [20]:
# for name, ds in [('TRAIN', ds_train), ('VALID', ds_val)]:
#     print(name)
#     for i in range(min(5, len(ds))):
#         _, t, f, fn = ds[i]
#         print(fn, '| text_len=', len(split_into_chars(t)), '| font_len=', None if f is None else len(f))
#     print('-'*60)

In [21]:
# Carve a held-out test split out of the training data.
# Per the recorded output below, the script ran in "move" mode with seed 42 and
# split_ratio 0.1, moving 17923 of 179223 considered triplets from
# /content/dataset/train to /content/dataset/test.
# NOTE(review): --move changes the extracted dataset in place. Re-running this cell on
# the same runtime presumably moves a further 10% out of train — confirm against
# make_test_split.py before re-running.
!python /content/drive/MyDrive/GothiRead/scripts/make_test_split.py \
  --root /content \
  --split_ratio 0.10 \
  --include_single True \
  --include_multiple True \
  --move


{
  "root": "/content",
  "train_root": "/content/dataset/train",
  "test_root": "/content/dataset/test",
  "include_single": true,
  "include_multiple": true,
  "split_ratio": 0.1,
  "seed": 42,
  "mode": "move",
  "dry_run": false,
  "leaf_dirs_processed": 17923,
  "triplets_considered": 179223,
  "triplets_selected": 17923
}

Done. Moved 17923 triplet(s) to test.
Test set available at: /content/dataset/test


In [29]:
# Build one CSV manifest per split (train / valid / test) from /content/dataset.
# --out-dir is a relative path, so the CSVs are written to ./manifests under the
# notebook's current working directory (presumably /content on Colab, i.e. not
# persisted to Drive — TODO confirm).
# With --fail-if-missing, the recorded run wrote all three manifests and then reported
# "[ERROR] Missing files found across splits: 1936"; the integrity check in the next
# cell breaks that number down per split.
!python /content/drive/MyDrive/GothiRead/scripts/build_manifest.py \
  --data-root /content/dataset \
  --splits train valid test \
  --out-dir manifests \
  --fail-if-missing


[OK] Wrote manifests/train.csv (163023 rows)
[OK] Wrote manifests/valid.csv (4040 rows)
[OK] Wrote manifests/test.csv (17923 rows)
[ERROR] Missing files found across splits: 1936


In [30]:
# Print a per-split integrity report for the manifests written by build_manifest.py.
# In the recorded run every missing file was a .txt transcription (1723 in train,
# 213 in valid), train additionally had 3 length mismatches, and test was fully clean.
# The manifest paths are relative, so this must run from the same working directory
# as the build_manifest cell.
!python /content/drive/MyDrive/GothiRead/scripts/check_integrity.py \
  --manifests manifests/train.csv manifests/valid.csv manifests/test.csv



== train.csv ==
Total lines         : 163023
Clean (ok=True)     : 161297 (98.94%)
Missing image       : 0
Missing txt         : 1723
Missing font        : 0
Any missing (.img/.txt/.font): 1723
Length mismatches   : 3
Issues total        : 1726

== valid.csv ==
Total lines         : 4040
Clean (ok=True)     : 3827 (94.73%)
Missing image       : 0
Missing txt         : 213
Missing font        : 0
Any missing (.img/.txt/.font): 213
Length mismatches   : 0
Issues total        : 213

== test.csv ==
Total lines         : 17923
Clean (ok=True)     : 17923 (100.00%)
Missing image       : 0
Missing txt         : 0
Missing font        : 0
Any missing (.img/.txt/.font): 0
Length mismatches   : 0
Issues total        : 0


In [31]:
# Render 12 lines from the train manifest for visual inspection.
# Per the recorded output, the PNGs are written under exp/viz/ relative to the current
# working directory, grouped as train_single/<font>/ and train_multiple/.
!python /content/drive/MyDrive/GothiRead/scripts/visualize_line.py \
  --manifest manifests/train.csv \
  --num 12


[OK] Wrote exp/viz/train_single/schwabacher/98601.png
[OK] Wrote exp/viz/train_single/bastarda/109717.png
[OK] Wrote exp/viz/train_single/antiqua/148078.png
[OK] Wrote exp/viz/train_single/gotico-antiqua/153054.png
[OK] Wrote exp/viz/train_single/gotico-antiqua/145598.png
[OK] Wrote exp/viz/train_single/gotico-antiqua/141334.png
[OK] Wrote exp/viz/train_single/fraktur/07541.png
[OK] Wrote exp/viz/train_single/gotico-antiqua/90210.png
[OK] Wrote exp/viz/train_single/fraktur/100013.png
[OK] Wrote exp/viz/train_single/bastarda/110371.png
[OK] Wrote exp/viz/train_single/gotico-antiqua/141249.png
[OK] Wrote exp/viz/train_multiple/18546.png
