# 01. Environment & Repo Setup
# 02. Data Loader, Alignment Checks, Metric

Run each cell in order. If a step fails, re-run the cell after fixing the issue.

In [1]:
# Mount Google Drive at /content/drive; force_remount=True re-mounts even when a mount already exists.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
# Install the pinned PaddlePaddle (GPU), PaddleOCR and jiwer versions used by this notebook.
# !pip uninstall -y paddleocr paddlepaddle paddlepaddle-gpu

# # In your Colab before running the script
!pip install "paddlepaddle-gpu==2.6.2" "paddleocr==2.9.0" "jiwer==3.0.3"

# %cd /content
# !git clone https://github.com/PaddlePaddle/PaddleOCR.git
# %cd PaddleOCR

# # Install dependencies (if not already installed)
# !pip install "paddlepaddle-gpu>=2.5.0" -U
# !pip install -r requirements.txt


Collecting paddlepaddle-gpu==2.6.2
  Downloading paddlepaddle_gpu-2.6.2-cp312-cp312-manylinux1_x86_64.whl.metadata (8.6 kB)
Collecting paddleocr==2.9.0
  Downloading paddleocr-2.9.0-py3-none-any.whl.metadata (8.4 kB)
Collecting jiwer==3.0.3
  Downloading jiwer-3.0.3-py3-none-any.whl.metadata (2.6 kB)
Collecting astor (from paddlepaddle-gpu==2.6.2)
  Downloading astor-0.8.1-py2.py3-none-any.whl.metadata (4.2 kB)
Collecting opt-einsum==3.3.0 (from paddlepaddle-gpu==2.6.2)
  Downloading opt_einsum-3.3.0-py3-none-any.whl.metadata (6.5 kB)
Collecting imgaug (from paddleocr==2.9.0)
  Downloading imgaug-0.4.0-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting pyclipper (from paddleocr==2.9.0)
  Downloading pyclipper-1.4.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (8.6 kB)
Collecting lmdb (from paddleocr==2.9.0)
  Downloading lmdb-1.7.5-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (1.4 kB)
Collecting numpy>=1.13 (from paddle

# Normal Packages

In [3]:
# # Install core packages (HF + metrics). Re-run if Colab restarts.
# !pip -q install -U pip
!pip install -U transformers accelerate datasets evaluate jiwer Pillow regex editdistance sentencepiece timm

Collecting transformers
  Downloading transformers-4.57.6-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-4.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting jiwer
  Downloading jiwer-4.0.0-py3-none-any.whl.metadata (3.3 kB)
Collecting Pillow
  Downloading pillow-12.1.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (8.8 kB)
Collecting regex
  Downloading regex-2026.1.15-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-23.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.0 kB)
Downloading transformers-4

In [4]:
# Replace the preinstalled torch/torchvision/torchaudio with builds from the CUDA 12.4 wheel index,
# then print the torch version that ended up installed.
# NOTE(review): no versions are pinned, so the installed release depends on what the index serves at run time.
!pip uninstall -y torch torchvision torchaudio
!pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
!python -c "import torch; print(torch.__version__)"


Found existing installation: torch 2.9.0+cu126
Uninstalling torch-2.9.0+cu126:
  Successfully uninstalled torch-2.9.0+cu126
Found existing installation: torchvision 0.24.0+cu126
Uninstalling torchvision-0.24.0+cu126:
  Successfully uninstalled torchvision-0.24.0+cu126
Found existing installation: torchaudio 2.9.0+cu126
Uninstalling torchaudio-2.9.0+cu126:
  Successfully uninstalled torchaudio-2.9.0+cu126
Looking in indexes: https://download.pytorch.org/whl/cu124
Collecting torch
  Downloading https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp312-cp312-linux_x86_64.whl.metadata (28 kB)
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cu124/torchvision-0.21.0%2Bcu124-cp312-cp312-linux_x86_64.whl.metadata (6.1 kB)
Collecting torchaudio
  Downloading https://download.pytorch.org/whl/cu124/torchaudio-2.6.0%2Bcu124-cp312-cp312-linux_x86_64.whl.metadata (6.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading https://download.pytorch.org/

In [5]:
# Quietly install safetensors (version not pinned).
!pip -q install safetensors


In [6]:
# Ensure the project directory tree exists under the Drive base folder.
# exist_ok=True makes this cell safe to run repeatedly.
import os

base = "/content/drive/MyDrive/GothiRead"
subdirs = [
    "src/data", "src/eval", "src/models",
    "scripts",
    "data/train", "data/val", "data/test_public",
    "exp",
]
for sd in subdirs:
    target_dir = os.path.join(base, sd)
    os.makedirs(target_dir, exist_ok=True)
print("Project folders ready at", base)

Project folders ready at /content/drive/MyDrive/GothiRead


# Dataset Preparation

In [7]:
# Imports and project path setup for the dataset-preparation section.
# NOTE(review): os, glob, json, Path, Image and re are imported here but not referenced in this cell.
import os, glob, json
from pathlib import Path
from PIL import Image
import regex as re

# Adjust base if needed
BASE = "/content/drive/MyDrive/GothiRead"
print("Base:", BASE)

# Make sure python can find our src when you place the repo at BASE
import sys
if BASE not in sys.path:
    sys.path.insert(0, BASE)

# Project-local modules; these resolve through the BASE entry added to sys.path above.
from src.data.icdar24 import LineDataset, split_into_chars
from src.eval.metrics import compute_ocr_metrics, compute_font_cer
from src.data.build_vocab import build_char_vocab, save_vocab

Base: /content/drive/MyDrive/GothiRead


In [8]:
!unzip "/content/drive/MyDrive/GothiRead/data/Dataset.zip" -d "/content/dataset"


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 extracting: /content/dataset/valid/single/rotunda/18989.txt  
  inflating: /content/dataset/valid/single/rotunda/138351.font  
  inflating: /content/dataset/valid/single/rotunda/138497.jpg  
  inflating: /content/dataset/valid/single/rotunda/19024.jpg  
 extracting: /content/dataset/valid/single/rotunda/138313.txt  
 extracting: /content/dataset/valid/single/rotunda/138121.txt  
 extracting: /content/dataset/valid/single/rotunda/138014.txt  
  inflating: /content/dataset/valid/single/rotunda/138413.jpg  
 extracting: /content/dataset/valid/single/rotunda/194127.txt  
 extracting: /content/dataset/valid/single/rotunda/138338.txt  
  inflating: /content/dataset/valid/single/rotunda/194110.jpg  
  inflating: /content/dataset/valid/single/rotunda/18983.txt  
  inflating: /content/dataset/valid/single/rotunda/138486.font  
  inflating: /content/dataset/valid/single/rotunda/05410.txt  
  inflating: /content/dataset/valid/singl

In [9]:
# Carve a test split out of the extracted dataset using the project's make_test_split.py
# (10% ratio, both "single" and "multiple" subsets included).
# NOTE(review): --move relocates the selected triplets (the recorded output reports them as moved to
# /content/dataset/test), so running this cell a second time presumably moves a further share out of
# train — confirm before re-running on an already-split dataset.
!python /content/drive/MyDrive/GothiRead/scripts/make_test_split.py \
  --root /content \
  --split_ratio 0.10 \
  --include_single True \
  --include_multiple True \
  --move


{
  "root": "/content",
  "train_root": "/content/dataset/train",
  "test_root": "/content/dataset/test",
  "include_single": true,
  "include_multiple": true,
  "split_ratio": 0.1,
  "seed": 42,
  "mode": "move",
  "dry_run": false,
  "leaf_dirs_processed": 17923,
  "triplets_considered": 179223,
  "triplets_selected": 17923
}

Done. Moved 17923 triplet(s) to test.
Test set available at: /content/dataset/test


In [10]:
# Build one manifest CSV per split with the project's build_manifest.py.
# The %cd makes /content the working directory, so the relative --out-dir resolves to /content/manifests.
%cd /content/
!python /content/drive/MyDrive/GothiRead/scripts/build_manifest.py \
  --data-root /content/dataset  \
  --splits train valid test \
  --out-dir manifests

/content
[OK] Wrote manifests/train.csv (163023 rows)
[OK] Wrote manifests/valid.csv (4040 rows)
[OK] Wrote manifests/test.csv (17923 rows)


In [11]:
# Filter each manifest with the project's filter_clean.py; results are written next to the inputs
# with the "_clean" suffix (train_clean.csv, valid_clean.csv, test_clean.csv per the recorded output).
# Paths are relative, so this relies on the working directory being /content.
!python /content/drive/MyDrive/GothiRead/scripts/filter_clean.py \
  --manifests manifests/train.csv manifests/valid.csv manifests/test.csv \
  --out-dir manifests --suffix _clean



[OK] train.csv: kept 161297/163023 (98.94%) -> manifests/train_clean.csv
[OK] valid.csv: kept 3827/4040 (94.73%) -> manifests/valid_clean.csv
[OK] test.csv: kept 17923/17923 (100.00%) -> manifests/test_clean.csv


In [None]:
# import json, glob

# # runs = glob.glob("/content/runs/*/metrics.json")
# # runs = glob.glob("/content/manifests/donut_sbhavy_donut-base-ocr/metrics.json")
# rows = []
# for path in runs:
#     m = json.load(open(path))
#     rows.append({
#         "run": path.split("/")[-2],
#         "CER": m["CER"],
#         "WER": m["WER"],
#         # "Avg. Latency ms": m.get("timing", {}).get("avg_latency_ms", 0),
#         # "num_beams": m.get("decode", {}).get("num_beams", 1),
#         # "length_penalty": m.get("decode", {}).get("length_penalty", 1.0),
#     })


# import pandas as pd
# df = pd.DataFrame(rows).sort_values("CER")
# print(df)


In [12]:
import csv
from pathlib import Path

def manifest_to_rec_txt(manifest_csv, out_txt):
    """Convert a manifest CSV into a tab-separated recognition label file.

    Every kept row becomes one output line of the form
    ``<image_path><TAB><ground-truth text>``.

    A row is kept only when all of the following hold:
      * its ``ok`` column is one of "TRUE", "True" or "1" (the same set the
        other manifest helpers in this notebook accept),
      * both ``image_path`` and ``txt_path`` are non-empty,
      * ``txt_path`` points to an existing file,
      * the file's text is non-empty after stripping surrounding whitespace.

    Args:
        manifest_csv: Path to a CSV with ``ok``, ``image_path`` and
            ``txt_path`` columns.
        out_txt: Path of the label file to (over)write.

    Raises:
        KeyError: If a kept row lacks ``image_path`` or ``txt_path``.
    """
    accepted_ok = ("TRUE", "True", "1")
    lines = []
    with open(manifest_csv, newline="", encoding="utf-8") as f:
        for r in csv.DictReader(f):
            if r.get("ok") not in accepted_ok:
                continue
            img, txt = r["image_path"], r["txt_path"]
            if not (img and txt):
                continue
            txt_path = Path(txt)
            # Skip rows whose transcription file is gone instead of aborting the whole export.
            if not txt_path.is_file():
                continue
            gt = txt_path.read_text(encoding="utf-8").strip()
            # An empty label carries no supervision; drop it.
            if not gt:
                continue
            # NOTE(review): text containing an internal tab or newline is written verbatim and would
            # break the one-sample-per-line layout — confirm the transcriptions are single-line.
            lines.append(f"{img}\t{gt}\n")
    # The output is opened only after the manifest was read completely, so a failure while
    # reading leaves any previously written label file untouched.
    with open(out_txt, "w", encoding="utf-8") as f:
        f.writelines(lines)

root = "/content"  # adjust if needed

# Inputs: the cleaned train/valid manifests under <root>/manifests.
manifest_train = f"{root}/manifests/train_clean.csv"
manifest_val   = f"{root}/manifests/valid_clean.csv"

# Outputs: label files under ./train_data/gothiread (relative to the current working directory).
out_train_txt = "./train_data/gothiread/rec_gt_train.txt"
out_val_txt   = "./train_data/gothiread/rec_gt_val.txt"

label_dir = Path("./train_data/gothiread")
label_dir.mkdir(parents=True, exist_ok=True)

# Convert each manifest into its label file, train first, then valid.
for manifest_csv, label_txt in ((manifest_train, out_train_txt), (manifest_val, out_val_txt)):
    manifest_to_rec_txt(manifest_csv, label_txt)

for tag, label_txt in (("train:", out_train_txt), ("val  :", out_val_txt)):
    print(tag, label_txt)


train: ./train_data/gothiread/rec_gt_train.txt
val  : ./train_data/gothiread/rec_gt_val.txt


In [13]:
import csv, pathlib

def manifest_to_rec_gt(csv_path, out_txt):
    """Write a tab-separated ``image_path<TAB>text`` label file from a manifest CSV.

    Rows are emitted only when their ``ok`` column is "TRUE", "True" or "1",
    the file named by ``txt_path`` exists, and its text is non-empty after
    stripping surrounding whitespace. ``out_txt`` is overwritten.
    """
    accepted_flags = {"TRUE", "True", "1"}
    with open(csv_path, newline="", encoding="utf-8") as src:
        with open(out_txt, "w", encoding="utf-8") as sink:
            for record in csv.DictReader(src):
                if record.get("ok") in accepted_flags:
                    image = record["image_path"]
                    label_file = pathlib.Path(record["txt_path"])
                    if label_file.is_file():
                        text = label_file.read_text(encoding="utf-8").strip()
                        if text:
                            sink.write("{}\t{}\n".format(image, text))

# (manifest CSV, label file) pairs; processed in order: train, then valid.
_rec_gt_jobs = [
    ("/content/manifests/train_clean.csv", "./train_data/gothiread/rec_gt_train.txt"),
    ("/content/manifests/valid_clean.csv", "./train_data/gothiread/rec_gt_val.txt"),
]
for _manifest_csv, _label_txt in _rec_gt_jobs:
    manifest_to_rec_gt(_manifest_csv, _label_txt)


# TrOCR

In [None]:
# !python /content/drive/MyDrive/GothiRead/scripts/zeroshot/zeroshot_trocr.py \
#         --manifest /content/manifests/valid_clean.csv \
#         --model microsoft/trocr-large-printed \
#         --num_beams 1 \
#         --batch_size 4 \
#         --max_length 128 \
#         # --limit 8000

In [None]:
# !mkdir -p "/content/drive/MyDrive/manifests_runs_backup"
# !cp -r /content/manifests/runs "/content/drive/MyDrive/manifests_runs_backup"
# print("Copied successfully!")

In [None]:
# !pip -q install huggingface_hub
# !huggingface-cli login


In [None]:
# !python - << 'PY'
# import pathlib

# p = pathlib.Path("/content/drive/MyDrive/GothiRead/scripts/finetune_trocr.py")
# text = p.read_text(encoding="utf-8")

# text = text.replace("evaluation_strategy=", "eval_strategy=")

# p.write_text(text, encoding="utf-8")
# print("✔ Patched evaluation_strategy -> eval_strategy")


In [None]:
# !python /content/drive/MyDrive/GothiRead/scripts/finetune_trocr.py \
#   --train_manifest /content/manifests/train_clean.csv \
#   --val_manifest /content/manifests/valid_clean.csv \
#   --model_name /content/drive/MyDrive/GothiRead/runs/trocr_bs32_a100/checkpoint-2400 \
#   --out_dir /content/drive/MyDrive/GothiRead/runs/trocr_bs40_a100 \
#   --epochs 4 \
#   --train_bs 40 \
#   --eval_bs 16 \
#   --lr 1e-5  # --lr 3e-5


In [None]:
# !python /content/drive/MyDrive/GothiRead/scripts/finetune_trocr.py \
#   --train_manifest /content/manifests/train_clean.csv \
#   --val_manifest /content/manifests/valid_clean.csv \
#   --model_name microsoft/trocr-base-handwritten \
#   --out_dir /content/drive/MyDrive/GothiRead/runs/trocr_a100_stable \
#   --epochs 3 \
#   --train_bs 54 --grad_accum 1 --eval_bs 8 \
#   --eval_steps 1000 --save_steps 1000 \
#   --max_label_len 128 \
#   --val_eval_limit 1000


# PaddleOCR


In [None]:
# -------------------------------------------

In [None]:
# !pip uninstall -y paddlex modelscope torch paddleocr
# !pip install "paddlepaddle-gpu==2.6.2" -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
# !pip install "paddleocr==2.9.0" opencv-python rapidfuzz jiwer


# # !pip install -U "paddleocr>=3.0.0" "paddlex>=3.0.0"
# # (optional but recommended to avoid older paddlepaddle)
# # For CPU only:
# # !pip install "paddlepaddle==3.0.0b1" -i https://www.paddlepaddle.org.cn/packages/stable/cpu/

# # For GPU (if CUDA 11.8, adjust if needed):
# # !pip install "paddlepaddle-gpu==3.0.0b1" -i https://www.paddlepaddle.org.cn/packages/stable/cu118/


In [None]:
# !wget https://paddleocr.bj.bcebos.com/PP-OCRv5/rec/en/en_PP-OCRv5_server_rec_infer.tar
# !tar -xf en_PP-OCRv5_server_rec_infer.tar -C /content/


In [14]:
!git clone https://github.com/PaddlePaddle/PaddleOCR.git


Cloning into 'PaddleOCR'...
remote: Enumerating objects: 311077, done.[K
remote: Counting objects: 100% (1247/1247), done.[K
remote: Compressing objects: 100% (298/298), done.[K
remote: Total 311077 (delta 1074), reused 1010 (delta 949), pack-reused 309830 (from 2)[K
Receiving objects: 100% (311077/311077), 1.65 GiB | 16.79 MiB/s, done.
Resolving deltas: 100% (246146/246146), done.


In [None]:
# # %cd /content/PaddleOCR

# # # # 1) Download the PP-OCRv5_server_rec pretrained model
# !wget https://paddle-model-ecology.bj.bcebos.com/paddlex/official_pretrained_model/PP-OCRv5_server_rec_pretrained.pdparams

# # # # (optional) rename it to something shorter
# !mv PP-OCRv5_server_rec_pretrained.pdparams pretrain_ppocrv5_server_rec.pdparams


In [15]:
import csv, pathlib

def build_char_dict(csv_path, out_path):
    """Collect the character set of all accepted transcriptions and write it as a dictionary file.

    A manifest row contributes when its ``ok`` column is "TRUE", "True" or "1"
    and the file named by ``txt_path`` exists. The file's text is stripped of
    surrounding whitespace and every remaining character except a newline is
    recorded. The distinct characters are written to ``out_path`` in sorted
    order, one per line.
    """
    accepted_flags = {"TRUE", "True", "1"}
    alphabet = set()
    with open(csv_path, newline="", encoding="utf-8") as handle:
        for record in csv.DictReader(handle):
            if record.get("ok") not in accepted_flags:
                continue
            label_file = pathlib.Path(record["txt_path"])
            if not label_file.is_file():
                continue
            text = label_file.read_text(encoding="utf-8").strip()
            # Newlines inside a transcription are not dictionary symbols.
            alphabet.update(symbol for symbol in text if symbol != "\n")

    with open(out_path, "w", encoding="utf-8") as sink:
        sink.writelines(symbol + "\n" for symbol in sorted(alphabet))

# Build the character dictionary from the cleaned train manifest and store it inside the
# PaddleOCR checkout (requires /content/PaddleOCR/ppocr/utils to exist already).
build_char_dict(
    "/content/manifests/train_clean.csv",
    "/content/PaddleOCR/ppocr/utils/gothi_dict.txt"
)

In [16]:
import csv
from pathlib import Path

def manifest_to_rec_txt(manifest_csv, out_txt):
    """Convert a manifest CSV into a tab-separated recognition label file.

    Every kept row becomes one output line of the form
    ``<image_path><TAB><ground-truth text>``.

    A row is kept only when all of the following hold:
      * its ``ok`` column is one of "TRUE", "True" or "1" (the same set the
        other manifest helpers in this notebook accept),
      * both ``image_path`` and ``txt_path`` are non-empty,
      * ``txt_path`` points to an existing file,
      * the file's text is non-empty after stripping surrounding whitespace.

    Args:
        manifest_csv: Path to a CSV with ``ok``, ``image_path`` and
            ``txt_path`` columns.
        out_txt: Path of the label file to (over)write.

    Raises:
        KeyError: If a kept row lacks ``image_path`` or ``txt_path``.
    """
    accepted_ok = ("TRUE", "True", "1")
    lines = []
    with open(manifest_csv, newline="", encoding="utf-8") as f:
        for r in csv.DictReader(f):
            if r.get("ok") not in accepted_ok:
                continue
            img, txt = r["image_path"], r["txt_path"]
            if not (img and txt):
                continue
            txt_path = Path(txt)
            # Skip rows whose transcription file is gone instead of aborting the whole export.
            if not txt_path.is_file():
                continue
            gt = txt_path.read_text(encoding="utf-8").strip()
            # An empty label carries no supervision; drop it.
            if not gt:
                continue
            # NOTE(review): text containing an internal tab or newline is written verbatim and would
            # break the one-sample-per-line layout — confirm the transcriptions are single-line.
            lines.append(f"{img}\t{gt}\n")
    # The output is opened only after the manifest was read completely, so a failure while
    # reading leaves any previously written label file untouched.
    with open(out_txt, "w", encoding="utf-8") as f:
        f.writelines(lines)

root = "/content"  # adjust if needed

# Inputs: the cleaned train/valid manifests under <root>/manifests.
manifest_train = f"{root}/manifests/train_clean.csv"
manifest_val   = f"{root}/manifests/valid_clean.csv"

# Outputs: label files written directly into the current working directory.
out_train_txt = "./rec_gt_train.txt"
out_val_txt   = "./rec_gt_val.txt"

# This directory is created here although the label files above are not written into it.
label_dir = Path("./train_data/gothiread")
label_dir.mkdir(parents=True, exist_ok=True)

# Convert each manifest into its label file, train first, then valid.
for manifest_csv, label_txt in ((manifest_train, out_train_txt), (manifest_val, out_val_txt)):
    manifest_to_rec_txt(manifest_csv, label_txt)

for tag, label_txt in (("train:", out_train_txt), ("val  :", out_val_txt)):
    print(tag, label_txt)


train: ./rec_gt_train.txt
val  : ./rec_gt_val.txt


In [None]:
# !python /content/PaddleOCR/tools/train.py \
#   -c /content/PaddleOCR/PP-OCRv5_gothi_rec.yml \
#   -o Global.use_gpu=True \
#      Global.print_batch_step=50 \
#      Global.checkpoints="./output/PP-OCRv5_server_rec/latest" \
#      Global.eval_batch_step=1000

In [17]:
import os

# Navigate to the PaddleOCR directory
# %cd changes the kernel's working directory, so './output' below and the relative paths
# used by later cells resolve under /content/PaddleOCR.
%cd /content/PaddleOCR

# Create the output directory
os.makedirs('./output', exist_ok=True)
print("Output directory created in PaddleOCR: /content/PaddleOCR/output")

/content/PaddleOCR
Output directory created in PaddleOCR: /content/PaddleOCR/output


In [18]:
# !cp -r /content/PaddleOCR/output/PP-OCRv5_server_rec /content/drive/MyDrive/GothiRead/Backup2_PPOCRv5_server_rec
!cp -r /content/drive/MyDrive/GothiRead/models/PPOCR/Backup2_PPOCRv5_server_rec /content/PaddleOCR/output/PP-OCRv5_server_rec


In [19]:
# Run the project's patch_dict_164.py; per the recorded output it writes
# /content/PaddleOCR/ppocr/utils/gothi_dict_164.txt with 164 entries (one more than the input dict).
# NOTE(review): the alignment-extraction cells further down still pass gothi_dict.txt —
# confirm which dictionary file the recognizer config expects.
!python /content/drive/MyDrive/GothiRead/scripts/patch_dict_164.py


[INFO] Original dict length: 163
[INFO] After de-dup length: 163
[DONE] Patched dict written to: /content/PaddleOCR/ppocr/utils/gothi_dict_164.txt
[DONE] Final dict length: 164


In [None]:
# %cd /content/PaddleOCR

# !python tools/eval.py \
#   -c /content/PaddleOCR/PP-OCRv5_gothi_rec.yml \
#   -o Global.checkpoints=./output/PP-OCRv5_server_rec/best_accuracy

In [20]:
# Export the best_accuracy checkpoint as an inference model with PaddleOCR's tools/export_model.py.
# Both -o paths are relative and resolve against /content/PaddleOCR because of the %cd;
# the inference files land in ./inference/PP-OCRv5.
%cd /content/PaddleOCR
!python tools/export_model.py \
  -c /content/PaddleOCR/PP-OCRv5_gothi_rec.yml \
  -o Global.checkpoints=./output/PP-OCRv5_server_rec/best_accuracy \
     Global.save_inference_dir=./inference/PP-OCRv5

/content/PaddleOCR
Skipping import of the encryption module.
W0119 17:48:18.527384  6700 gpu_resources.cc:119] Please NOTE: device: 0, GPU Compute Capability: 7.5, Driver API Version: 12.4, Runtime API Version: 11.8
W0119 17:48:18.549125  6700 gpu_resources.cc:164] device: 0, cuDNN Version: 9.2.
[2026/01/19 17:48:19] ppocr INFO: resume from ./output/PP-OCRv5_server_rec/best_accuracy
[2026/01/19 17:48:19] ppocr INFO: Export inference config file to ./inference/PP-OCRv5/inference.yml
Skipping import of the encryption module
I0119 17:48:23.396008  6700 program_interpreter.cc:212] New Executor is Running.
[2026/01/19 17:48:23] ppocr INFO: inference model is saved to ./inference/PP-OCRv5/inference


In [None]:
# !python /content/drive/MyDrive/GothiRead/scripts/zeroshot_paddleocr.py \
#   --manifest /content/manifests/test_clean.csv \
#   --rec_model_dir /content/PaddleOCR/inference/PP-OCRv5/ \
#   --rec_char_dict_path /content/PaddleOCR/ppocr/utils/gothi_dict.txt \
#   --use_gpu

2026-01-13 14:08:46.155068: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1768313326.394427    8857 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1768313326.458555    8857 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1768313326.924436    8857 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768313326.924487    8857 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768313326.924491    8857 computation_placer.cc:177] computation placer alr

In [21]:
!pip install regex opencv-python-headless pyyaml

Collecting numpy<2.3.0,>=2 (from opencv-python-headless)
  Downloading numpy-2.2.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-2.2.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.5/16.5 MB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
paddleocr 2.9.0 requires numpy<2.0, but you have numpy 2.2.6 which is incompatible.
numba 0.60.0 requires numpy<2.1,>=1.22, but y

In [None]:
# Produce length-limited copies of the cleaned manifests (suffix _T40) with the project's
# prefilter_extract.py, using --max-gt-len 40. Per the recorded output roughly 41-42% of rows are kept.
!python /content/drive/MyDrive/GothiRead/scripts/prefilter_extract.py \
  --in /content/manifests/train_clean.csv \
  --out /content/manifests/train_clean_T40.csv \
  --max-gt-len 40

# Same filter for the validation manifest.
!python /content/drive/MyDrive/GothiRead/scripts/prefilter_extract.py \
  --in /content/manifests/valid_clean.csv \
  --out /content/manifests/valid_clean_T40.csv \
  --max-gt-len 40


[OK] /content/manifests/train_clean_T40.csv: kept 66706/161297 (41.36%) with max_gt_len=40
[OK] /content/manifests/valid_clean_T40.csv: kept 1617/3827 (42.25%) with max_gt_len=40


In [22]:
# Repeat of the Drive mount from the first cell; force_remount=True re-mounts /content/drive.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [23]:
!PYTHONPATH=$SCRIPTS_DIR:$PYTHONPATH \
python /content/drive/MyDrive/GothiRead/scripts/extract_align_dp.py \
   --manifest /content/manifests/train_clean.csv \
  --rec_model_dir /content/PaddleOCR/inference/PP-OCRv5 \
  --rec_char_dict_path /content/PaddleOCR/ppocr/utils/gothi_dict.txt \
  --out /content/manifests/train_align.jsonl \
  --use_gpu --batch_size 32 \
  --class_aware_gating --rare_quantile 0.25 \
  --hard_sample_gating --hard_id_prefix "multiple/" --mixed_dom_threshold 0.85 \
  --min_coverage 0.90 --max_gt_gap_run 1 --max_edits_abs 2 --max_edits_frac 0.10 \
  --min_coverage_rare 0.75 --max_gt_gap_run_rare 3 --max_edits_abs_rare 4 --max_edits_frac_rare 0.25 \
  --min_unique_ratio 0.08 --max_repeat_run 8 \
  --print_class_stats --debug_rejects 20


[DICT] loaded 163 tokens from infer_cfg in /content/PaddleOCR/inference/PP-OCRv5
[1m[35m--- Running analysis [ir_graph_build_pass][0m
I0119 17:48:44.053442  6970 executor.cc:187] Old Executor is Running.
[1m[35m--- Running analysis [ir_analysis_pass][0m
[32m--- Running IR pass [map_op_to_another_pass][0m
I0119 17:48:44.125610  6970 fuse_pass_base.cc:59] ---  detected 29 subgraphs
[32m--- Running IR pass [is_test_pass][0m
[32m--- Running IR pass [simplify_with_basic_ops_pass][0m
[32m--- Running IR pass [delete_quant_dequant_linear_op_pass][0m
[32m--- Running IR pass [delete_weight_dequant_linear_op_pass][0m
[32m--- Running IR pass [constant_folding_pass][0m
I0119 17:48:44.189554  6970 fuse_pass_base.cc:59] ---  detected 3 subgraphs
[32m--- Running IR pass [silu_fuse_pass][0m
[32m--- Running IR pass [conv_bn_fuse_pass][0m
I0119 17:48:44.385874  6970 fuse_pass_base.cc:59] ---  detected 86 subgraphs
[32m--- Running IR pass [conv_eltwiseadd_bn_fuse_pass][0m
[32m--- 

In [24]:
!PYTHONPATH=$SCRIPTS_DIR:$PYTHONPATH \
python /content/drive/MyDrive/GothiRead/scripts/extract_align_dp.py \
   --manifest /content/manifests/valid_clean.csv \
  --rec_model_dir /content/PaddleOCR/inference/PP-OCRv5 \
  --rec_char_dict_path /content/PaddleOCR/ppocr/utils/gothi_dict.txt \
  --out /content/manifests/valid_align.jsonl \
  --use_gpu --batch_size 32 \
  --class_aware_gating --rare_quantile 0.25 \
  --hard_sample_gating --hard_id_prefix "multiple/" --mixed_dom_threshold 0.85 \
  --min_coverage 0.90 --max_gt_gap_run 1 --max_edits_abs 2 --max_edits_frac 0.10 \
  --min_coverage_rare 0.75 --max_gt_gap_run_rare 3 --max_edits_abs_rare 4 --max_edits_frac_rare 0.25 \
  --min_unique_ratio 0.08 --max_repeat_run 8 \
  --print_class_stats --debug_rejects 20

[DICT] loaded 163 tokens from infer_cfg in /content/PaddleOCR/inference/PP-OCRv5
[1m[35m--- Running analysis [ir_graph_build_pass][0m
I0119 18:06:20.472425 11461 executor.cc:187] Old Executor is Running.
[1m[35m--- Running analysis [ir_analysis_pass][0m
[32m--- Running IR pass [map_op_to_another_pass][0m
I0119 18:06:20.740113 11461 fuse_pass_base.cc:59] ---  detected 29 subgraphs
[32m--- Running IR pass [is_test_pass][0m
[32m--- Running IR pass [simplify_with_basic_ops_pass][0m
[32m--- Running IR pass [delete_quant_dequant_linear_op_pass][0m
[32m--- Running IR pass [delete_weight_dequant_linear_op_pass][0m
[32m--- Running IR pass [constant_folding_pass][0m
I0119 18:06:20.800374 11461 fuse_pass_base.cc:59] ---  detected 3 subgraphs
[32m--- Running IR pass [silu_fuse_pass][0m
[32m--- Running IR pass [conv_bn_fuse_pass][0m
I0119 18:06:21.003820 11461 fuse_pass_base.cc:59] ---  detected 86 subgraphs
[32m--- Running IR pass [conv_eltwiseadd_bn_fuse_pass][0m
[32m--- 

In [None]:
# !python /content/drive/MyDrive/GothiRead/scripts/extract_align.py \
#   --manifest /content/manifests/train_clean_T40.csv \
#   --rec_model_dir /content/PaddleOCR/inference/PP-OCRv5 \
#   --rec_char_dict_path /content/PaddleOCR/ppocr/utils/gothi_dict.txt \
#   --out /content/manifests/train_align.jsonl \
#   --rec_image_shape 3,32,320 \
#   --batch_size 32 \
#   --use_gpu

# !python /content/drive/MyDrive/GothiRead/scripts/extract_align.py \
#   --manifest /content/manifests/valid_clean_T40.csv \
#   --rec_model_dir /content/PaddleOCR/inference/PP-OCRv5 \
#   --rec_char_dict_path /content/PaddleOCR/ppocr/utils/gothi_dict.txt \
#   --out /content/manifests/valid_align.jsonl \
#   --rec_image_shape 3,32,320 \
#   --batch_size 32 \
#   --use_gpu


[DICT] loaded 163 tokens from infer_cfg in /content/PaddleOCR/inference/PP-OCRv5
[1m[35m--- Running analysis [ir_graph_build_pass][0m
I0112 11:27:24.290059  5646 executor.cc:187] Old Executor is Running.
[1m[35m--- Running analysis [ir_analysis_pass][0m
[32m--- Running IR pass [map_op_to_another_pass][0m
I0112 11:27:24.366854  5646 fuse_pass_base.cc:59] ---  detected 29 subgraphs
[32m--- Running IR pass [is_test_pass][0m
[32m--- Running IR pass [simplify_with_basic_ops_pass][0m
[32m--- Running IR pass [delete_quant_dequant_linear_op_pass][0m
[32m--- Running IR pass [delete_weight_dequant_linear_op_pass][0m
[32m--- Running IR pass [constant_folding_pass][0m
I0112 11:27:24.432633  5646 fuse_pass_base.cc:59] ---  detected 3 subgraphs
[32m--- Running IR pass [silu_fuse_pass][0m
[32m--- Running IR pass [conv_bn_fuse_pass][0m
I0112 11:27:24.635782  5646 fuse_pass_base.cc:59] ---  detected 86 subgraphs
[32m--- Running IR pass [conv_eltwiseadd_bn_fuse_pass][0m
[32m--- 

In [26]:
# Build the font vocabulary JSON from the train alignment file with the project's font_vocab.py
# (--min-count 5). The recorded output reports num_fonts=8.
!python /content/drive/MyDrive/GothiRead/scripts/font_vocab.py \
  --align-jsonl /content/manifests/train_align.jsonl \
  --out /content/manifests/font_vocab.json \
  --min-count 5


[OK] wrote /content/manifests/font_vocab.json num_fonts=8 total_graphemes=1573181


In [32]:
# Repeat of the Drive mount from the first cell; force_remount=True re-mounts /content/drive.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [28]:
# Train (or, with --resume, continue training) the font head with the project's train_font.py on top
# of the recognizer checkpoint; /content/PaddleOCR is put on PYTHONPATH for the script's imports.
# Outputs go to the font_v3 run directory on Drive.
# NOTE(review): the output recorded under this cell reports target_mult=2.0 and min_transitions=1,
# which does not match the flags passed here (--oversample-mult 5.0, --min-transitions 2); the output
# appears to come from an earlier invocation — re-run to refresh it.
!PYTHONPATH=/content/PaddleOCR:$PYTHONPATH \
python /content/drive/MyDrive/GothiRead/scripts/train_font.py \
  --train-align-jsonl /content/manifests/train_align.jsonl \
  --val-align-jsonl /content/manifests/valid_align.jsonl \
  --font-vocab /content/manifests/font_vocab.json \
  --rec-config /content/PaddleOCR/PP-OCRv5_gothi_rec.yml \
  --rec-checkpoint /content/PaddleOCR/output/PP-OCRv5_server_rec/best_accuracy.pdparams \
  --device gpu \
  --pooling attnmax \
  --context conv \
  --feat-source im2seq \
  --oversample transitions \
  --oversample-mult 5.0 \
  --min-transitions 2 \
  --min-range-len 4 \
  --epochs 15 \
  --out-dir /content/drive/MyDrive/GothiRead/runs/font_v3 --resume \
  --boundary-weight 0.70

[INFO] num_fonts=8 pooling=attnmax context=conv feat_source=im2seq oversample=transitions
[INFO] transition-oversample: boosted_unique=1559/59228 target_mult=2.0 extra=1559 total_indices=60787 min_transitions=1
Skipping import of the encryption module.
W0119 12:24:06.287958 17907 gpu_resources.cc:119] Please NOTE: device: 0, GPU Compute Capability: 7.5, Driver API Version: 12.4, Runtime API Version: 11.8
W0119 12:24:06.288975 17907 gpu_resources.cc:164] device: 0, cuDNN Version: 9.2.
[INFO] encoder feature dim D=2048, pooled dim=4096
[INFO] token counts (by id): [338942, 241865, 237060, 193835, 173979, 142473, 126629, 118398]
[INFO] class weights (by id): [0.7293909788131714, 0.8634487390518188, 0.872155487537384, 0.9645105004310608, 1.0180630683898926, 1.1250110864639282, 1.1933187246322632, 1.2341015338897705]
[RESUME] loaded /content/drive/MyDrive/GothiRead/runs/font_v3/checkpoint_last.pdparams
[RESUME] start_epoch=14 global_step=24687 best_acc=0.8084
[train] epoch=14 step=24700 avg

In [None]:
# ===========================
# Font Confusion Matrix Cell
# ===========================
# Runs the frozen recognizer plus the trained pooler/font head over an alignment file and reports
# a confusion matrix, per-class accuracy, and accuracy split by boundary vs. interior graphemes.
import sys

SCRIPTS_DIR = "/content/drive/MyDrive/GothiRead/scripts"
if SCRIPTS_DIR not in sys.path:  # keep sys.path free of duplicates when the cell is re-run
    sys.path.insert(0, SCRIPTS_DIR)

import json
from pathlib import Path
import numpy as np
import paddle

from font_dataset import AlignJsonlFontDataset, collate_font_batch
from rec_loader import load_rec_model_with_features, extract_rec_features
from train_font import RangePooler, FontHead  # same definitions as training

# ---------------------------
# CONFIG (EDIT THESE PATHS)
# ---------------------------
# Validation alignments. This previously pointed at train_align.jsonl, which made the
# "validation" numbers below training-set numbers; switch it back deliberately if
# train-set diagnostics are what you want.
VAL_ALIGN = Path("/content/manifests/valid_align.jsonl")
FONT_VOCAB = Path("/content/manifests/font_vocab.json")

REC_CONFIG = "/content/PaddleOCR/PP-OCRv5_gothi_rec.yml"
REC_CKPT = "/content/PaddleOCR/output/PP-OCRv5_server_rec/best_accuracy.pdparams"

RUN_DIR = Path("/content/drive/MyDrive/GothiRead/runs/font_v3")
HEAD_CKPT = RUN_DIR / "font_head_best.pdparams"
POOL_CKPT = RUN_DIR / "pooler_best.pdparams"

DEVICE = "gpu"
BATCH_SIZE = 32
REC_IMAGE_SHAPE = (3, 32, 320)
MAX_GRAPHEMES = 120
FEAT_SOURCE = "im2seq"
# NOTE(review): the training command in this notebook passes --min-range-len 4; confirm 2 is intended here.
MIN_RANGE_LEN = 2

# ---------------------------
# LOAD VOCAB
# ---------------------------
vocab = json.loads(FONT_VOCAB.read_text())
font2id = vocab["font2id"]
id2font = {v: k for k, v in font2id.items()}
K = vocab["num_fonts"]

print("Fonts:")
for i in range(K):
    print(f"  {i}: {id2font[i]}")

# ---------------------------
# DATASET
# ---------------------------
val_ds = AlignJsonlFontDataset(
    align_jsonl=VAL_ALIGN,
    font2id=font2id,
    rec_image_shape=REC_IMAGE_SHAPE,
    max_graphemes=MAX_GRAPHEMES,
)

val_dl = paddle.io.DataLoader(
    val_ds,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_font_batch,
    drop_last=False,
)

# ---------------------------
# LOAD MODELS
# ---------------------------
paddle.set_device("cpu" if DEVICE == "cpu" else "gpu")

rec_model = load_rec_model_with_features(
    config_path=REC_CONFIG,
    checkpoint_path=REC_CKPT,
    device=DEVICE,
)
rec_model.eval()
# Freeze the recognizer: it is used as a feature extractor only.
for p in rec_model.parameters():
    p.stop_gradient = True

# infer D (feature width) from one batch, since the pooler/head sizes depend on it
x0, r0, y0, m0 = next(iter(val_dl))
with paddle.no_grad():
    feats0 = extract_rec_features(rec_model, x0)
    F0 = feats0["im2seq"] if FEAT_SOURCE == "im2seq" else feats0["ctc_neck"]
D = F0.shape[-1]

# Pooling/context settings mirror the flags of the training command (attnmax / conv).
pooler = RangePooler(D=D, pooling="attnmax")
head = FontHead(
    in_dim=2 * D,
    hidden=512,
    num_fonts=K,
    dropout=0.0,
    context="conv",
)

pooler.set_state_dict(paddle.load(str(POOL_CKPT)))
head.set_state_dict(paddle.load(str(HEAD_CKPT)))
pooler.eval()
head.eval()


def _to_numpy(t):
    """Return `t` as a NumPy array, whether it is a framework tensor or already array-like."""
    return t.numpy() if hasattr(t, "numpy") else np.asarray(t)


# ---------------------------
# CONFUSION MATRIX
# ---------------------------
conf = np.zeros((K, K), dtype=np.int64)

boundary_correct = 0
boundary_total = 0
interior_correct = 0
interior_total = 0

# ---------------------------
# RUN VALIDATION
# ---------------------------
with paddle.no_grad():
    for x, ranges, y, mask in val_dl:
        feats = extract_rec_features(rec_model, x)
        F_btd = feats["im2seq"] if FEAT_SOURCE == "im2seq" else feats["ctc_neck"]

        # Range length per grapheme; entries with too-short ranges are excluded from scoring.
        rlen = (ranges[:, :, 1] - ranges[:, :, 0] + 1).astype("int64")
        valid = mask * (rlen >= MIN_RANGE_LEN)

        pooled = pooler(F_btd, ranges)
        logits = head(pooled)
        pred = paddle.argmax(logits, axis=-1)

        # Move everything to NumPy once per batch; indexing tensors element by element in a
        # Python double loop costs one device round-trip per grapheme.
        y_np = _to_numpy(y).astype(np.int64)
        pred_np = _to_numpy(pred).astype(np.int64)
        keep = ~(_to_numpy(valid) < 0.5)
        # assumes mask is 1 for real graphemes and 0 for padding, as its use in `valid`
        # implies — TODO confirm against collate_font_batch
        real = _to_numpy(mask) >= 0.5

        # A grapheme is a boundary when its font label differs from an adjacent *real* grapheme.
        # Padded neighbours are ignored; otherwise the last grapheme of every short line would
        # be counted as a boundary just because the padding label differs.
        change = (y_np[:, 1:] != y_np[:, :-1]) & real[:, 1:] & real[:, :-1]
        is_boundary = np.zeros(y_np.shape, dtype=bool)
        is_boundary[:, 1:] |= change
        is_boundary[:, :-1] |= change

        gt_kept = y_np[keep]
        pr_kept = pred_np[keep]
        bnd_kept = is_boundary[keep]
        hit = gt_kept == pr_kept

        # Unbuffered scatter-add so repeated (gt, pred) pairs are all counted.
        np.add.at(conf, (gt_kept, pr_kept), 1)

        boundary_total += int(bnd_kept.sum())
        boundary_correct += int((hit & bnd_kept).sum())
        interior_total += int((~bnd_kept).sum())
        interior_correct += int((hit & ~bnd_kept).sum())

# ---------------------------
# PRINT RESULTS
# ---------------------------
print("\n=== CONFUSION MATRIX (rows=true, cols=pred) ===")
print(conf)

print("\n=== PER-CLASS ACCURACY ===")
for i in range(K):
    tot = conf[i].sum()
    acc = conf[i, i] / max(1, tot)
    print(f"{id2font[i]:>12s}: {acc:.4f}  (n={tot})")

print("\n=== BOUNDARY vs INTERIOR ===")
print(f"Boundary acc : {boundary_correct / max(1, boundary_total):.4f}  (n={boundary_total})")
print(f"Interior acc : {interior_correct / max(1, interior_total):.4f}  (n={interior_total})")


Fonts:
  0: a
  1: f
  2: G
  3: b
  4: t
  5: s
  6: r
  7: i
Skipping import of the encryption module.




In [31]:
# Generate a validation report for the trained font head with the project's font_report.py.
# --out-dir is relative and resolves against the current working directory.
# NOTE(review): the recorded run aborts with "No net.*.weight keys found"; the checkpoint keys listed
# in the traceback are ln/dw/pw/mlp.*, so font_report.py apparently expects a different head layout
# than the one stored in font_head_best.pdparams. This cell also passes --pooling meanmax while the
# training cell uses --pooling attnmax — confirm the script supports this head and align the flags.
!PYTHONPATH=/content/PaddleOCR:$PYTHONPATH \
  python /content/drive/MyDrive/GothiRead/scripts/font_report.py \
  --val-align-jsonl /content/manifests/valid_align.jsonl \
  --font-vocab /content/manifests/font_vocab.json \
  --rec-config /content/PaddleOCR/PP-OCRv5_gothi_rec.yml \
  --rec-checkpoint /content/PaddleOCR/output/PP-OCRv5_server_rec/best_accuracy.pdparams \
  --font-head /content/drive/MyDrive/GothiRead/runs/font_v3/font_head_best.pdparams \
  --pooling meanmax \
  --batch-size 32 \
  --out-dir runs/font_head_meanmax_sqrtinv/val_report_final


Skipping import of the encryption module.
W0119 18:34:34.116117 18816 gpu_resources.cc:119] Please NOTE: device: 0, GPU Compute Capability: 7.5, Driver API Version: 12.4, Runtime API Version: 11.8
W0119 18:34:34.116897 18816 gpu_resources.cc:164] device: 0, cuDNN Version: 9.2.
Traceback (most recent call last):
  File "/content/drive/MyDrive/GothiRead/scripts/font_report.py", line 378, in <module>
    main()
  File "/content/drive/MyDrive/GothiRead/scripts/font_report.py", line 296, in main
    in_dim, hidden, num_fonts_head, is_mlp = _infer_head_structure_from_state(
                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/drive/MyDrive/GothiRead/scripts/font_report.py", line 94, in _infer_head_structure_from_state
    raise RuntimeError(
RuntimeError: No net.*.weight keys found. Keys: ['ln.weight', 'ln.bias', 'dw.weight', 'dw.bias', 'pw.weight', 'pw.bias', 'mlp.0.weight', 'mlp.0.bias', 'mlp.3.weight', 'mlp.3.bias'] ...


In [None]:
# ============================================================
# Line-level Font Classification (CSV -> majority font label)
# Google Colab T4 GPU - PaddlePaddle
# ============================================================

!pip install paddlepaddle-gpu==2.6.1 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
!pip install pillow pandas numpy

import os, json, random, math, time
from collections import Counter, defaultdict

import numpy as np
import pandas as pd
from PIL import Image

import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.io import Dataset, DataLoader
from paddle.vision import transforms as T
from paddle.vision.models import resnet50

print("Paddle:", paddle.__version__)
print("CUDA available:", paddle.device.is_compiled_with_cuda())


Looking in links: https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
Paddle: 2.6.1
CUDA available: True


# Donut


In [None]:
# !python /content/drive/MyDrive/GothiRead/scripts/zeroshot/zeroshot_donut.py \
#     --manifest /content/manifests/valid_clean.csv \
#     --model sbhavy/donut-base-ocr \
#     --batch_size 1 \
#     --max_length 96 \
#     --image_size 640 \


# !python /content/drive/MyDrive/GothiRead/scripts/zeroshot/zeroshot_donut.py \
#     --manifest /content/manifests/valid_clean.csv \
#     --model naver-clova-ix/donut-base  \
#     --batch_size 1 \
#     --max_length 96 \
#     --image_size 640 \



# ParSeq

In [None]:
# !python /content/drive/MyDrive/GothiRead/scripts/zeroshot/zero_shot_parseq.py \
#   --manifest /content/manifests/valid_clean.csv \
#     --model abinet \
#     --batch_size 32 \
#     --max_length 32

# !python /content/drive/MyDrive/GothiRead/scripts/zeroshot/zero_shot_parseq.py \
#   --manifest /content/manifests/valid_clean.csv \
#     --model vitstr  \
#     --batch_size 32 \
#     --max_length 32

# !python /content/drive/MyDrive/GothiRead/scripts/zeroshot/zero_shot_parseq.py \
#   --manifest /content/manifests/valid_clean.csv \
#     --model parseq \
#     --batch_size 32 \
#     --max_length 32 \
#     --image_height 32 \
#     --image_width 128 \
#     --fp16

