In [6]:
from datasets import load_dataset
from dotenv import load_dotenv
import os

# Load environment variables from a local .env file, if one exists.
load_dotenv()
# Hugging Face access token read from the environment (not passed to load_dataset here).
token=os.getenv("HF_TOKEN")

ds = load_dataset('sylvain471/ordonnances-typewriter-5')

# Hold out 5% of the rows for validation. The seed is fixed so that the
# train/test membership is the same on every Restart & Run All.
dataset = ds['train'].train_test_split(test_size=0.05, seed=42)
dataset

Using the latest cached version of the dataset since sylvain471/ordonnances-typewriter-5 couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /media/DATAEXT4/.huggingface/datasets/sylvain471___ordonnances-typewriter-5/default/0.0.0/86ba6dfe799cccfd855bdd88c6bc26ee1e8687b9 (last modified on Tue Apr  8 17:51:27 2025).


DatasetDict({
    train: Dataset({
        features: ['text', 'image'],
        num_rows: 6452
    })
    test: Dataset({
        features: ['text', 'image'],
        num_rows: 340
    })
})

In [7]:
import unicodedata
from PIL import Image

def prepare_text(text):
    """Turn a transcription into a space-separated sequence of symbols.

    The text is NFC-normalized, a fixed set of characters is substituted
    (or removed, for the double quote), and every remaining character is
    emitted as its own token. A literal space becomes the token '<space>'.

    Args:
        text: the raw transcription string.

    Returns:
        The tokens joined by single spaces; an empty string for empty input.
    """
    # Applied in this order, after normalization.
    substitutions = (
        ("Â", "A"),
        ("–", "-"),
        ("Ç", "C"),
        ("Ë", "E"),
        ("Û", "U"),
        ("ê", "e"),
        ("ô", "o"),
        ('"', ''),
        ("Œ", "oe"),
    )
    cleaned = unicodedata.normalize('NFC', text)
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    tokens = ['<space>' if ch == ' ' else ch for ch in cleaned]
    return ' '.join(tokens)

def resize_image(im,height=128):
    """Resize an image to a fixed height, scaling the width proportionally.

    Args:
        im: an image object exposing `size` as (width, height) and a
            `resize(size, resample)` method (a PIL image in this notebook).
        height: target height in pixels.

    Returns:
        The result of `im.resize`, with LANCZOS resampling.
    """
    src_w, src_h = im.size
    # The scaled width is truncated to an int; for very narrow inputs that
    # would be 0, which is not a valid image dimension, so clamp it to 1.
    new_w = max(1, int(src_w * height / src_h))
    return im.resize((new_w, height), Image.Resampling.LANCZOS)


def prepare_data(dataset,img_prefix,txtname,append=False):
    """Export a dataset split as resized JPEG images plus one label file.

    Each record's image is resized with `resize_image` and saved as
    ../data/images/{img_prefix}_{index:04d}.jpg. One line per record,
    "<file name> <prepare_text(text)>", is written to ../data/text/{txtname}.

    Args:
        dataset: iterable of records with 'image' and 'text' entries.
        img_prefix: prefix of the generated image file names.
        txtname: name of the label file inside ../data/text/.
        append: when True, add lines to an existing label file instead of
            replacing it. The default rewrites the file so that re-running
            the export does not duplicate every line (the images themselves
            are overwritten on each run).
    """
    image_dir = "../data/images/"
    text_dir = "../data/text/"
    # Create the output directories once rather than on every record.
    os.makedirs(image_dir, exist_ok=True)
    os.makedirs(text_dir, exist_ok=True)

    mode = 'a' if append else 'w'
    # Open the label file once for the whole split.
    with open(f"{text_dir}{txtname}", mode, encoding='utf-8') as f:
        for i, d in enumerate(dataset):
            fname = f"{img_prefix}_{i:04d}.jpg"
            resize_image(d['image']).save(f"{image_dir}{fname}")
            f.write(fname+" "+prepare_text(d['text'])+"\n")
            
# Export the training split (img_XXXX.jpg, listed in train.txt) and the
# held-out split (img_val_XXXX.jpg, listed in val.txt).
prepare_data(dataset=dataset['train'], img_prefix="img", txtname="train.txt")
prepare_data(dataset=dataset['test'], img_prefix="img_val", txtname="val.txt")

In [None]:
# List the Git LFS-tracked files of the repository containing the current directory.
!git lfs ls-files
# Clone the Teklia/pylaia-himanis repository from the Hugging Face Hub into ../models.
!cd ../models && git clone https://huggingface.co/Teklia/pylaia-himanis

Cloning into 'pylaia-himanis'...
remote: Enumerating objects: 31, done.[K
remote: Total 31 (delta 0), reused 0 (delta 0), pack-reused 31 (from 1)[K
Unpacking objects: 100% (31/31), 10.84 KiB | 1.55 MiB/s, done.
cp: cannot create regular file '..models/pylaia-himanis/initial_checkpoint.ckpt': No such file or directory


In [12]:
# Duplicate the cloned weights as initial_checkpoint.ckpt, the checkpoint name
# passed to the training command via --common.checkpoint.
!cp ../models/pylaia-himanis/weights.ckpt ../models/pylaia-himanis/initial_checkpoint.ckpt 
# Copy the `model` file from the cloned repository into the current working directory.
!cp ../models/pylaia-himanis/model .

## Test model

In [14]:
# NOTE: this rebinds `ds` to a different dataset than the one loaded at the top
# of the notebook ('ordonnances-typewriter' instead of 'ordonnances-typewriter-5').
ds = load_dataset('sylvain471/ordonnances-typewriter')

In [9]:
# Display the first record of the train split to check its fields.
ds["train"][0]

{'text': 'Ordonnances',
 'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=136x30>,
 'type': 'synthetic'}

In [None]:
# Run pylaia-htr-train-ctc with the fine-tuning config, using ../models/pylaia-himanis
# as the experiment directory and initial_checkpoint.ckpt as the starting checkpoint,
# with --train.pretrain enabled and at most 200 epochs.
!pylaia-htr-train-ctc --config ../config/config_finetune_model.yaml --common.experiment_dirname ../models/pylaia-himanis --common.checkpoint initial_checkpoint.ckpt --train.pretrain true --trainer.max_epochs 200

Global seed set to 74565
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
Restoring states from the checkpoint file at ../models/pylaia-himanis/pretrained/initial_checkpoint_reset.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Restored all states from the checkpoint file at ../models/pylaia-himanis/pretrained/initial_checkpoint_reset.ckpt

  | Name      | Type     | Params
---------------------------------------
0 | model     | LaiaCRNN | 5.4 M 
1 | criterion | CTCLoss  | 0     
---------------------------------------
5.4 M     Trainable params
0         Non-trainable params
5.4 M     Total params
21.424    Total estimated model params size (MB)
Global seed set to 74565                                                        
TR - E0:   5%|▊             | 22/404 [00:03<00:57,  6.68it/s, running_loss=65.4]^C
[2025-04-08 14:02:48,336 CRITICAL laia] Uncaught exception:                     
Traceback (most recent call last):
  File

## Evaluate model

In [None]:
# Export every image of ds['train'] as a resized JPEG into the evaluation
# directory and record the file names, one per line, in the image list that
# the decode configuration points to.
eval_dir="../data/eval"
img_list_path="../img_list/img_list.txt"
os.makedirs(eval_dir,exist_ok=True)
# The list's parent directory must exist before the file can be opened.
os.makedirs(os.path.dirname(img_list_path),exist_ok=True)
# Write mode (not append): re-running the cell must not duplicate entries,
# since the images themselves are overwritten on each run.
with open(img_list_path,"w") as f:
    for i,d in enumerate(ds['train']):
        fname=f"img_{i:04d}.jpg"
        im=resize_image(d["image"])
        im.save(f"{eval_dir}/{fname}")
        f.write(fname+"\n")

In [20]:
# YAML configuration for the decode command: symbol table, image list and image
# directory, model location, and language-model decoding options.
# All paths are relative (../...).
config_decode="""syms: ../models/pylaia-himanis/syms.txt
img_list: ../img_list/img_list.txt
img_dirs:
  - ../data/eval/
common:
  experiment_dirname: ../models/pylaia-himanis/
  model_filename: ../models/pylaia-himanis/model
decode:
  join_string: ""
  convert_spaces: true
  use_language_model: true
  language_model_path: ../models/arpa/typewriter.arpa.gz
  tokens_path: ../models/pylaia-himanis/tokens.txt
  lexicon_path: ../models/pylaia-himanis/lexicon.txt
  language_model_weight: 1.5
trainer:
  gpus: 1
"""  

In [21]:
# Write the decode configuration to the path the decode command reads
# (../config/config_decode_ft_himanis.yaml) rather than to the current
# working directory, creating the directory if needed.
config_path="../config/config_decode_ft_himanis.yaml"
os.makedirs(os.path.dirname(config_path),exist_ok=True)
with open(config_path,"w") as f:
    f.write(config_decode)

In [22]:
# Run pylaia-htr-decode-ctc with the config in ../config and redirect its standard
# output to ../outputs/outputs_ft_himanis.txt (the ../outputs directory must exist).
!pylaia-htr-decode-ctc --config ../config/config_decode_ft_himanis.yaml > ../outputs/outputs_ft_himanis.txt

[2025-04-08 18:05:37,427 INFO laia] Arguments: {'syms': '../models/pylaia-himanis/syms.txt', 'img_list': '../img_list/img_list.txt', 'img_dirs': ['../data/eval/'], 'common': CommonArgs(seed=74565, train_path='', model_filename='../models/pylaia-himanis/model', experiment_dirname='../models/pylaia-himanis/', monitor=<Monitor.va_cer: 'va_cer'>, checkpoint=None), 'data': DataArgs(batch_size=8, color_mode=<ColorMode.L: 'L'>, num_workers=None, reading_order=<ReadingOrder.LTR: 'LTR'>), 'decode': DecodeArgs(include_img_ids=True, separator=' ', join_string='', use_symbols=True, convert_spaces=True, input_space='<space>', output_space=' ', segmentation=None, temperature=1.0, print_line_confidence_scores=False, print_word_confidence_scores=False, use_language_model=True, language_model_path='../models/arpa/typewriter.arpa.gz', language_model_weight=1.5, tokens_path='../models/pylaia-himanis/tokens.txt', lexicon_path='../models/pylaia-himanis/lexicon.txt', unk_token='<unk>', blank_token='<ctc>'),