# HTR Training

Combine different sets of documents, train HTR on Transkribus and evaluate the results.

In [None]:
import os
import pandas as pd
import random
import regex
import shutil
import sys
import xml.etree.ElementTree as ET
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
from IPython.display import clear_output
sys.path.append(os.getcwd() + '/..')
from scripts.read_transkribus_files import get_text_from_file

## 1. Label and check available training data

In [None]:
# Directories with the document sets used in this notebook. The commented-out
# entries are other document sets that are currently not selected.
DATA_DIRS = [
    # "tmp/1221135/Trainingssample_",
    # "tmp/1339924/Training_extra",
    # "tmp/1342783/Validation_set",
    # "tmp/1339925/Test_set",
    # "tmp/1339926/Validation_extra",
    # "tmp/1480298/Sample_known_V2(verbeterd)",
    # "tmp/1372935/Sample_regex",
    # "tmp/1616639/Sample_test_1",
    "tmp/1586842/Training_set",
    "tmp/1586854/Validation_set",
    "tmp/1609526/Training_set_2",
    "tmp/1609530/Validation_set_2",
    "tmp/1616742/Sample_three-column",
]

# One year file and one usage file per data directory, named after the last
# path component of the directory and located in the working directory.
_DOCUMENT_NAMES = [os.path.basename(data_dir) for data_dir in DATA_DIRS]
YEAR_FILES = [f"{document_name}_years.csv" for document_name in _DOCUMENT_NAMES]
USAGE_FILES = [f"{document_name}_usage.csv" for document_name in _DOCUMENT_NAMES]

# Fixed seed for the random scan selection further down in the notebook.
random.seed(42)

In [None]:
def collect_years_of_files(data_dirs=DATA_DIRS, year_files=YEAR_FILES):
    """Interactively label every scan in each data directory with a year.

    For each data directory the files in its ``page`` subdirectory are
    processed in sorted order. Years entered earlier are loaded from the
    directory's year file, so an interrupted session resumes at the first
    unlabelled scan. For every unlabelled scan the image is displayed and a
    year is read with ``input()``; empty input repeats the previous year.
    The year file is rewritten after every answer.

    Args:
        data_dirs: directories holding the scan images (``.jpg`` or ``.JPG``)
            and a ``page`` subdirectory with one file per scan.
        year_files: CSV files in which the years are stored, parallel to
            ``data_dirs``.
    """
    for data_dir, year_file in zip(data_dirs, year_files):
        file_names = sorted(os.listdir(os.path.join(data_dir, "page")))
        try:
            years = list(pd.read_csv(year_file, index_col=0)["0"])
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # Nothing stored yet for this directory: start from scratch.
            years = []
        # When resuming, empty input must repeat the last stored year instead
        # of recording an empty value.
        last_year = years[-1] if years else ""
        for file_counter, file_name in enumerate(file_names, start=1):
            if file_counter <= len(years):
                continue  # already labelled in an earlier session
            # The image has the same base name as the page file.
            image_base = os.path.join(data_dir, os.path.splitext(file_name)[0])
            try:
                image = Image.open(image_base + ".jpg")
            except FileNotFoundError:
                image = Image.open(image_base + ".JPG")
            display(image)
            print(f"data_dir: {data_dir}; file name: {file_name}; last year: {last_year};", end=" ")
            print(f"file: {file_counter}/{len(file_names)}")
            year = input().strip()
            if year == "":
                year = last_year
            years.append(year)
            # Save after every answer so no work is lost on interruption.
            pd.DataFrame(years).to_csv(year_file)
            last_year = year
            clear_output(wait=True)

In [None]:
def check_file_usability(data_dirs=DATA_DIRS, usage_files=USAGE_FILES):
    """Interactively mark every scan in each data directory as usable or not.

    For each data directory the files in its ``page`` subdirectory are
    processed in sorted order. Answers given earlier are loaded from the
    directory's usage file, so an interrupted session resumes at the first
    unchecked scan. For every unchecked scan the image is displayed and an
    answer is read with ``input()``: empty input stores ``"yes"``, any other
    input stores ``"no"``. The usage file is rewritten after every answer.

    Args:
        data_dirs: directories holding the scan images (``.jpg`` or ``.JPG``)
            and a ``page`` subdirectory with one file per scan.
        usage_files: CSV files in which the answers are stored, parallel to
            ``data_dirs``.
    """
    for data_dir, usage_file in zip(data_dirs, usage_files):
        file_names = sorted(os.listdir(os.path.join(data_dir, "page")))
        try:
            usage = list(pd.read_csv(usage_file, index_col=0)["0"])
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # Nothing stored yet for this directory: start from scratch.
            usage = []
        for file_counter, file_name in enumerate(file_names, start=1):
            if file_counter <= len(usage):
                continue  # already checked in an earlier session
            # The image has the same base name as the page file.
            image_base = os.path.join(data_dir, os.path.splitext(file_name)[0])
            try:
                image = Image.open(image_base + ".jpg")
            except FileNotFoundError:
                image = Image.open(image_base + ".JPG")
            display(image)
            print(f"data_dir: {data_dir}; file name: {file_name};", end=" ")
            print(f"file: {file_counter}/{len(file_names)}")
            # Pressing enter accepts the scan; typing anything rejects it.
            usage_value = "yes" if input().strip() == "" else "no"
            usage.append(usage_value)
            # Save after every answer so no work is lost on interruption.
            pd.DataFrame(usage).to_csv(usage_file)
            clear_output(wait=True)

In [None]:
def year2decade(year):
    """Return the decade of ``year`` as the year without its last digit.

    For example 1871 becomes 187. The year may be an int, a numeric string
    or a float; floats occur when pandas reads a year column that contains
    a missing value.

    Raises:
        ValueError: if ``year`` cannot be interpreted as a number (including
            NaN).
    """
    return int(float(year)) // 10

In [None]:
def get_years(year_files=YEAR_FILES):
    """Return the years stored in all ``year_files`` as one flat list.

    Every file is read with pandas, using its first column as index, and the
    values of its column ``"0"`` are collected in file order.
    """
    return [
        year
        for year_file in year_files
        for year in pd.read_csv(year_file, index_col=0)["0"]
    ]

In [None]:
def get_decades(year_files=YEAR_FILES):
    """Return the decade (see ``year2decade``) of every year in ``year_files``.

    Args:
        year_files: CSV files with stored years; passed on to ``get_years``.
    """
    # Forward the argument; the global YEAR_FILES is only the default.
    return [year2decade(year) for year in get_years(year_files=year_files)]

In [None]:
def get_file_usage(usage_files=USAGE_FILES):
    """Return the usage values stored in all ``usage_files`` as one flat list.

    Every file is read with pandas, using its first column as index, and the
    values of its column ``"0"`` are collected in file order. No entries are
    skipped here; trimming is left to the caller.
    """
    usage = []
    for file_name in usage_files:
        usage.extend(pd.read_csv(file_name, index_col=0)["0"])
    return usage

In [None]:
def plot_file_counts(file_counts):
    """Draw a bar chart of the number of scans per decade.

    Args:
        file_counts: counts whose index entries are sequences with the
            decade (a year without its last digit) as first element, such as
            the result of ``DataFrame.value_counts()``.
    """
    decades = [index_entry[0] for index_entry in file_counts.index]
    counts = list(file_counts.values)
    # Append "0" so a decade such as 187 is shown as "1870".
    tick_labels = [str(decade) + "0" for decade in decades]
    plt.xticks(ticks=decades, labels=tick_labels)
    plt.title(f"Number of scans per decade (total={sum(file_counts.values)})")
    plt.bar(decades, counts)

In [None]:
# Interactive step: shows every scan that has no stored year yet and asks for
# its year via input(); answers are saved to the year files.
collect_years_of_files()

In [None]:
# Interactive step: shows every scan that has no stored usage value yet;
# pressing enter stores "yes", any other input stores "no".
check_file_usability()

In [None]:
NBR_OF_SKIPPED_FILES = 50 # while annotation is not finished

# Drop the last NBR_OF_SKIPPED_FILES entries. The end index is computed
# explicitly because a plain [:-0] slice would return an empty list once the
# constant is set to 0.
decades = get_decades()
decades = decades[:max(len(decades) - NBR_OF_SKIPPED_FILES, 0)]
usage = get_file_usage()
usage = usage[:max(len(usage) - NBR_OF_SKIPPED_FILES, 0)]
# Keep only the decades of scans marked as usable; both lists are aligned by
# position.
decades = [decade for decade, used in zip(decades, usage) if used == "yes"]

In [None]:
# Count the usable scans per decade and plot the counts. DataFrame.value_counts()
# indexes its result by tuples, which plot_file_counts unpacks.
plot_file_counts(pd.DataFrame(decades).value_counts())

## 2. Select complementary scans

Plan:

1. find years
2. remove difficult cases
3. fill up decades to 15
4. sort by name
5. train layout and baseline detection
6. train HTR, check CER (character error rate)
7. evaluate names

In [None]:
FILE_DIR = "../../data/Overlijden"

def random_select_scan(year, file_dir=None):
    """Pick a random scan file of the given year.

    Scans are looked up in ``<file_dir>/O.R. <year>/<region>/``.

    Args:
        year: year whose directory is searched.
        file_dir: root directory containing the ``O.R. <year>`` directories.
            Defaults to the module-level ``FILE_DIR``, resolved at call time.

    Returns:
        The path of the selected file relative to the year directory, i.e.
        ``<region>/<file name>``.

    Raises:
        ValueError: if the year directory contains no scan files.
    """
    if file_dir is None:
        file_dir = FILE_DIR
    year_dir = os.path.join(file_dir, "O.R. " + str(year))
    file_names = []
    # os.listdir returns entries in arbitrary order; sorting makes the
    # selection reproducible for a fixed random seed.
    for region_dir in sorted(os.listdir(year_dir)):
        region_path = os.path.join(year_dir, region_dir)
        if not os.path.isdir(region_path):
            continue  # ignore stray files next to the region directories
        file_names.extend(
            os.path.join(region_dir, file_name)
            for file_name in sorted(os.listdir(region_path))
        )
    if not file_names:
        raise ValueError(f"no scan files found in {year_dir}")
    return random.choice(file_names)

In [None]:
def get_years_from_dir(dir_name):
    """Return the year found in the name of every entry of ``dir_name``.

    The year is the second whitespace-separated token of the entry name,
    converted to int. Entries are returned in ``os.listdir`` order.
    """
    return [int(entry.split()[1]) for entry in os.listdir(dir_name)]

In [None]:
# Drop the last NBR_OF_SKIPPED_FILES years. The end index is computed
# explicitly because a plain [:-0] slice would return an empty list once the
# constant is set to 0.
years = get_years()
years = years[:max(len(years) - NBR_OF_SKIPPED_FILES, 0)]
# Keep only the years of scans marked as usable; both lists are aligned by
# position.
years = [year for year, used in zip(years, usage) if used == "yes"]

In [None]:
TARGET_DIR = os.path.join(FILE_DIR, "x-samples", "complementary-2023")

# Years of the files that are already present in TARGET_DIR.
target_dir_years = get_years_from_dir(TARGET_DIR)
# NOTE(review): presumably the decades that need extra scans - confirm.
for decade_start in (1860, 1870, 1910, 1920, 1940):
    decade_end = decade_start + 10
    # Distinct years of this decade that are already covered by usable scans.
    available_years = sorted({y for y in years if decade_start <= y < decade_end})
    for year in range(decade_start, decade_end):
        # Propose random scans of an uncovered year until one is accepted
        # by pressing enter; any other input requests another proposal.
        while not (year in available_years or year in target_dir_years):
            file_name = os.path.join(FILE_DIR, f"O.R. {year}", random_select_scan(year))
            display(Image.open(file_name))
            print("accept this file?")
            accept_value = input().strip()
            if not accept_value:
                shutil.copy(file_name, TARGET_DIR)
                available_years.append(year)
            clear_output(wait=True)

In [None]:
# Manual check: show which scan gets picked for a single year.
random_select_scan(1871)

In [None]:
# Read page p001.xml of the first data directory with the project helper.
# NOTE(review): presumably returns the page text, its metadata and its
# regions, as the variable names suggest - confirm in
# scripts/read_transkribus_files.
text, meta_data, file_regions = get_text_from_file(os.path.join(DATA_DIRS[0], "page", "p001.xml"))

In [None]:
# Print the line that directly follows the first line starting with "Heden".
# Nothing is printed when no such line exists or when it is the last line.
saw_trigger_word = False
for line in text.split("\n"):
    if not saw_trigger_word:
        # Still searching: remember whether this line is the trigger line.
        saw_trigger_word = regex.search("^Heden", line) is not None
        continue
    print(line)
    break