# HTR Training

Combine different sets of documents, train a handwritten text recognition (HTR) model on Transkribus, and evaluate the results.

In [None]:
import os
import pandas as pd
import regex
import xml.etree.ElementTree as ET
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
from IPython.display import clear_output

In [None]:
# Directories that are processed by the annotation functions; each one is
# expected to hold a "page" subdirectory (see the os.listdir calls below).
DATA_DIRS = [
    "tmp/1221135/Trainingssample_",
    "tmp/1339924/Training_extra",
    "tmp/1342783/Validation_set",
    "tmp/1339925/Test_set",
    "tmp/1339926/Validation_extra",
    "tmp/1480298/Sample_known_V2(verbeterd)",
    "tmp/1372935/Sample_regex",
    "tmp/1616639/Sample_test_1",
    "tmp/1616742/Sample_three-column",
]
# One CSV file per data directory, named after the last path component.
YEAR_FILES = [f"{os.path.basename(data_dir)}_years.csv" for data_dir in DATA_DIRS]
USAGE_FILES = [f"{os.path.basename(data_dir)}_usage.csv" for data_dir in DATA_DIRS]

In [None]:
def collect_years_of_files(data_dirs=DATA_DIRS, year_files=YEAR_FILES):
    """Interactively record the year of every scan in each data directory.

    For each directory the file names in its "page" subdirectory are listed
    in sorted order. Years already stored in the matching year file are
    kept; for every remaining file the scan image is displayed and a year is
    read from standard input. An empty answer repeats the previous year.
    Newly entered years are written back to the year file, also when the
    session is interrupted halfway through a directory.

    Args:
        data_dirs: Directories holding the scan images and a "page"
            subdirectory.
        year_files: CSV files with the years (column "0"), paired with
            data_dirs by position.
    """
    for data_dir_id, data_dir in enumerate(data_dirs):
        year_file = year_files[data_dir_id]
        file_names = sorted(os.listdir(os.path.join(data_dir, "page")))
        try:
            years = list(pd.read_csv(year_file, index_col=0)["0"])
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # Nothing has been annotated for this directory yet. Any other
            # read problem is raised so a damaged file is not overwritten.
            years = []
        # Resume with the last stored year, so that an empty answer still
        # means "same year as the previous file" after a restart.
        last_year = years[-1] if years else ""
        years_changed = False
        try:
            for file_counter, file_name in enumerate(file_names, start=1):
                if file_counter <= len(years):
                    # This file already has a year from an earlier session.
                    continue
                # Swap the extension instead of pattern-replacing ".xml",
                # which also matched e.g. "_xml" in the middle of a name.
                image_stem = os.path.splitext(file_name)[0]
                try:
                    display(Image.open(os.path.join(data_dir, image_stem + ".jpg")))
                except FileNotFoundError:
                    display(Image.open(os.path.join(data_dir, image_stem + ".JPG")))
                print(f"data_dir: {data_dir}; file name: {file_name}; last year: {last_year};", end=" ")
                print(f"file: {file_counter}/{len(file_names)}")
                year = input().strip()
                if year == "":
                    year = last_year
                years.append(year)
                years_changed = True
                last_year = year
                clear_output(wait=True)
        finally:
            # Save in a finally clause so that years entered before an
            # interruption or a missing image are not lost.
            if years_changed:
                print(f"saving file {year_file}...")
                pd.DataFrame(years).to_csv(year_file)

In [None]:
def check_file_usability(data_dirs=DATA_DIRS, usage_files=USAGE_FILES):
    """Interactively mark every scan in each data directory as usable or not.

    For each directory the file names in its "page" subdirectory are listed
    in sorted order. Labels already stored in the matching usage file are
    kept; for every remaining file the scan image is displayed and an answer
    is read from standard input. An empty answer is stored as "yes", any
    other answer as "no". The usage file is rewritten after every answer.

    Args:
        data_dirs: Directories holding the scan images and a "page"
            subdirectory.
        usage_files: CSV files with the labels (column "0"), paired with
            data_dirs by position.
    """
    for data_dir_id, data_dir in enumerate(data_dirs):
        usage_file = usage_files[data_dir_id]
        file_names = sorted(os.listdir(os.path.join(data_dir, "page")))
        try:
            usage = list(pd.read_csv(usage_file, index_col=0)["0"])
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # Nothing has been annotated for this directory yet. Any other
            # read problem is raised so a damaged file is not overwritten.
            usage = []
        for file_counter, file_name in enumerate(file_names, start=1):
            if file_counter <= len(usage):
                # This file already has a label from an earlier session.
                continue
            # Swap the extension instead of pattern-replacing ".xml", which
            # also matched e.g. "_xml" in the middle of a name.
            image_stem = os.path.splitext(file_name)[0]
            try:
                display(Image.open(os.path.join(data_dir, image_stem + ".jpg")))
            except FileNotFoundError:
                display(Image.open(os.path.join(data_dir, image_stem + ".JPG")))
            print(f"data_dir: {data_dir}; file name: {file_name};", end=" ")
            print(f"file: {file_counter}/{len(file_names)}")
            answer = input().strip()
            # Pressing Enter accepts the file; typing anything rejects it.
            usage_value = "yes" if answer == "" else "no"
            usage.append(usage_value)
            pd.DataFrame(usage).to_csv(usage_file)
            clear_output(wait=True)

In [None]:
def year2decade(year):
    """Return the decade of a year as the year without its last digit.

    For example 1845 becomes 184. The year may be an integer, a float with
    an integral value (as pandas produces for columns with blanks) or the
    text form of either.

    Args:
        year: The year to convert.

    Returns:
        The year divided by ten, rounded down, as an int.

    Raises:
        ValueError: If the year is not numeric (for example empty or NaN).
    """
    # Going through str() keeps ValueError as the only failure mode;
    # float() accepts both "1845" and "1845.0".
    return int(float(str(year))) // 10

In [None]:
def get_decades(year_files=YEAR_FILES):
    """Return the decades of all annotated files, in year-file order.

    Args:
        year_files: CSV files whose column "0" holds one year per file.

    Returns:
        A list with the result of year2decade for every stored year.
    """
    collected = []
    for year_file in year_files:
        year_column = pd.read_csv(year_file, index_col=0)["0"]
        collected.extend(year2decade(stored_year) for stored_year in year_column)
    return collected

In [None]:
def get_file_usage(usage_files=USAGE_FILES):
    """Return the usage labels of all annotated files, in usage-file order.

    Args:
        usage_files: CSV files whose column "0" holds one label per file
            (the "yes"/"no" values written by check_file_usability).

    Returns:
        A single list with the labels of all usage files concatenated.
    """
    usage = []
    for file_name in usage_files:
        usage.extend(pd.read_csv(file_name, index_col=0)["0"])
    return usage

In [None]:
def plot_file_counts(file_counts):
    """Draw a bar chart with the number of scans per decade.

    Args:
        file_counts: Counts whose index entries are sequences with the
            decade as first element, such as the result of
            DataFrame.value_counts().
    """
    decades = [index_entry[0] for index_entry in file_counts.index]
    counts = list(file_counts.values)
    # A decade is a year without its last digit, so append "0" for display.
    tick_labels = [f"{decade}0" for decade in decades]
    total = sum(file_counts.values)
    plt.xticks(ticks=decades, labels=tick_labels)
    plt.title(f"Number of scans per decade (total={total})")
    plt.bar(decades, counts)

In [None]:
# Interactive step: shows each scan without a stored year and reads the year
# from standard input.
collect_years_of_files()

In [None]:
# Interactive step: shows each scan without a stored label; pressing Enter
# stores "yes", any other answer stores "no".
check_file_usability()

In [None]:
NBR_OF_SKIPPED_FILES = 50 # while annotation is not finished

# Leave out the last NBR_OF_SKIPPED_FILES entries of both lists. The end
# index is computed explicitly because a slice [:-0] would return an empty
# list once the constant is set to 0.
decades = get_decades()
decades = decades[:max(0, len(decades) - NBR_OF_SKIPPED_FILES)]
usage = get_file_usage()
usage = usage[:max(0, len(usage) - NBR_OF_SKIPPED_FILES)]
# Keep only the decades of files marked as usable; zip pairs the two lists
# by position and stops at the shorter one.
decades = [decade for decade, usage_value in zip(decades, usage) if usage_value == "yes"]

In [None]:
# Count how often each decade occurs among the usable files and plot the
# counts as a bar chart.
plot_file_counts(pd.DataFrame(decades).value_counts())