# Re-order Transkribus line segments

Line segments in the output of the hand-written text recognition software [Transkribus](https://readcoop.eu/transkribus/) are not always placed in the right order [1,2]. A typical problem is that Transkribus splits a line and then puts the right part in front of the left part because the start of the right part is a few pixels higher than the left part. This notebook reorders the lines by assuming that the height of a line is 20 pixels rather than zero: a right line segment will only be put in front of a left line segment if it is higher and if there is no considerable vertical overlap between the two segments. In case there is considerable overlap between the segments, they will be placed based on their horizontal position. The notebook reads a Transkribus XML file and outputs a modified version of the file.

Example of the problem: in our data file "O.R. 1880 Stad 129.JPG" the two phrases "van beroep schoenmaker" and "oud vierenveertig jaren" appear on the same line but Transkribus puts them in the wrong order.

**References**

1. Lisa Hoek, [Extracting Entities from Handwritten Civil Records using HTR and RegExes](https://www.ru.nl/publish/pages/769526/lisa_hoek.pdf). Master’s thesis, Radboud University Nijmegen, 2023, sections 5.3, 5.4.4 and 9.1.
2. Erik Tjong Kim Sang, [REE-HDSC: Recognizing Extracted Entities for the Historical Database Suriname Curacao](https://ifarm.nl/erikt/papers/ree-hdsc-2023.pdf). Technical Report, Netherlands eScience Center, 2023, section 7.

## 1. Check the line segment order in Transkribus files

In [None]:
import os
import regex
import sys
from termcolor import colored
import xml.etree.ElementTree as ET

In [None]:
# Directory holding the Transkribus XML files that the cells below read.
# Alternative data set: "tmp/1609526/Training_set_2/page"
data_dir = "tmp/1636002/O_R__1879-1884/page"

In [None]:
def coordinates2rectangle(coords, top_bonus=0):
    """Return the bounding box of a string of ``x,y`` points.

    Args:
        coords: whitespace-separated points, each written as ``x,y`` with
            integer coordinates.
        top_bonus: value added to the top edge of the box before returning.

    Returns:
        A tuple ``(left, right, top + top_bonus, bottom)``. The extremes are
        seeded with ``sys.maxsize`` (left, top) and ``0`` (right, bottom), so
        an empty string yields exactly those seeds.
    """
    x_values, y_values = [], []
    for point in coords.split():
        x_text, y_text = point.split(",")
        x_values.append(int(x_text))
        y_values.append(int(y_text))
    # The seed values take part in the comparison, just like the running
    # extremes of a hand-written scan would.
    left = min([sys.maxsize, *x_values])
    right = max([0, *x_values])
    top = min([sys.maxsize, *y_values])
    bottom = max([0, *y_values])
    return left, right, top + top_bonus, bottom

In [None]:
def get_reading_order_index(tag):
    """Return the reading-order index stored in a tag's ``custom`` attribute.

    The attribute value is split on whitespace; the token that follows
    ``readingOrder`` is split on ``:`` and ``;`` and its second field is
    converted to an integer, e.g. ``readingOrder {index:3;}`` gives ``3``.

    Args:
        tag: object with an ``attrib`` mapping, such as an ElementTree element.

    Returns:
        The index as an ``int``, or ``None`` (after printing a diagnostic)
        when the attribute is missing or contains no parsable reading order.
    """
    # .get() keeps a missing attribute from raising in the diagnostic below.
    custom = tag.attrib.get("custom")
    if custom is not None:
        tokens = custom.split()
        for previous, token in zip(tokens, tokens[1:]):
            if previous == "readingOrder":
                try:
                    # Splitting on ":" after mapping ";" to ":" is the same as
                    # splitting on either character.
                    return int(token.replace(";", ":").split(":")[1])
                except (IndexError, ValueError):
                    break
    print(f"get_reading_order: unexpected tag: {custom}")
    return None

In [None]:
def vertical_overlap_fraction(rectangle_1_top, rectangle_1_bottom, rectangle_2_top, rectangle_2_bottom):
    """Return how much two vertical spans overlap, relative to their heights.

    The shared height (lowest bottom minus highest top) is divided by each
    span's own height and the smaller quotient is returned. The shared height
    is not clamped, so spans that do not touch give a negative result.

    Returns:
        ``0`` when either span has zero height, otherwise the smaller of the
        two overlap ratios.
    """
    height_1 = rectangle_1_bottom - rectangle_1_top
    height_2 = rectangle_2_bottom - rectangle_2_top
    if 0 in (height_1, height_2):
        return 0
    shared_top = max(rectangle_1_top, rectangle_2_top)
    shared_bottom = min(rectangle_1_bottom, rectangle_2_bottom)
    shared_height = shared_bottom - shared_top
    return min(shared_height / height_1, shared_height / height_2)

In [None]:
def rectangle_1_is_before_rectangle_2(rectangle_1, rectangle_2, debug=False, overlap_threshold=0.6):
    """Decide whether rectangle 1 comes before rectangle 2.

    Both rectangles are ``(left, right, top, bottom)`` tuples. The rules are
    tried in this order (the number is the id printed when ``debug`` is set):

    1. rectangle 2 starts below the bottom of rectangle 1 -> True
    2. rectangle 1 starts below the bottom of rectangle 2 -> False

    When the vertical overlap fraction is below ``overlap_threshold`` the
    vertical positions decide:

    5. rectangle 2 has both a lower top and a lower bottom -> True
    6. rectangle 1 has both a lower top and a lower bottom -> False

    Otherwise the horizontal positions decide:

    3. rectangle 2 starts right of the right edge of rectangle 1 -> True
    4. rectangle 1 starts right of the right edge of rectangle 2 -> False
    7. rectangle 2 has both edges further right -> True
    8. rectangle 1 has both edges further right -> False

    Args:
        rectangle_1: first rectangle.
        rectangle_2: second rectangle.
        debug: print the id of the rule that produced the answer.
        overlap_threshold: overlap fraction from which the rectangles are
            compared horizontally instead of vertically.

    Returns:
        True, False, or None when no rule applies (all comparisons are
        strict, so e.g. nested or edge-aligned rectangles stay undecided).
    """
    left_1, right_1, top_1, bottom_1 = rectangle_1
    left_2, right_2, top_2, bottom_2 = rectangle_2

    def decide(rule_id, verdict):
        # Single place for the optional trace so every rule has its own id.
        if debug:
            print(rule_id)
        return verdict

    if top_2 > bottom_1:
        return decide(1, True)
    if top_1 > bottom_2:
        return decide(2, False)
    if vertical_overlap_fraction(top_1, bottom_1, top_2, bottom_2) < overlap_threshold:
        if bottom_2 > bottom_1 and top_2 > top_1:
            return decide(5, True)
        if bottom_1 > bottom_2 and top_1 > top_2:
            return decide(6, False)
    else:
        if left_2 > right_1:
            return decide(3, True)
        if left_1 > right_2:
            return decide(4, False)
        if left_2 > left_1 and right_2 > right_1:
            return decide(7, True)
        if left_1 > left_2 and right_1 > right_2:
            # Previously also reported as 7, which made the trace ambiguous.
            return decide(8, False)

    return None

In [None]:
def get_text_from_xml(tag):
    """Return the text of all ``Unicode`` descendants of *tag*, space-separated.

    Elements without text (``.text`` is ``None`` or empty) are skipped, so a
    tag with only empty ``Unicode`` elements yields ``""`` rather than the
    literal string ``"None"``.

    Args:
        tag: element offering ``findall``; ``Unicode`` elements are matched
            in any namespace.

    Returns:
        The non-empty text fragments joined by single spaces.
    """
    fragments = (unicode_tag.text for unicode_tag in tag.findall(".//{*}Unicode"))
    return " ".join(fragment for fragment in fragments if fragment)

In [None]:
def process_file(file_name, show_all=False):
    """Print the text lines of one XML file that appear to be out of order.

    The file is read from the module-level ``data_dir``. Within every
    ``TextRegion``, the bounding box of each ``TextLine`` baseline (with its
    top raised by 20 via ``top_bonus=-20``) is compared with the box of the
    previous baseline in the same region; the first one is compared with
    ``(0, 0, 0, 0)``.

    A line is printed when ``show_all`` is set, or when all of these hold:
    the previous box is not confirmed to come first (the check returned
    False or None), the line's reading-order index lies in the first three
    quarters of the region's lines, and both the current and the previous
    text are non-empty.

    Args:
        file_name: name of the XML file inside ``data_dir``.
        show_all: print every line instead of only the suspicious ones.

    Returns:
        None; the result is the printed report.
    """
    tree = ET.parse(os.path.join(data_dir, file_name))
    for textregion in tree.getroot().findall(".//{*}TextRegion"):
        last_rectangle = (0, 0, 0, 0)
        last_text = ""
        textline_list = textregion.findall("./{*}TextLine")
        for textline in textline_list:
            reading_order_index = get_reading_order_index(textline)
            for baseline in textline.findall("./{*}Baseline"):
                rectangle = coordinates2rectangle(baseline.attrib["points"], top_bonus=-20)
                order_check_result = rectangle_1_is_before_rectangle_2(last_rectangle, rectangle, debug=False)
                text = get_text_from_xml(textline)
                # A line without a readable index (None) is still reported
                # instead of crashing the position filter with a TypeError.
                is_suspicious = (
                    not order_check_result
                    and (reading_order_index is None or reading_order_index / len(textline_list) < 0.75)
                    and text != ""
                    and last_text != ""
                )
                if show_all or is_suspicious:
                    print(file_name, reading_order_index, len(textline_list), colored(order_check_result, "black", attrs=["bold"]), rectangle, text, f"({last_text})")
                last_rectangle = rectangle
                last_text = text

In [None]:
# Check every XML file in the data directory. The names are sorted because
# os.listdir returns them in arbitrary order, which made the report order
# vary between runs; only names with a real ".xml" extension are accepted.
for file_name in sorted(os.listdir(data_dir)):
    if file_name.endswith(".xml"):
        process_file(file_name)

In [None]:
# Print every line segment of a single file (show_all=True disables the
# filter that normally hides segments judged to be in the right order).
process_file("O.R. 1880 Stad 120.xml", show_all=True)