# Layout analysis

Compare layout recognition models trained in Transkribus: can they identify two-column and three-column texts? 

## 1. Aggregated text region visualization

In [None]:
import matplotlib.pyplot as plt
import os
import regex
import sys
sys.path.append(os.getcwd() + '/..')
from scripts import read_transkribus_files

In [None]:
# Input directory of the sample to analyse. Keep exactly one assignment active;
# the commented-out lines below select the other samples (the trailing comments
# name the report figure each sample corresponds to).
data_dir = "../../data/Overlijden/x-samples/first-p2pala/page" # ETKS 2023 report Figure 4
# data_dir = "../../data/Overlijden/x-samples/three-columns-100-p2pala-2/page" # ETKS 2023 report Figure 5 left
# data_dir = "../../data/Overlijden/x-samples/three-columns-100-p2pala-3/page" # ETKS 2023 report Figure 5 right

# read_files returns three values; of these only textregions is used in the cells below.
# NOTE(review): textregions is used as a mapping from a file id to a list of
# indexable regions — confirm against scripts/read_transkribus_files.
texts, metadata, textregions = read_transkribus_files.read_files(data_dir)

In [None]:
def visualize_textregions(textregions, title="Positions of textregions on a page"):
    """Draw the outline of every textregion of every file in a single plot.

    Args:
        textregions: mapping from a file id to a list of regions. Each region
            is read by index; ``None`` entries are skipped.
            NOTE(review): the indices look like (x_min, x_max, y_min, y_max) —
            confirm against scripts/read_transkribus_files.
        title: title shown above the plot.

    The figure is written to "image.png" and then shown.
    """
    for regions in textregions.values():
        for region in regions:
            if region is None:
                continue
            first_x, second_x = region[0], region[1]
            first_y, second_y = region[2], region[3]
            # The y values are negated (anchor at -first_y, height
            # first_y - second_y), which mirrors the drawing vertically.
            outline = plt.Rectangle(
                (first_x, -first_y),
                second_x - first_x,
                first_y - second_y,
                edgecolor="black",
                facecolor="none",
                linewidth=1,
            )
            plt.gca().add_patch(outline)
    plt.axis("scaled")
    plt.title(title)
    plt.savefig("image.png")
    plt.show()

In [None]:
def get_integer_ticks(ticks):
    """Return the ticks whose value is a whole number, in their original order."""

    def is_whole(value):
        # A tick is whole when truncating it to int does not change its value.
        return value == int(value)

    return list(filter(is_whole, ticks))

In [None]:
def visualize_textregions_frequencies(textregions, title="Number of regions per text"):
    """Plot a bar chart of how many files have each number of textregions.

    Args:
        textregions: mapping from a file id to a list of regions; only the
            length of each list is used.
        title: title shown above the plot.

    The figure is written to "image.png" and then shown.
    """
    files_per_count = {}
    for regions in textregions.values():
        region_count = len(regions)
        files_per_count[region_count] = files_per_count.get(region_count, 0) + 1
    plt.bar(files_per_count.keys(), files_per_count.values())
    plt.title(title)
    # Region counts are whole numbers, so drop the fractional ticks matplotlib proposed.
    proposed_ticks = plt.xticks()[0]
    plt.xticks(get_integer_ticks(proposed_ticks))
    plt.savefig("image.png")
    plt.show()

In [None]:
# Split the files by comparing their id, as a string, with "1869": ids sorting
# before it go to the three-column set, ids sorting after it to the two-column set.
# NOTE(review): presumably the ids start with a year — confirm against the data.
three_columns_data = {
    file_id: regions
    for file_id, regions in textregions.items()
    if file_id < "1869"
}
two_columns_data = {
    file_id: regions
    for file_id, regions in textregions.items()
    if file_id > "1869"
}

print(f"textregions: {len(textregions)}; three_columns_data: {len(three_columns_data)}; two_columns_data: {len(two_columns_data)}")

In [None]:
# Plot the two layouts separately only when both subsets contain files;
# otherwise draw everything in one plot.
if three_columns_data and two_columns_data:
    visualize_textregions(two_columns_data, title="Positions of identified textregions on two-column page")
    visualize_textregions(three_columns_data, title="Positions of identified textregions on three-column page")
else:
    visualize_textregions(textregions)

## 2. Check positions of largest text regions

In [None]:
def get_largest_textregion(textregions, right_margin_only=False, *, right_margin_min_x=1100):
    """Return the textregion with the largest surface, or None if there is none.

    The surface of a region is computed as
    ``(region[1] - region[0]) * (region[3] - region[2])``.
    NOTE(review): the indices look like (x_min, x_max, y_min, y_max) —
    confirm against scripts/read_transkribus_files.

    Args:
        textregions: iterable of indexable regions.
        right_margin_only: when true, only regions whose ``region[0]`` is at
            least ``right_margin_min_x`` are considered.
        right_margin_min_x: smallest ``region[0]`` that counts as being in the
            right margin. Defaults to 1100, the value that used to be
            hard-coded, so existing callers are unaffected.

    Returns:
        The first region with the strictly largest surface. ``None`` when no
        considered region has a surface greater than zero.
    """
    largest_surface = 0
    largest_textregion = None
    for textregion in textregions:
        if not right_margin_only or textregion[0] >= right_margin_min_x:
            surface = (textregion[1] - textregion[0]) * (textregion[3] - textregion[2])
            # Strict comparison: on ties the earliest region wins.
            if surface > largest_surface:
                largest_surface = surface
                largest_textregion = textregion
    return largest_textregion

In [None]:
# For the "first-p2pala" sample only the three-column subset is checked;
# for every other sample all files are used.
if regex.search("first-p2pala", data_dir):
    target_data = three_columns_data
else:
    target_data = textregions

# Each value is a one-element list so the dict can be passed to visualize_textregions.
largest_textregions = { textregion_id: [ get_largest_textregion(target_data[textregion_id]) ] for textregion_id in target_data }
# get_largest_textregion returns None for a text without a usable region.
# Leave those out of both counts: indexing None would raise a TypeError, and
# the one-element list itself is never None.
found_largest_textregions = [ regions[0] for regions in largest_textregions.values() if regions[0] is not None ]
textregion_count = len([ True for textregion in found_largest_textregions if textregion[1] < 1300 ])
textregion_total = len(found_largest_textregions)
print(f"{textregion_count} of {textregion_total} textregions have space to the right for a margin text") 

In [None]:
# Draw only the largest region of each text; None entries are skipped by visualize_textregions.
visualize_textregions(largest_textregions, title="Positions of largest textregions on a page")

In [None]:
# Largest region per text among the regions accepted by right_margin_only=True;
# the single list element is None when a text has no such region.
largest_textregions_in_right_margin = {
    text_id: [ get_largest_textregion(regions, right_margin_only=True) ]
    for text_id, regions in target_data.items()
}
textregion_count = sum(
    1
    for regions in largest_textregions_in_right_margin.values()
    if regions[0] is not None
)
print(f"Found {textregion_count} certificates with a right margin")

In [None]:
# Draw the largest right-margin region of each text; None entries are skipped by visualize_textregions.
visualize_textregions(largest_textregions_in_right_margin, title="Positions of largest textregions in right margin")

## 3. Single certificate text region visualization

In [None]:
def get_id_of_text_with_most_textregions(textregions):
    """Return the id of the text that has the most textregions.

    Args:
        textregions: mapping from a text id to a list of regions.

    Returns:
        The id with the longest region list; on ties, the id that comes first
        in the mapping.

    Raises:
        IndexError: if ``textregions`` is empty.
    """
    # sorted() is stable, also with reverse=True, so equal counts keep mapping order.
    ids_by_region_count = sorted(
        textregions,
        key=lambda text_id: len(textregions[text_id]),
        reverse=True,
    )
    return ids_by_region_count[0]

In [None]:
# Show the regions of the single text that has the most textregions.
max_textregions_id = get_id_of_text_with_most_textregions(textregions)
regions_of_max_text = textregions[max_textregions_id]
visualize_textregions(
    { max_textregions_id: regions_of_max_text },
    title=f"Positions of {len(regions_of_max_text)} textregions on page {max_textregions_id}",
)

## 4. Text region counts per certificate

In [None]:
# Bar chart of the number of files for each textregion count.
visualize_textregions_frequencies(textregions)

## 5. Check for texts without text regions

In [None]:
def sanity_check_textregions(textregions):
    """Print one line for every document whose list of textregions is empty.

    Args:
        textregions: mapping from a document id to a list of regions.
    """
    for document_id, regions in textregions.items():
        if len(regions) > 0:
            continue
        print(f"no textregions found for document {document_id}")

In [None]:
# Report every document that has an empty list of textregions.
sanity_check_textregions(textregions)