# Get names from scans

Extract the parts of the scans of the Curacao Death Registers which contain the name of the deceased. This information is based on data annotation by humans.

In [None]:
import numpy
import os
import pandas as pd
import random
import regex
import sys
from PIL import Image, ImageDraw
import xml.etree.ElementTree as ET
from IPython.display import clear_output
sys.path.append(os.getcwd() + '/..')
from scripts.read_transkribus_files import read_files
import matplotlib.pyplot as plt
from Levenshtein import distance

In [None]:
def make_image_file_name(coordinates_file_name):
    """ change Transkribus file name to corresponding image file name and add location of directory

    The trailing ".xml" extension is replaced by ".JPG". The dot is escaped so
    that only a literal ".xml" suffix is rewritten (an unescaped dot would
    match any character in front of "xml").
    """
    return "../website/private/hdsc/data/" + regex.sub(r"\.xml$", ".JPG", coordinates_file_name)

In [None]:
def print_with_color(string, color_code=1):
    """ print string wrapped in an ANSI foreground color escape sequence, without trailing newline """
    color_on = f"\x1b[3{color_code}m"
    color_off = "\x1b[m"
    print("".join((color_on, f"{string}", color_off)), end="")

In [None]:
def squeal(text=None):
    """ clear the output of the current cell and, if text is given, print it as the new output """
    clear_output(wait=True)
    if text is None:
        return
    print(text)

## 1. Read logfile data

In [None]:
def read_logfile_data(logfile_name):
    """ load the headerless csv logfile with the manually checked name locations into a data frame """
    logfile_table = pd.read_csv(logfile_name, header=None)
    return logfile_table

In [None]:
def convert_logfile_data_to_scan_data(logfile_data):
    """ keep one entry per coordinates file: a later logfile row overwrites an earlier one for the same file """
    latest_by_file = {}
    for row_index, fields in logfile_data.iterrows():
        # each row is expected to hold exactly eight fields; name and date are not stored
        (file_name, status, text_line_id, coords_id,
         _deceased_name, ip_address, _date, text_line_name) = fields
        latest_by_file[file_name] = [row_index, status, text_line_id, coords_id, ip_address, text_line_name]
    return latest_by_file

In [None]:
# Location of the annotation logfile, relative to the notebook directory
LOGFILE_NAME = "../website/private/hdsc/etc/logfile"

# Load the logfile; the bare name on the last line shows the table as cell output
logfile_data = read_logfile_data(LOGFILE_NAME)
logfile_data

In [None]:
# Reduce the logfile to one entry per coordinates file and show the result as a table
scan_data = convert_logfile_data_to_scan_data(logfile_data)
pd.DataFrame.from_dict(scan_data, orient="index")

## 2. Find best coordinates for guess of deceased name position

In [None]:
def get_coordinates_from_line(line):
    """ parse the points attribute of a Transkribus Coords tag ('123,456 789,012 ...') into a list of (x, y) integer tuples """
    points = []
    for pair_text in line.split():
        x_text, y_text = pair_text.split(",")
        points.append((int(x_text), int(y_text)))
    return points

In [None]:
def find_top_left(polygon):
    """ return (smallest y, smallest x) over the points of the first element of polygon; both are capped at sys.maxsize """
    points = polygon[0]["points"]
    # sys.maxsize is the starting value, so it is returned when there are no points
    top_coordinate = min([sys.maxsize] + [point[1] for point in points])
    left_coordinate = min([sys.maxsize] + [point[0] for point in points])
    return top_coordinate, left_coordinate

In [None]:
def sort_polygons(polygons):
    """ return a new list of the polygons ordered by their top left point: top coordinate first, left coordinate second """
    # find_top_left returns (top, left), which is exactly the sort key; sorted() is stable
    return sorted(polygons, key=find_top_left)

In [None]:
INDEX_START_SORTING = 574
INDEX_REMOVE_MARIGINALIA = 2307

def get_text_polygons(coordinates_file_name, index):
    """ read text line polygons from a Transkribus xml file; the logfile index selects the strategy:
        below INDEX_START_SORTING: all regions, lines in file order;
        below INDEX_REMOVE_MARIGINALIA: all regions, lines sorted per region;
        otherwise: only the sorted lines of the region with the most lines
        returns the list of polygons per text line and a dict from text line id to points """
    polygons, polygons_by_name = [], {}
    page_root = ET.parse(coordinates_file_name).getroot()
    for text_region in page_root.findall(".//{*}TextRegion"):
        region_lines = []
        for text_line in text_region.findall("./{*}TextLine"):
            line_polygons = []
            region_lines.append(line_polygons)
            for coords in text_line.findall("./{*}Coords"):
                points = get_coordinates_from_line(coords.attrib["points"])
                line_polygons.append({"points": points, "name": text_line.attrib["id"]})
                polygons_by_name[text_line.attrib["id"]] = points
        if index < INDEX_START_SORTING:
            polygons.extend(region_lines)
        elif index < INDEX_REMOVE_MARIGINALIA:
            polygons.extend(sort_polygons(region_lines))
        elif len(region_lines) > len(polygons):
            # keep only the largest region seen so far
            polygons = sort_polygons(region_lines)
    return polygons, polygons_by_name

In [None]:
def get_text_polygons_new(coordinates_file_name):
    """ read text line polygons from a Transkribus xml file, keeping only the sorted lines of the
        text region with the most lines (smaller regions, such as mariginalia, are dropped)
        returns the list of polygons per text line and a dict from text line id to points """
    polygons, polygons_by_name = [], {}
    page_root = ET.parse(coordinates_file_name).getroot()
    for text_region in page_root.findall(".//{*}TextRegion"):
        region_lines = []
        for text_line in text_region.findall("./{*}TextLine"):
            line_polygons = []
            region_lines.append(line_polygons)
            for coords in text_line.findall("./{*}Coords"):
                points = get_coordinates_from_line(coords.attrib["points"])
                line_polygons.append({"points": points, "name": text_line.attrib["id"]})
                # the name lookup still covers the lines of every region
                polygons_by_name[text_line.attrib["id"]] = points
        if len(region_lines) > len(polygons):
            polygons = sort_polygons(region_lines)
    return polygons, polygons_by_name

In [None]:
def get_text_polygons_old(coordinates_file_name):
    """ read text line polygons from a Transkribus xml file, keeping the lines of all text regions
        (including mariginalia); lines are sorted within each region, regions stay in file order
        returns the list of polygons per text line and a dict from text line id to points """
    polygons, polygons_by_name = [], {}
    page_root = ET.parse(coordinates_file_name).getroot()
    for text_region in page_root.findall(".//{*}TextRegion"):
        region_lines = []
        for text_line in text_region.findall("./{*}TextLine"):
            line_polygons = []
            region_lines.append(line_polygons)
            for coords in text_line.findall("./{*}Coords"):
                points = get_coordinates_from_line(coords.attrib["points"])
                line_polygons.append({"points": points, "name": text_line.attrib["id"]})
                polygons_by_name[text_line.attrib["id"]] = points
        polygons.extend(sort_polygons(region_lines))
    return polygons, polygons_by_name

In [None]:
def polygon2rectangle(coordinates):
    """ return (x_min, y_min, x_max, y_max) of the bounding rectangle of a list of (x, y) points;
        minima start at sys.maxsize and maxima at 0, which are the values returned for an empty list """
    pairs = [(x, y) for x, y in coordinates]
    x_values = [x for x, _ in pairs]
    y_values = [y for _, y in pairs]
    return (min([sys.maxsize] + x_values),
            min([sys.maxsize] + y_values),
            max([0] + x_values),
            max([0] + y_values))

In [None]:
def encloses_point(rectangle, point):
    """ tell whether point (x, y) lies inside or on the border of rectangle (x_min, y_min, x_max, y_max) """
    x_inside = rectangle[0] <= point[0] <= rectangle[2]
    y_inside = rectangle[1] <= point[1] <= rectangle[3]
    return x_inside and y_inside

In [None]:
def get_best_polygon_for_y(polygons, y):
    """ return (text line index, coords index) of the polygon whose top edge is closest to height y;
        the first polygon wins on ties and (-1, -1) is returned when there are no polygons """
    best = (sys.maxsize, -1, -1)
    for line_index, line_polygons in enumerate(polygons):
        for coords_index, polygon in enumerate(line_polygons):
            top_edge = polygon2rectangle(polygon["points"])[1]
            gap = abs(y - top_edge)
            if gap < best[0]:
                best = (gap, line_index, coords_index)
    return best[1], best[2]

In [None]:
def get_polygon_coordinates(polygons, polygons_by_name, text_line_id, coords_id, text_line_name):
    """ return the points of the annotated polygon: looked up by text line name when one is available,
        by position in polygons when the name is one of the placeholders "xxx" or "NA" """
    if text_line_name in ("xxx", "NA"):
        return polygons[text_line_id][coords_id]["points"]
    return polygons_by_name[text_line_name]

In [None]:
def count_point_name_hits(best_point_x, best_point_y, best_line_y, decades):
    """ count the saved annotations (global scan_data, logfile index >= 600, file name matching decades)
        for which the defaults predict the annotated polygon: either the default point lies in its
        bounding rectangle, or it is the polygon whose top is closest to best_line_y """
    hit_counts = 0
    for coordinates_file_name, scan_fields in scan_data.items():
        index, status, text_line_id, coords_id, ip_address, text_line_name = scan_fields
        if not (index >= 600 and status == "save" and regex.search(decades, coordinates_file_name)):
            continue
        polygons, polygons_by_name = get_text_polygons("../website/private/hdsc/data/page/" + coordinates_file_name,
                                                       index)
        polygon = get_polygon_coordinates(polygons, polygons_by_name, text_line_id, coords_id, text_line_name)
        if encloses_point(polygon2rectangle(polygon), (best_point_x, best_point_y)):
            hit_counts += 1
            continue
        # fall back on the horizontal line: compare by position, not by name
        predicted_ids = get_best_polygon_for_y(polygons, best_line_y)
        if predicted_ids[0] == text_line_id and predicted_ids[1] == coords_id:
            hit_counts += 1
    return hit_counts

In [None]:
def find_best_default_point(best_point_x, best_point_y, best_line_y, incr=10, decades="18[34][0-9]"):
    """ hill climbing for the default point (best_point_x, best_point_y) with step size incr;
        best_line_y is kept constant. Each round prints the current state and moves to the first
        of the eight neighbours that scores strictly more hits; stops when none does """
    # neighbours in the order in which they are tried, as (x step, y step) multiples of incr
    directions = ((1, 0), (1, 1), (0, 1), (-1, 1), (-1, 0), (-1, -1), (0, -1), (1, -1))
    improved = True
    while improved:
        improved = False
        hit_counts = count_point_name_hits(best_point_x, best_point_y, best_line_y, decades)
        print(best_point_x, best_point_y, "#", best_line_y, hit_counts, incr)
        for x_step, y_step in directions:
            candidate_x = best_point_x + x_step * incr
            candidate_y = best_point_y + y_step * incr
            if count_point_name_hits(candidate_x, candidate_y, best_line_y, decades) > hit_counts:
                best_point_x, best_point_y = candidate_x, candidate_y
                improved = True
                break
    return best_point_x, best_point_y

In [None]:
def find_best_line_y_default(best_point_x, best_point_y, best_line_y, incr=10, decades="18[34][0-9]"):
    """ find horizontal line closest to top of most deceased name boxes: best_line_y;
        keep best_point_x and best_point_y constant

        Hill climbing with step size incr: each round prints the current state, then moves
        best_line_y up by incr if that scores strictly more hits, otherwise down by incr if
        that scores strictly more hits; stops when neither direction improves.
        Returns the final best_line_y. """
    while True:
        hit_counts = count_point_name_hits(best_point_x, best_point_y, best_line_y, decades)
        print(best_line_y, "#", best_point_x, best_point_y, hit_counts, incr)
        if count_point_name_hits(best_point_x, best_point_y, best_line_y + incr, decades) > hit_counts:
            best_line_y += incr
            continue
        # evaluate the candidate that is actually moved to: best_line_y - incr
        # (testing "+ incr" a second time would make this branch unreachable)
        if count_point_name_hits(best_point_x, best_point_y, best_line_y - incr, decades) > hit_counts:
            best_line_y -= incr
            continue
        break
    return best_line_y

In [None]:
# Starting values for the searches on scans whose file name matches DECADES
DECADES = "18[34][0-9]"
X_DEFAULT = 688
Y_DEFAULT = 471
LINE_Y_DEFAULT = 510

# Refine best_line_y with decreasing step sizes; each call starts from the previous result
best_line_y = find_best_line_y_default(X_DEFAULT, Y_DEFAULT, LINE_Y_DEFAULT, incr=10, decades=DECADES)
best_line_y = find_best_line_y_default(X_DEFAULT, Y_DEFAULT, best_line_y, incr=5, decades=DECADES)
best_line_y = find_best_line_y_default(X_DEFAULT, Y_DEFAULT, best_line_y, incr=2, decades=DECADES)
best_line_y = find_best_line_y_default(X_DEFAULT, Y_DEFAULT, best_line_y, incr=1, decades=DECADES)

In [None]:
# Refine the default point with decreasing step sizes; each call starts from the previous result
# NOTE(review): LINE_Y_DEFAULT is passed here, not the best_line_y computed earlier - confirm this is intended
best_point_x, best_point_y = find_best_default_point(X_DEFAULT, Y_DEFAULT, LINE_Y_DEFAULT, incr=10, decades=DECADES)
best_point_x, best_point_y = find_best_default_point(best_point_x, best_point_y, LINE_Y_DEFAULT, incr=5, decades=DECADES)
best_point_x, best_point_y = find_best_default_point(best_point_x, best_point_y, LINE_Y_DEFAULT, incr=2, decades=DECADES)
best_point_x, best_point_y = find_best_default_point(best_point_x, best_point_y, LINE_Y_DEFAULT, incr=1, decades=DECADES)

In [None]:
# Same search with different starting values, for scans whose file name matches "192[0-9]"
# (this rebinds DECADES, X_DEFAULT, Y_DEFAULT and LINE_Y_DEFAULT)
DECADES = "192[0-9]"
X_DEFAULT = 818
Y_DEFAULT = 610
LINE_Y_DEFAULT = 745

# Refine best_line_y with decreasing step sizes; each call starts from the previous result
best_line_y = find_best_line_y_default(X_DEFAULT, Y_DEFAULT, LINE_Y_DEFAULT, incr=10, decades=DECADES)
best_line_y = find_best_line_y_default(X_DEFAULT, Y_DEFAULT, best_line_y, incr=5, decades=DECADES)
best_line_y = find_best_line_y_default(X_DEFAULT, Y_DEFAULT, best_line_y, incr=2, decades=DECADES)
best_line_y = find_best_line_y_default(X_DEFAULT, Y_DEFAULT, best_line_y, incr=1, decades=DECADES)

In [None]:
# Refine the default point with decreasing step sizes; each call starts from the previous result
# NOTE(review): LINE_Y_DEFAULT is passed here, not the best_line_y computed earlier - confirm this is intended
best_point_x, best_point_y = find_best_default_point(X_DEFAULT, Y_DEFAULT, LINE_Y_DEFAULT, incr=10, decades=DECADES)
best_point_x, best_point_y = find_best_default_point(best_point_x, best_point_y, LINE_Y_DEFAULT, incr=5, decades=DECADES)
best_point_x, best_point_y = find_best_default_point(best_point_x, best_point_y, LINE_Y_DEFAULT, incr=2, decades=DECADES)
best_point_x, best_point_y = find_best_default_point(best_point_x, best_point_y, LINE_Y_DEFAULT, incr=1, decades=DECADES)

## 3. Compute areas of identified deceased name frames

In [None]:
def compute_rectangle_area(rectangle):
    """ compute the area of a rectangle given as (x_min, y_min, x_max, y_max) """
    width = rectangle[2] - rectangle[0]
    height = rectangle[3] - rectangle[1]
    return width * height

In [None]:
def compute_areas(scan_data):
    """ compute the bounding rectangle areas of all saved name polygons;
        returns (minimum, maximum, average truncated to int) - in that order;
        raises ZeroDivisionError when no annotation has status "save" """
    areas = []
    for coordinates_file_name, scan_fields in scan_data.items():
        index, status, text_line_id, coords_id, ip_address, text_line_name = scan_fields
        if status != "save":
            continue
        polygons, polygons_by_name = get_text_polygons("../website/private/hdsc/data/page/" + coordinates_file_name, index)
        polygon = get_polygon_coordinates(polygons, polygons_by_name, text_line_id, coords_id, text_line_name)
        areas.append(compute_rectangle_area(polygon2rectangle(polygon)))
    # sys.maxsize and 0 are the starting values of the minimum and maximum
    return min([sys.maxsize] + areas), max([0] + areas), int(sum(areas)/len(areas))

In [None]:
# Show (minimum, maximum, average) area of the annotated name frames as cell output
compute_areas(scan_data)

## 4. Compute aspect ratios of identified deceased name frames

In [None]:
def compute_rectangle_aspect_ratio(rectangle):
    """ compute the aspect ratio (width divided by height) of a rectangle given as (x_min, y_min, x_max, y_max);
        raises ZeroDivisionError for a rectangle of height 0 """
    width = rectangle[2] - rectangle[0]
    height = rectangle[3] - rectangle[1]
    return width / height

In [None]:
def compute_aspect_ratios(scan_data):
    """ compute the aspect ratios of the bounding rectangles of all saved name polygons;
        returns (minimum, maximum, average truncated to int) - in that order;
        raises ZeroDivisionError when no annotation has status "save" """
    min_ratio, max_ratio = sys.maxsize, 0
    ratio_count, ratio_total = 0, 0
    for coordinates_file_name, scan_fields in scan_data.items():
        index, status, text_line_id, coords_id, ip_address, text_line_name = scan_fields
        if status != "save":
            continue
        polygons, polygons_by_name = get_text_polygons("../website/private/hdsc/data/page/" + coordinates_file_name, index)
        polygon = get_polygon_coordinates(polygons, polygons_by_name, text_line_id, coords_id, text_line_name)
        ratio = compute_rectangle_aspect_ratio(polygon2rectangle(polygon))
        min_ratio = min(min_ratio, ratio)
        max_ratio = max(max_ratio, ratio)
        ratio_count += 1
        ratio_total += ratio
    return min_ratio, max_ratio, int(ratio_total/ratio_count)

In [None]:
# Show (minimum, maximum, average) aspect ratio of the annotated name frames as cell output
compute_aspect_ratios(scan_data)

## 5. Cut out polygons from scans containing deceased name

In [None]:
# NOTE(review): not referenced in the visible part of this file; get_name_frames_from_scans
# passes a literal 0 to mask_polygon instead - confirm whether this constant is still needed
COVERED_BACKGROUND = 0
# Value that mask_polygon draws inside the polygon in the mask used as alpha channel
TRANSPARENT_BACKGROUND = 255
# NOTE(review): not referenced in the visible part of this file - presumably an RGB color, confirm
FILL_COLOR = (198, 178, 125)

In [None]:
# code based on https://stackoverflow.com/questions/22588074/polygon-crop-clip-using-python-pil

def mask_polygon(image, polygon, covered_background):
    """ return an RGBA copy of image whose alpha channel is a polygon mask:
        TRANSPARENT_BACKGROUND (255) inside the polygon (= deceased name),
        covered_background elsewhere, except for the polygon outline which is drawn with value 0 """
    rgba_pixels = numpy.array(image.convert("RGBA"), dtype='uint8')
    height, width = rgba_pixels.shape[0], rgba_pixels.shape[1]
    alpha_layer = Image.new('P', (width, height), covered_background)
    ImageDraw.Draw(alpha_layer).polygon(polygon, outline=0, fill=TRANSPARENT_BACKGROUND)
    # the color channels stay untouched, only the alpha channel is replaced
    rgba_pixels[:,:,3] = numpy.array(alpha_layer)
    return Image.fromarray(rgba_pixels, "RGBA")

In [None]:
def most_frequent_pixel_value(image):
    """ determine the most frequent color in image, after rounding each channel down to a multiple of 10;
        pixels with a fourth (alpha) value of 0 are ignored; on ties the color seen first wins;
        returns a tuple of three integers """
    color_counts = {}
    for pixel in image.getdata():
        if len(pixel) > 3 and pixel[3] == 0:
            continue
        rounded_color = tuple(int(channel*0.1) * 10 for channel in (pixel[0], pixel[1], pixel[2]))
        color_counts[rounded_color] = color_counts.get(rounded_color, 0) + 1
    # stable sort: among equally frequent colors the first one counted stays in front
    ranked_colors = sorted(color_counts.items(), key=lambda entry: entry[1], reverse=True)
    return ranked_colors[0][0]

In [None]:
def expand_rectangle(rectangle, border_width):
    """ grow rectangle (x_min, y_min, x_max, y_max) by border_width on all four sides """
    left, top, right, bottom = rectangle[0], rectangle[1], rectangle[2], rectangle[3]
    return (left - border_width, top - border_width, right + border_width, bottom + border_width)

In [None]:
def randomize_pixel_value(pixel_value, spread=20):
    """ add an independent random integer between 0 and spread (inclusive) to each of the first three channels """
    red = pixel_value[0] + random.randint(0, spread)
    green = pixel_value[1] + random.randint(0, spread)
    blue = pixel_value[2] + random.randint(0, spread)
    return (red, green, blue)

In [None]:
def fill_background(masked_image):
    """ replace every pixel with alpha 0 by a randomized variant of the most frequent visible color,
        keep the color of all other pixels, and return the image converted to RGB """
    pixels = masked_image.getdata()
    background_color = most_frequent_pixel_value(masked_image)
    filled_pixels = [ randomize_pixel_value(background_color) if pixel[3] == 0 else pixel[:3]
                      for pixel in pixels ]
    masked_image.putdata(filled_pixels)
    return masked_image.convert("RGB")

In [None]:
def get_name_frames_from_scans(scan_data):
    """ cut the annotated name frame out of every saved scan, in logfile index order, and store it
        in directory images under the base name of the scan image; progress is shown every 10 scans;
        returns the last frame written, or None when nothing was written """
    last_frame = None
    processed = 0
    for coordinates_file_name in sorted(scan_data, key=lambda file_name: scan_data[file_name][0]):
        index, status, text_line_id, coords_id, ip_address, text_line_name = scan_data[coordinates_file_name]
        if status == "save":
            polygons, polygons_by_name = get_text_polygons("../website/private/hdsc/data/page/" + coordinates_file_name, index)
            polygon = get_polygon_coordinates(polygons, polygons_by_name, text_line_id, coords_id, text_line_name)
            rectangle = polygon2rectangle(polygon)
            image_file_name = make_image_file_name(coordinates_file_name)
            image = Image.open(image_file_name)
            # mask everything outside the polygon, crop with a margin of 10, then fill the masked part
            cropped_frame = mask_polygon(image, polygon, 0).crop(expand_rectangle(rectangle, 10))
            last_frame = fill_background(cropped_frame)
            last_frame.save("images/" + os.path.basename(image_file_name))
        processed += 1
        if processed % 10 == 0:
            squeal(f"{processed}/{len(scan_data)}")
    squeal(f"{processed}/{len(scan_data)}")
    return last_frame

In [None]:
# Disabled by default: change False to True to regenerate the name frame images
if False:
    get_name_frames_from_scans(scan_data)

## 6. Check image properties

In [None]:
def compute_property_values(scan_data, property_name="ratio"):
    """ compute one property ("ratio", "area", "height" or "length") of the bounding rectangle of every
        saved name polygon; returns a dict: file name -> [logfile index, property value, ip address];
        exits when an unknown property name is met while processing a saved annotation """
    property_values = {}
    for coordinates_file_name, scan_fields in scan_data.items():
        index, status, text_line_id, coords_id, ip_address, text_line_name = scan_fields
        if status != "save":
            continue
        polygons, polygons_by_name = get_text_polygons("../website/private/hdsc/data/page/" + coordinates_file_name, index)
        polygon = get_polygon_coordinates(polygons, polygons_by_name, text_line_id, coords_id, text_line_name)
        rectangle = polygon2rectangle(polygon)
        if property_name == "ratio":
            property_value = compute_rectangle_aspect_ratio(rectangle)
        elif property_name == "area":
            property_value = compute_rectangle_area(rectangle)
        elif property_name == "height":
            property_value = rectangle[3] - rectangle[1]
        elif property_name == "length":
            property_value = rectangle[2] - rectangle[0]
        else:
            sys.exit(f"compute_image_property: unknown property name: {property_name}")
        property_values[coordinates_file_name] = [ index, property_value, ip_address ]
    return property_values

In [None]:
def show_images_by_property_values(property_values, start, end):
    """ show selected images by property values from start to end

    property_values maps a coordinates file name to [index, property value, ip address].
    The entries are sorted by property value and the slice [start:end] of that ranking is shown:
    for each entry a description line is printed and the image of the same name
    (extension .JPG instead of .xml) is displayed from directory images. """
    ranked_items = sorted(property_values.items(), key=lambda item: item[1][1])
    for coordinates_file_name, (index, property_value, ip_address) in ranked_items[start:end]:
        print(f"{index} # {round(property_value, 1)} # {ip_address} # {coordinates_file_name}:")
        # escaped and anchored: only a literal ".xml" extension is replaced, as in make_image_file_name
        display(Image.open("images/" + regex.sub(r"\.xml$", ".JPG", coordinates_file_name)))

In [None]:
def show_extreme_property_values(scan_data, property_name="ratio", n=5):
    """ show the n name frames with the smallest and the n name frames with the largest property value """
    property_values = compute_property_values(scan_data, property_name)
    total = len(property_values)
    for start, end in ((0, n), (total - n, total)):
        show_images_by_property_values(property_values, start, end)

In [None]:
# Show the 10 name frames with the smallest and the 10 with the largest aspect ratio
show_extreme_property_values(scan_data, property_name="ratio", n=10)

In [None]:
# Show the 10 name frames with the smallest and the 10 with the largest area
show_extreme_property_values(scan_data, property_name="area", n=10)

In [None]:
# Count how often each frame length (rectangle width) occurs and plot the counts as a bar chart
length_data = pd.DataFrame([ data[1]for data in compute_property_values(scan_data, "length").values() ]).value_counts()
# value_counts on a data frame yields tuple index entries; x[0] extracts the length
plt.bar([x[0] for x in length_data.index], length_data.values)

In [None]:
# Count how often each frame height occurs and plot the counts as a bar chart
height_data = pd.DataFrame([ data[1] for data in compute_property_values(scan_data, "height").values() ]).value_counts()
# value_counts on a data frame yields tuple index entries; x[0] extracts the height
plt.bar([x[0] for x in height_data.index], height_data.values)

## 7. Combine images

In [None]:
X_MAX_VALUE = 1600

def fits_in_gap(gaps, combined_image, image):
    """ find smallest gap at the end of a line where the name fits; return False otherwise

    gaps: list of [x, y] pairs, the position where free space starts at the end of a line
    combined_image: not used; kept for compatibility with existing callers
    image: object with a size attribute (width, height)

    A gap fits when x + image width does not exceed X_MAX_VALUE. Among the fitting gaps
    the one that leaves the least space behind the image is chosen (the first one on ties).
    Returns (x, y, index in gaps) of that gap, or False when the image fits nowhere. """
    image_width = image.size[0]
    smallest_gap, smallest_i = sys.maxsize, -1
    for i, (x, _y) in enumerate(gaps):
        remaining = X_MAX_VALUE - x - image_width
        if 0 <= remaining < smallest_gap:
            # store the same quantity that is compared (the width must be subtracted, not added)
            smallest_gap = remaining
            smallest_i = i
    if smallest_i >= 0:
        return gaps[smallest_i][0], gaps[smallest_i][1], smallest_i
    return False

In [None]:
def compute_characters_per_name(deceased_names):
    """ compute the average number of non-whitespace characters per name, rounded to one decimal

    deceased_names: nested lists: scans -> lines -> names
    raises ZeroDivisionError when deceased_names contains no names """
    names_counter, characters_counter = (0, 0)
    for scan in deceased_names:
        for line in scan:
            for name in line:
                names_counter += 1
                # raw string: "\s" in a normal string literal is an invalid escape sequence
                characters_counter += len(regex.sub(r"\s", "", name))
    return round(characters_counter/names_counter, 1)

In [None]:
def make_combined_image_file_name(file_counter):
    """ build the file name of a combined image from its counter (a string), zero-padded to four characters """
    padded_counter = file_counter.zfill(4)
    return "combined_images/" + padded_counter + ".jpg"

In [None]:
def initialize_combine_images(deceased_names, years, file_counter):
    """ start a new combined image: add a section with one empty line to deceased_names and years
        (both lists are changed in place) and return the start state
        (x, y, gaps, file_counter + 1, deceased_names, years, blank white 1600x1200 image) """
    for metadata in (deceased_names, years):
        metadata.append([[]])
    blank_page = Image.new( "RGB", (1600, 1200), (255, 255, 255))
    return 0, 0, [], file_counter + 1, deceased_names, years, blank_page

In [None]:
def combine_images(scan_data, patterns):
    """ combine as many as possible name frames in several images

    For each regular expression in patterns, the saved name frames whose coordinates file name
    matches are pasted, in logfile index order, on pages of 1600x1200 pixels with lines of
    100 pixels high. A frame is put in the best fitting gap at the end of an earlier line when
    there is one, otherwise behind the previous frame, on a new line or on a new page.
    Pages are saved under the name made by make_combined_image_file_name; each pattern starts
    on a new page. Reads the global logfile_data for the names.
    Returns (deceased_names, years): nested lists pages -> lines -> values in paste order. """ 
    x, y, gaps, file_counter, deceased_names, years, combined_image = initialize_combine_images([], [], 0)
    for patterns_i in range(0, len(patterns)):
        pattern = patterns[patterns_i]
        for coordinates_file_name in dict(sorted(scan_data.items(), 
                                                 key=lambda scan_data_item: scan_data_item[1][0])): # index value
            if regex.search(pattern, coordinates_file_name):
                index, status, text_line_id, coords_id, ip_address, text_line_name = scan_data[coordinates_file_name]
                if status =="save":
                    # frames without a readable image file are reported and skipped
                    try:
                        image_file_name = "images/" + os.path.basename(make_image_file_name(coordinates_file_name))
                        image = Image.open(image_file_name)
                    except Exception:
                        print(f"problem processing file {image_file_name}")
                        continue
                    # case 1: the frame fits in the free space at the end of an earlier line
                    # (fits_in_gap is evaluated twice: once as test, once for its values)
                    if fits_in_gap(gaps, combined_image, image):
                        x_gap, y_gap, i_gap = fits_in_gap(gaps, combined_image, image)
                        # frames are centered vertically in their line of 100 pixels
                        combined_image.paste(image, (x_gap, y_gap + int((100 - image.size[1]) / 2)))
                        # y_gap/100 is the number of the line on the page; column 4 of the logfile is the deceased name
                        deceased_names[-1][int(0.5 + y_gap/100)].append(logfile_data.iloc[index][4])
                        # presumably the second whitespace-separated part of the file name is the year - TODO confirm
                        years[-1][int(0.5 + y_gap/100)].append(coordinates_file_name.split()[1])
                        # the gap shrinks by the frame width plus 30 pixels of spacing
                        x_gap += image.size[0] + 30
                        gaps = gaps[:i_gap] + [[x_gap, y_gap]] + gaps[i_gap+1:]
                        continue
                    # case 2: the frame fits behind the previous frame on the current line
                    elif x + image.size[0] <= X_MAX_VALUE:
                       pass
                    # case 3: start a new line; the rest of the current line becomes a gap
                    elif y < 1100:
                        gaps.append([x, y])
                        x = 0
                        y += 100
                        deceased_names[-1].append([])
                        years[-1].append([])
                    # case 4: the page is full: save it and start a new page
                    else:
                        combined_image.save(make_combined_image_file_name(str(file_counter)))
                        x, y, gaps, file_counter, deceased_names, years, combined_image = initialize_combine_images(deceased_names, years, file_counter)
                        # NOTE(review): initialize_combine_images already returned a blank page of the same size
                        combined_image = Image.new( "RGB", (1600, 1200), (255, 255, 255))
                    # cases 2-4: paste at the current position and move right
                    combined_image.paste(image, (x, y + int((100 - image.size[1]) / 2)))
                    deceased_names[-1][-1].append(logfile_data.iloc[index][4])
                    years[-1][-1].append(coordinates_file_name.split()[1])
                    x += image.size[0] + 30
        # end of pattern: save the page if anything was pasted on it
        if (x > 0 or y > 0):
            combined_image.save(make_combined_image_file_name(str(file_counter)))
            # every pattern but the last is followed by a fresh page
            if patterns_i < len(patterns) - 1:
                x, y, gaps, file_counter, deceased_names, years, combined_image = initialize_combine_images(deceased_names, years, file_counter)
                # NOTE(review): initialize_combine_images already returned a blank page of the same size
                combined_image = Image.new( "RGB", (1600, 1200), (255, 255, 255))
    print(f"created {file_counter} combined image files; showing final file below:")
    display(combined_image)
    return deceased_names, years

In [None]:
# Regular expressions matched against the coordinates file names; each one starts on a new combined image
patterns = [ "183[0-9]", "184[0-9]", "192[0-9]" ]

# Build the combined images and keep the names and years per page and line
deceased_names, years = combine_images(scan_data, patterns)

In [None]:
# Show the number of pages with metadata, the names of the final page and the years of its final line
print(f"metadata sections: {len(deceased_names)}\nfinal metadata section: {deceased_names[-1]}\n{years[-1][-1]}")

In [None]:
# Show the average number of non-whitespace characters per name
compute_characters_per_name(deceased_names)

## 8. Check combined images

20231107 corrected names:
* page 49 corrected several
* page 130 bernardo wever
* page 130 Fabias Sebastiaan Josephina (removed)
* page 171 last

In [None]:
def find_name_in_deceased_names_pages(deceased_names, name):
    """ print every page that has a name matching the regular expression name (case is ignored);
        a page is printed once for each matching name on it """
    for page in deceased_names:
        names_on_page = (line_name for line in page for line_name in line)
        for line_name in names_on_page:
            if regex.search(name, line_name, regex.IGNORECASE):
                print(page)

In [None]:
def find_name_in_logfile_data(logfile_data, name):
    """ return the logfile rows whose column 4 (the deceased name) is exactly equal to name """
    has_name = logfile_data[4] == name
    return logfile_data[has_name]

In [None]:
# Print the pages that contain a name matching this pattern (case is ignored)
find_name_in_deceased_names_pages(deceased_names, "francisco martelij")

In [None]:
# Show the logfile rows with exactly this deceased name
find_name_in_logfile_data(logfile_data, "Englentina Martina")

## 9 Add names to checked baseline generated by Transkribus

The baselines are computed by Transkribus under Tools / P2PaLA with the Public Model Spruchakten_3_blonly-2019-11-07 14:09:29 19746

In [None]:
def fix_name_case(name):
    """ lowercase all but the first character of every word that consists of the letters A-Z only;
        other words are kept; the words are joined again with single spaces """
    def capitalize_if_allcaps(word):
        if regex.search("^[A-Z]+$", word):
            return word[0] + word[1:].lower()
        return word

    return " ".join(capitalize_if_allcaps(word) for word in name.split())

In [None]:
def add_text_to_element(element_tag, text):
    """ append a TextEquiv element holding a Unicode element with the given text to element_tag """
    textequiv_tag = ET.SubElement(element_tag, "TextEquiv")
    ET.SubElement(textequiv_tag, "Unicode").text = text

In [None]:
def delete_textequiv_elements(root):
    """ remove all TextEquiv elements with text from a Transkribus xml file

    Only TextEquiv elements that are direct children of a TextLine element are removed,
    whatever their namespace; root is changed in place. """
    for textline in root.findall(".//{*}TextLine"):
        # iterate over a copy of the child list: removing children while iterating over the
        # element itself skips the sibling that follows each removed child
        for child in list(textline):
            # the local tag name is the part behind the "{namespace}" prefix, if any
            if isinstance(child.tag, str) and child.tag.rsplit("}", 1)[-1] == "TextEquiv":
                textline.remove(child)

In [None]:
# Directory with the Transkribus xml files of the combined images
FILE_DIR = "combined_images/page"
# Namespace registered as default namespace before the xml files are written
NAMESPACE = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"
# Number of text lines in a complete file; other counts trigger a warning
MAX_NBR_OF_LINES_PER_FILE = 12

def add_ground_truth_text_to_transkribus_xml(transkribus_xml_dir, deceased_names):
    """ add ground truth text to Transkribus xml files, first removes previous texts from the files

    The files in transkribus_xml_dir are processed in sorted name order and rewritten in place.
    The n-th file gets the names of deceased_names[n]: the names of line m, joined by spaces and
    case-fixed, become the text of the m-th TextLine element of the file.
    A warning is printed for files that do not have MAX_NBR_OF_LINES_PER_FILE text lines. """
    ET.register_namespace("", NAMESPACE)
    file_index = 0
    # NOTE(review): every directory entry is parsed as xml, whatever its name - confirm the directory holds xml files only
    for file_name in sorted(os.listdir(transkribus_xml_dir)):
        squeal(file_name)
        tree = ET.parse(os.path.join(transkribus_xml_dir, file_name))
        root = tree.getroot()
        delete_textequiv_elements(root)
        line_index = 0
        for textline_tag in root.findall(".//{*}TextLine"):
            # second removal pass; note that findall(".//...") also returns deeper descendants,
            # while remove() only accepts direct children
            for textequiv_tag in textline_tag.findall(".//{*}TextEquiv"):
                textline_tag.remove(textequiv_tag)
            # raises IndexError when the file has more text lines than deceased_names has lines for it
            add_text_to_element(textline_tag, fix_name_case(" ".join(deceased_names[file_index][line_index])))
            line_index += 1
        tree.write(os.path.join(transkribus_xml_dir, file_name))
        if line_index != MAX_NBR_OF_LINES_PER_FILE:
            print(f"warning: incomplete file: {file_name}; file_index: {file_index}; number of lines: {line_index}; final line: {deceased_names[file_index][line_index-1]}")
        file_index += 1

In [None]:
# Enabled: running this cell rewrites the xml files in FILE_DIR; change True to False to skip
if True:
    add_ground_truth_text_to_transkribus_xml(FILE_DIR, deceased_names)

## 10. Check processed validation data of training run

Training is done with Transkribus / Tools / Train a new model, with options 
* Automatic selection of validation set: 10% from train set
* Use existing polygons for training
* Base model: HTR-Curacao_bestModel
* Language: Dutch; Flemish (nld)
* Max-nr. of Epochs: 250

In [None]:
def get_word_split_ids(name_list):
    """ return for each name the number of words in all names that precede it """
    # start with the offset of the first name; each step adds the offset of the next name
    offsets = [0]
    for name in name_list:
        offsets.append(offsets[-1] + len(name.split()))
    # the final value is the total word count, it does not belong to any name
    return offsets[:-1]

In [None]:
def split_line_in_names(guessed_line, correct_names):
    """ split guessed line in names based on expected names; fails if the number of words differs

    Each guessed name gets as many words as the correct name at the same position,
    the final guessed name receives all remaining words.
    """
    guessed_words = guessed_line.split()
    start_ids = get_word_split_ids(correct_names)
    # every name ends where the next one starts; None makes the last slice run to the end
    end_ids = start_ids[1:] + [None]
    return [" ".join(guessed_words[start_id:end_id]) for start_id, end_id in zip(start_ids, end_ids)]

In [None]:
def compare_names(guessed_name, correct_name, correct_words_count, total_words_count, correct_names_count):
    """ compare a guessed name with the expected correct name and update the running counts

    guessed_name, correct_name: names to compare, the comparison ignores case
    correct_words_count: running count of guessed words equal to the correct word at the same position
    total_words_count: running count of guessed words
    correct_names_count: running count of guessed names that are completely correct
    returns: the updated correct_words_count, total_words_count and correct_names_count
    A guessed name which differs from the correct name is printed with its number of matching words.
    """
    guessed_words = guessed_name.split()
    correct_words = correct_name.split()
    # zip stops at the shortest name: surplus guessed words have no counterpart and never match
    matching_words_count = sum(guessed_word.lower() == correct_word.lower()
                               for guessed_word, correct_word in zip(guessed_words, correct_words))
    correct_words_count += matching_words_count
    total_words_count += len(guessed_words)
    if guessed_name.lower() == correct_name.lower():
        correct_names_count += 1
    else:
        print(matching_words_count, "#", guessed_name, "#", correct_name)
    return correct_words_count, total_words_count, correct_names_count

In [None]:
def compare_lines_via_names(guessed_line, correct_names):
    """ compare a guessed line with the expected names after splitting the line in names

    The split is based on the word counts of the correct names: fails when the number of words differ.
    returns: number of correct words, number of guessed words, number of correct names, number of names
    """
    guessed_names = split_line_in_names(guessed_line, correct_names)
    word_hits, word_total, name_hits = 0, 0, 0
    # split_line_in_names returns one guessed name per correct name
    for guessed_name, correct_name in zip(guessed_names, correct_names):
        word_hits, word_total, name_hits = compare_names(guessed_name,
                                                         correct_name,
                                                         word_hits,
                                                         word_total,
                                                         name_hits)
    return word_hits, word_total, name_hits, len(guessed_names)

In [None]:
def get_word_dict(line):
    """ return dict with the frequencies of the words on line; the case of the words is kept as is """
    frequencies = {}
    for token in line.split():
        frequencies[token] = frequencies.get(token, 0) + 1
    return frequencies

In [None]:
def show_missed_names(missed_names, guessed_words, file_id, line_id):
    """ print each missed name on a line of its own, preceded by file_id and line_id

    Name parts which are not a key of guessed_words are printed in color; each line
    ends with the list of guessed words that still have a positive count.
    """
    for missed_name in missed_names:
        print(file_id, line_id, end=" ")
        for part in missed_name.split():
            if part.lower() not in guessed_words:
                print_with_color(part)
                print("", end=" ")
                continue
            print(part, end=" ")
        unmatched_guesses = [guess for guess, count in guessed_words.items() if count > 0]
        print(unmatched_guesses)

In [None]:
def compare_lines_via_words(guessed_line, correct_names, years_line):
    """ compare lines by comparing individual words: does not respect word order

    guessed_line: text of the guessed line
    correct_names: expected names on the line
    years_line: year of each expected name, at the same position as the name
    returns: number of correct words, number of expected words, number of correct names,
             number of expected names, list of missed names, dict with the counts of the
             guessed words that were not matched, dict with correct and wrong name counts per year
    """
    guessed_words = get_word_dict(guessed_line.lower())
    word_hits, word_total, name_hits = 0, 0, 0
    missed_names = []
    years_eval_line = {}
    for position, correct_name in enumerate(correct_names):
        year_scores = years_eval_line.setdefault(years_line[position], { "correct": 0, "wrong": 0 })
        all_words_found = True
        for correct_word in correct_name.lower().split():
            word_total += 1
            if guessed_words.get(correct_word, 0) > 0:
                # each guessed word can only be used once
                word_hits += 1
                guessed_words[correct_word] -= 1
            else:
                all_words_found = False
        if all_words_found:
            name_hits += 1
            year_scores["correct"] += 1
        else:
            missed_names.append(correct_name)
            year_scores["wrong"] += 1
    return word_hits, word_total, name_hits, len(correct_names), missed_names, guessed_words, years_eval_line

In [None]:
def get_doc_id(root):
    """ return the attribute docId of the first TranskribusMetadata element of the xml document """
    metadata_tags = root.findall(".//{*}TranskribusMetadata")
    first_metadata_tag = metadata_tags[0]
    return first_metadata_tag.attrib["docId"]

In [None]:
def get_time_stamp(root):
    """ get the time stamp of the file from the tag LastChange

    root: root element of a Transkribus xml file
    returns: text of the first LastChange element, with everything from the first "." on removed
    raises: IndexError when the file has no LastChange element
    """
    time_stamp = root.findall(".//{*}LastChange")[0].text
    # raw string: "\." in a normal string literal is an invalid escape sequence
    return regex.sub(r"\..*$", "", time_stamp)

In [None]:
def get_export_job_id(validation_file_dir):
    """ get the export job id from file log.txt

    validation_file_dir: directory with the validation files; log.txt is read from three levels above it
    returns: the text after the last space on the first line of log.txt
    """
    log_file_name = os.path.join(validation_file_dir, "../../../log.txt")
    # the context manager closes the file, also when reading fails
    with open(log_file_name, "r") as in_file:
        line = in_file.readline().strip()
    return regex.sub("^.* ", "", line)

In [None]:
def update_years_eval_file(years_eval_file, years_eval_line):
    """ add the scores per year of a line to the scores per year of the file

    years_eval_file is updated in place; years that are missing in it start with zero counts.
    """
    for year, line_scores in years_eval_line.items():
        file_scores = years_eval_file.setdefault(year, { "correct": 0, "wrong": 0 })
        for label in ("correct", "wrong"):
            file_scores[label] += line_scores[label]

In [None]:
def make_decade_scores(years_eval_file):
    """ convert scores per year to scores per decade

    years_eval_file: dict with the counts of correct and wrong names per year
    returns: dict with for each decade a list with the percentage of correct names
             (one decimal) and the total number of names
    """
    decade_totals = {}
    for year, year_scores in years_eval_file.items():
        # replace the final character of the year to get the decade label
        decade = regex.sub(".$", "", year) + "0s"
        totals = decade_totals.setdefault(decade, { "correct": 0, "wrong": 0 })
        totals["correct"] += year_scores["correct"]
        totals["wrong"] += year_scores["wrong"]
    decade_scores = {}
    for decade, totals in decade_totals.items():
        nbr_of_names = totals["correct"] + totals["wrong"]
        decade_scores[decade] = [ round(100 * totals["correct"] / nbr_of_names, 1), nbr_of_names ]
    return decade_scores

In [None]:
def evaluate_validation_data(validation_file_dir, deceased_names, years, known_min_freq=1, unknown_min_freq=2, fix_lines=False):
    """ evaluate the validation data generated by Transkribus; this version is word-order insensitive

    validation_file_dir: directory with the Transkribus xml files, named <number>.xml
    deceased_names: correct names, indexed by file id (file number minus one) and line id
    years: years of the correct names, indexed by file id and line id
    known_min_freq, unknown_min_freq: thresholds passed to fix_name_line, only used when fix_lines is set
    fix_lines: post-process each guessed line with fix_name_line before comparing it
    returns: export job id, doc id and time stamp of the last file read,
             dict with the word and name accuracy percentages, name accuracy per decade
    raises: ValueError when the directory contains no words to evaluate
    """
    correct_words_file, total_words_file, correct_names_file, total_names_file = 0, 0, 0, 0
    years_eval_file = {}
    # read the files from the directory passed as argument, not from the global VALIDATION_FILE_DIR
    for file_name in sorted(os.listdir(validation_file_dir)):
        file_id = int(regex.sub(".xml$", "", file_name)) - 1
        tree = ET.parse(os.path.join(validation_file_dir, file_name))
        root = tree.getroot()
        for line_id, unicode_tag in enumerate(root.findall(".//{*}TextLine/{*}TextEquiv/{*}Unicode")):
            name_line = unicode_tag.text
            # an empty Unicode element has no text (None)
            if not isinstance(name_line, str):
                name_line = ""
            if fix_lines:
                name_line = fix_name_line(name_line, known_min_freq, unknown_min_freq)
            (correct_words_line,
             total_words_line,
             correct_names_line,
             total_names_line,
             _missed_names,
             _guessed_words,
             years_eval_line) = compare_lines_via_words(name_line,
                                                        deceased_names[file_id][line_id],
                                                        years[file_id][line_id])
            update_years_eval_file(years_eval_file, years_eval_line)
            correct_words_file += correct_words_line
            total_words_file += total_words_line
            correct_names_file += correct_names_line
            total_names_file += total_names_line

    # words are counted per name: without words there are no names either and no accuracy can be computed
    if total_words_file == 0:
        raise ValueError(f"no words to evaluate in {validation_file_dir}")
    words_correct_percentage = round(100 * correct_words_file / total_words_file, 1)
    names_correct_percentage = round(100 * correct_names_file / total_names_file, 1)
    decade_scores = make_decade_scores(years_eval_file)
    # root is the root element of the last file that was read
    return (get_export_job_id(validation_file_dir),
            get_doc_id(root),
            get_time_stamp(root),
            { "words_acc": words_correct_percentage, "names_acc": names_correct_percentage },
            decade_scores)

In [None]:
# Validation sets of the successive training runs. Each assignment overwrites the
# previous one, so only the last VALIDATION_FILE_DIR is used by the evaluation below.
VALIDATION_FILE_DIR = "tmp/1619947/TRAINING_VALIDATION_SET_combined_images_with_ground_truth_(4250)/page/" 
# Transkribus: CER: 20.3%; WER: 41.4%; time: 2:02h; epoch: 250
VALIDATION_FILE_DIR = "tmp/1625029/combined_images_validation/page"
# Transkribus: model combined images with ground truth (4250); use existing line polygons; Language model from training data
VALIDATION_FILE_DIR = "tmp/1632818/combined_images_validation_5258/page"
# 20231113 combined images (6750) 1830s 1840s runs: 150, 300, 550
VALIDATION_FILE_DIR = "tmp/1652679/combined_images_valudation_6750/page"
# 20231120 1830s 250 runs
VALIDATION_FILE_DIR = "tmp/1664712/TRAINING_VALIDATION_SET_combined_images_1830s/page"
# 20231120 1840s 250 runs
VALIDATION_FILE_DIR = "tmp/1665816/TRAINING_VALIDATION_SET_combined_images_1840s/page"
# 20231120 1920s 250 runs
VALIDATION_FILE_DIR = "tmp/1666399/TRAINING_VALIDATION_SET_combined_images_1920s/page"
# 20231120 1920s 250 runs; batch size 6
VALIDATION_FILE_DIR = "tmp/1667139/TRAINING_VALIDATION_SET_combined_images_1920s_batch_size_6/page"
# 20231120 1920s 250 runs; batch size 6
# NOTE(review): the comment above repeats the one of the previous run; the results table
# lists doc id 1667611 as "3 decades from HCbm 250 epochs" -- confirm
VALIDATION_FILE_DIR = "tmp/1667611/TRAINING_VALIDATION_SET_combined_images_(7513)/page"

# evaluate the guessed lines as they are (fix_lines=False: no post-processing)
evaluate_validation_data(VALIDATION_FILE_DIR, deceased_names, years, fix_lines=False)

| **HTR model** | **job id** | **doc id** | **date** | **name accuracy** | **1830s** | **1840s** | **1920s** | **post-processed** | **1830s** | **1840s** | **1920s** | TPR | missed | cutoff | |
| ------------- | ---------- | ---------- | -------- | :---------------: | :----------------: | :---: | :---: | :---: | :---: | :---: | :---: | :-: | :---: | :---: | :-: |
| Model 4250 from Dutchess | 6699433 | 1625029 | 2023-10-24T14:40:45 | - | ||| - | ||| |||
| HTR_Curacao_best_model (HCbm) | 6699264 | 1625029 | 2023-10-24T14:18:33 | 11.1% | ||| 16.0% | ||| |||
| The Dutchess I | 6699351 | 1625029 | 2023-10-24T14:30:39 | 26.0% | ||| 29.5% | ||| |||
| Model 4250 from HCbm | 6699155 | 1625029 | 2023-10-24T14:07:52 | 53.1% | ||| 53.8% | ||| |||
| Model 5258 from HCbm | 6748708 | 1632818 | 2023-10-31T09:27:16 | 29.1% | 26.8% | 35.7% | 8.3% | 38.8% | 36.2% | 44.2% | 22.9% | |||
| Model 6750 from HCbm 150 epochs | 6879183 | 1652679 | 2023-11-13T09:45:48 | 46.0% | 41.9% | 48.5% | --- | 46.0% |  44.1% | 47.2% | --- | |||
| Model 6750 from HCbm 300 epochs | 6879187 | 1652679 | 2023-11-13T09:46:15 | 52.9% | 49.3% | 55.0% | --- | 51.8% |  52.2% | 51.5% | --- | |||
| Model 6750 from HCbm 550 epochs | 6879193 | 1652679 | 2023-11-13T09:48:13 | 47.1% | 41.9% | 50.2% | --- | 48.5% |  47.8% | 48.9% | --- | |||
| 1830s from HCbm 250 epochs | 6973533 | 1664712 | 2023-11-19T17:16:01 | --- | 55.3% | --- | --- | --- | 58.7% | --- | --- | 94% | 6% | 13 |
| 1840s from HCbm 250 epochs | 6974337 | 1665816 | 2023-11-20T11:39:33 | --- | --- | 63.7% | --- | --- | --- | 66.0% | --- | 95% | 3% | 3 |
| 1920s from HCbm 250 epochs | 6982409 | 1666399 | 2023-11-20T16:12:18 | --- | --- | --- | 7.2% | --- | --- | --- | 10.2% | 71% | 29% | 220 |
| as above with batch size 6 | 6985194 | 1667139 | 2023-11-20T20:08:43 | --- | --- | --- | 11.4% | --- | --- | --- | 13.9% | 72% | 28% | 160 |
| 3 decades from HCbm 250 epochs | 6985330 | 1667611 | 2023-11-20T20:32:43 | 59.1% | 63.3% | 69.3% | 42.2% | 59.9% | 61.3% | 70.2% | 45.2% | 94% | 6% | 8 | 🔥

## 11. Postprocessing names

In [None]:
def split_name(name_in):
    """ split a name in its alternative names at the phrases that introduce an alias

    name_in: name, possibly with aliases, for example "Jan alias Piet"
    returns: list with the separate names, or an empty list when the name contains no alias phrase
    The phrases are regular expressions which are tried in the order of the list, so longer
    phrases need to come before the shorter phrases they contain. A phrase only matches
    when it has a space on both sides.
    """
    for phrase in [ r"\(a\)",
                    "zich noemende en schrijfende*",
                    "zich noemende en schrijvende*",
                    "zich noemende en schryfende*",
                    "zich noemende en schryvende*",
                    # this entry repeated "zich noemende en schrijfende*" while the "ook"
                    # variant of that phrase was missing from this group
                    "zich ook noemende en schrijfende*",
                    "zich ook noemende en schrijvende*",
                    "zich ook noemende en schryfende*",
                    "zich ook noemende en schryvende*",
                    "zich noemende en teekenende*",
                    "zich ook noemende en teekenende*",
                    "zich ook noemende*",
                    "zich noemde*",
                    "zich noemende*",
                    "zich teekenende*",
                    "zich ook schrijfende*",
                    "zich ook schrijvende*",
                    "zich ook schryfende*",
                    "zich ook schryvende*",
                    "zich schrijfende*",
                    "zich schrijvende*",
                    "zich schryfende*",
                    "zich schryvende*",
                    "zich ook te noemen",
                    r"z\.n\.e\.s\.",
                    r"z\.n\.e\.t\.",
                    r"z\.n\.",
                    r"z\.t\.",
                    r"z\. t\.",
                    "ook genaamd",
                    r"ook gen\.",
                    "of ook wel",
                    "of ook",
                    "ook",
                    "alias",
                    # the comma after this phrase was missing, which glued it to "genaamd"
                    r"o\.g\.",
                    "genaamd",
                    "door de wandeling", ]:
        if regex.search(f" {phrase} ", name_in):
            names_out = []
            for name in regex.split(f" {phrase} ", name_in):
                # a part may contain another alias phrase: split it further,
                # an empty result means that the part is a single name
                names_out.extend(split_name(name) or [name])
            return names_out
    return []

In [None]:
def split_names(known_names):
    """ replace each name with aliases in the data frame by separate rows for the alternative names

    The row of a name with aliases is emptied and the alternative names are appended at
    the end of the data frame. Progress is shown every 1000 rows.
    """
    for row_id, row in known_names.iterrows():
        alternative_names = split_name(row["Namen"])
        if alternative_names:
            known_names.iloc[row_id] = ""
            for alternative_name in alternative_names:
                known_names.loc[len(known_names)] = alternative_name
        if row_id % 1000 == 0:
            squeal(f"{row_id}/{len(known_names)} (split_names)")
    return known_names

In [None]:
def to_lower_names(known_names):
    """ convert name parts written completely in upper case to lower case with an initial capital

    Only name parts with at least two successive capitals A-Z and without characters a-z
    are changed. Progress is shown every 1000 rows.
    """
    for row_id, row in known_names.iterrows():
        original_name = row["Namen"]
        name_words = original_name.split()
        for position, name_word in enumerate(name_words):
            if regex.search("[A-Z][A-Z]+", name_word) and not regex.search("[a-z]", name_word):
                name_words[position] = name_word[0] + name_word[1:].lower()
        converted_name = " ".join(name_words)
        if converted_name != original_name:
            known_names.iloc[row_id] = converted_name
        if row_id % 1000 == 0:
            squeal(f"{row_id}/{len(known_names)} (to_lower_names)")
    return known_names

In [None]:
INFIX_TOKENS = "@ d. da de del dela den der di don dos du el la las le lo los 's san santa sint st st. 't t te ten ter v. v.d. van vander von".split() 

def move_infix_tokens(known_names):
    for row_id, row in known_names.iterrows():
        if regex.search(",",  row["Namen"]):
            if not regex.search(",.*,",  row["Namen"]):
                prefix, suffix = regex.split(" *, *", row["Namen"])
                suffix_tokens = suffix.split()
                only_known_infix_tokens = True
                for suffix_token in suffix_tokens:
                    if suffix_token.lower() not in INFIX_TOKENS:
                        only_known_infix_tokens = False
                        break
                if only_known_infix_tokens:
                    prefix_tokens = prefix.split()
                    results = prefix_tokens[:-1]
                    results.extend(suffix_tokens)
                    results.extend([prefix_tokens[-1]])
                    known_names.loc[row_id] = " ".join(results)
        if 1000 * int(row_id / 1000) == row_id:
            squeal(f"{row_id}/{len(known_names)} (move_infix_tokens)")
    return known_names                      

In [None]:
def fix_of(known_names):
    """ split names with two alternatives separated by the word "of" in two names

    The name "A of B C" is replaced by "A C" and the name "B C" is appended at the end
    of the data frame. Both new names are also appended to the file tmp_of.
    Progress is shown every 1000 rows and after the final row.
    """
    for row_id, row in known_names.iterrows():
        if regex.search(" of ", row["Namen"]):
            name_words = row["Namen"].split()
            # position of the first word "of": the words around it are the two alternatives
            name_words_id = name_words.index("of")
            known_names.iloc[row_id] = " ".join(name_words[:name_words_id] + name_words[name_words_id+2:])
            known_names.loc[len(known_names)] = " ".join(name_words[:name_words_id-1] + name_words[name_words_id+1:])
            # the context manager closes the file, also when writing fails
            with open("tmp_of", "a") as out_file:
                print(known_names.iloc[row_id]["Namen"], file=out_file)
                print(known_names.iloc[len(known_names)-1]["Namen"], file=out_file)
        if row_id % 1000 == 0:
            squeal(f"{row_id}/{len(known_names)} (fix_of)")
    squeal(f"{row_id}/{len(known_names)} (fix_of)")
    return known_names

In [None]:
# csv file with the known names; the cleaning functions read its column "Namen"
KNOWN_NAMES_FILE = "../../data/Overlijden/x-misc/Namen en beroepen Curacao.csv"

Cleaning the names takes a lot of time...

In [None]:
# read the known names and clean them step by step; each function changes and returns the data frame
known_names = pd.read_csv(KNOWN_NAMES_FILE)
known_names = split_names(known_names)
known_names = to_lower_names(known_names)
known_names = move_infix_tokens(known_names)
known_names = fix_of(known_names)
# drop the rows that were emptied by split_names, remove duplicates and sort by name
known_names = known_names[known_names.Namen != ""].drop_duplicates().sort_values(by="Namen")
# save the cleaned names without the index column
known_names.to_csv("tmp.csv", index=None)

In [None]:
def count_words(name_list):
    """ return dict with the frequencies of the lower-cased words/name parts in the name list """
    word_freqs = {}
    for name in name_list:
        for word in name.lower().split():
            word_freqs[word] = word_freqs.get(word, 0) + 1
    return word_freqs

In [None]:
def get_most_similar_names(guessed_name, word_freqs, unknown_min_freq):
    """ return the smallest Levenshtein distance and the words at that distance, most frequent first

    Only words with a frequency of at least unknown_min_freq are considered; the
    guessed name is lower-cased before it is compared.
    returns: (sys.maxsize, []) when no word is frequent enough
    """
    guessed_name_lower = guessed_name.lower()
    candidate_distances = { word: distance(word, guessed_name_lower)
                            for word, freq in word_freqs.items() if freq >= unknown_min_freq }
    if not candidate_distances:
        return sys.maxsize, []
    best_distance = min(candidate_distances.values())
    best_words = [ word for word, word_distance in candidate_distances.items() if word_distance == best_distance ]
    best_words.sort(key=lambda word: word_freqs[word], reverse=True)
    return best_distance, best_words

In [None]:
def fix_name_line(name_line, known_min_freq=10, unknown_min_freq=10):
    """ replace infrequent name parts on the line by close frequent name parts

    name_line: guessed line; it is lower-cased before processing
    known_min_freq: minimum frequency in the global word_freqs for keeping a word unchanged
    unknown_min_freq: minimum frequency in word_freqs of the words that may replace other words
    returns: line with the kept and replaced words, separated by single spaces
    """
    name_words_out = []
    for name_word in name_line.lower().split():
        if name_word in word_freqs and word_freqs[name_word] >= known_min_freq:
            name_words_out.append(name_word)
        else:
            similar_names = get_most_similar_names(name_word, word_freqs, unknown_min_freq)[1]
            # no word reaches unknown_min_freq: keep the word instead of failing on an empty list
            name_words_out.append(similar_names[0] if similar_names else name_word)
    return " ".join(name_words_out)

In [None]:
def optimize_min_freqs(known_min_freq, unknown_min_freq, step_size, target=1):
    """ optimize the threshold frequencies for known name parts and replacing name parts

    known_min_freq, unknown_min_freq: start values of the thresholds
    step_size: amount by which the thresholds are changed in each round
    target: position in the result tuple of evaluate_validation_data of the score to maximize
    returns: the thresholds for which none of the neighbouring threshold pairs scores better
    NOTE(review): evaluate_validation_data in this notebook returns (export job id, doc id,
    time stamp, accuracies, decade scores), so the default target=1 selects the doc id --
    confirm which score is meant to be optimized.
    """
    best_freq = evaluate_validation_data(VALIDATION_FILE_DIR, 
                                         deceased_names,
                                         years,
                                         known_min_freq, 
                                         unknown_min_freq,
                                         fix_lines=True)[target]
    best_i, best_j = 0, 0
    for i in [ -step_size, 0, step_size ]:
        for j in [ -step_size, 0, step_size ]:
            if i == 0 and j == 0:
                # the current thresholds were evaluated above and cannot beat their own score
                continue
            test_freq = evaluate_validation_data(VALIDATION_FILE_DIR, 
                                                 deceased_names,
                                                 years,
                                                 known_min_freq + i, 
                                                 unknown_min_freq + j,
                                                 fix_lines=True)[target]
            if test_freq > best_freq:
                best_freq, best_i, best_j = test_freq, i, j
    if best_i != 0 or best_j != 0:
        # continue from the best neighbour, optimizing the same target score
        return optimize_min_freqs(known_min_freq + best_i, unknown_min_freq + best_j, step_size, target)
    return known_min_freq, unknown_min_freq

In [None]:
# frequencies of the name parts in the cleaned known names; fix_name_line reads this global variable
word_freqs = count_words(known_names["Namen"])

In [None]:
# search for better thresholds, starting at known_min_freq=1 and unknown_min_freq=2 with step size 1
optimize_min_freqs(1, 2, 1, target=1)

In [None]:
# evaluate with post-processed lines (fix_lines=True), with known_min_freq=1 and unknown_min_freq=2
evaluate_validation_data(VALIDATION_FILE_DIR, deceased_names, years, 1, 2, fix_lines=True)

## 12. Estimate which guessed words are correct

In [None]:
def add_word_to_freq_dict(frequency_dict, word, count=1):
    """ add count to the frequency of word in the frequency dict; return the updated dict """
    frequency_dict[word] = frequency_dict.get(word, 0) + count
    return frequency_dict

In [None]:
def get_train_words(deceased_names, test_file_ids, cutoff=0):
    """ alternative for get_known_names_words: return words appearing in names in train data

    deceased_names: names per file and per line
    test_file_ids: ids of the files that belong to the test data, these files are skipped
    cutoff: minimum frequency of the returned words
    returns: dict with the lower-cased words as keys and True as values
    """
    word_counts = {}
    for file_id in range(len(deceased_names)):
        if file_id in test_file_ids:
            continue
        for line in deceased_names[file_id]:
            for word in " ".join(line).lower().split():
                add_word_to_freq_dict(word_counts, word)
    return { word: True for word, count in word_counts.items() if count >= cutoff }

In [None]:
def get_known_names_words(known_names, cutoff=0):
    """ return the lower-cased words of the known names which occur at least "cutoff" times

    returns: dict with the words as keys and True as values
    """
    word_counts = {}
    for known_name in known_names["Namen"]:
        for word in known_name.lower().split():
            add_word_to_freq_dict(word_counts, word)
    frequent_words = {}
    for word, count in word_counts.items():
        if count >= cutoff:
            frequent_words[word] = True
    return frequent_words

In [None]:
def get_test_word_assessments(validation_file_dir):
    """ count correct and wrong words in test files based on bag of words per line

    validation_file_dir: directory with the Transkribus xml files, named <number>.xml
    returns: dict with the counts of correctly guessed words, dict with the counts of
             wrongly guessed words, list with the ids of the files (file number minus one)
    The correct words are taken from the global variable deceased_names.
    """
    correct_test_words, wrong_test_words, test_file_ids = {}, {}, []
    for file_name in sorted(os.listdir(validation_file_dir)):
        file_id = int(regex.sub(".xml$", "", file_name)) - 1
        test_file_ids.append(file_id)
        root = ET.parse(os.path.join(validation_file_dir, file_name)).getroot()
        unicode_tags = root.findall(".//{*}TextLine/{*}TextEquiv/{*}Unicode")
        for line_id, unicode_tag in enumerate(unicode_tags):
            # an empty Unicode element has no text (None)
            name_line = unicode_tag.text if isinstance(unicode_tag.text, str) else ""
            guessed_words = get_word_dict(name_line.lower())
            correct_words = get_word_dict(" ".join(deceased_names[file_id][line_id]).lower())
            for guessed_word, guessed_count in guessed_words.items():
                # a guessed word is correct as often as it occurs in the correct line,
                # all other occurrences of the word are wrong
                matched_count = min(guessed_count, correct_words.get(guessed_word, 0))
                if matched_count > 0:
                    add_word_to_freq_dict(correct_test_words, guessed_word, matched_count)
                if guessed_count > matched_count:
                    add_word_to_freq_dict(wrong_test_words, guessed_word, guessed_count - matched_count)
    return correct_test_words, wrong_test_words, test_file_ids

In [None]:
def count_assessments(validation_file_dir, deceased_names, known_names, cutoff=0):
    """ check how often correct words appear in known words and not; do the same for wrong words

    A correct word is assessed correctly when it is a known word, a wrong word is
    assessed correctly when it is not a known word. The argument deceased_names is not used here.
    returns: counts of correct words assessed correctly, correct words assessed wrongly,
             wrong words assessed correctly and wrong words assessed wrongly
    """
    correct_test_words, wrong_test_words, _ = get_test_word_assessments(validation_file_dir)
    known_words = get_known_names_words(known_names, cutoff)
    correct_known = sum(count for word, count in correct_test_words.items() if word in known_words)
    correct_unknown = sum(count for word, count in correct_test_words.items() if word not in known_words)
    wrong_known = sum(count for word, count in wrong_test_words.items() if word in known_words)
    wrong_unknown = sum(count for word, count in wrong_test_words.items() if word not in known_words)
    return correct_known, correct_unknown, wrong_unknown, wrong_known

In [None]:
def evaluate_cutoff_values(max_cutoff):
    """ evaluate different vocabulary frequency cutoff values

    The cutoff grows from 0 in steps of 1 below 20, steps of 10 below 300 and steps of
    100 from there on, until it exceeds max_cutoff; one line of scores is printed per cutoff.
    returns: list of accuracies on correct words and list of accuracies on wrong words
             (one value per cutoff), followed by the four counts for the last cutoff:
             correct words assessed as correct, correct words assessed as wrong,
             wrong words assessed as correct and wrong words assessed as wrong
    """
    correct_accuracies, wrong_accuracies = [], []
    cutoff = 0
    while cutoff <= max_cutoff:
        assessment_counts = count_assessments(VALIDATION_FILE_DIR, 
                                              deceased_names, 
                                              known_names,
                                              cutoff=cutoff)
        correct_assessed_correctly, correct_assessed_wrongly = assessment_counts[0], assessment_counts[1]
        wrong_assessed_correctly, wrong_assessed_wrongly = assessment_counts[2], assessment_counts[3]
        correct_total = correct_assessed_correctly + correct_assessed_wrongly
        wrong_total = wrong_assessed_correctly + wrong_assessed_wrongly
        correct_accuracy = round(correct_assessed_correctly / correct_total, 3)
        wrong_accuracy = round(wrong_assessed_correctly / wrong_total, 3)
        # the counts are derived from the rounded accuracies
        correct_assessed_as_correct = round(correct_accuracy * correct_total)
        correct_assessed_as_wrong = round((1 - correct_accuracy) * correct_total)
        wrong_assessed_as_correct = round((1 - wrong_accuracy) * wrong_total)
        wrong_assessed_as_wrong = round(wrong_accuracy * wrong_total)
        percentage_false_positives = wrong_assessed_as_correct / ( wrong_assessed_as_correct + correct_assessed_as_correct )
        percentage_missed = correct_assessed_as_wrong / ( correct_assessed_as_wrong + correct_assessed_as_correct )
        print(f"cutoff: {cutoff}; correct: accuracy: {correct_accuracy}, total: {correct_total}; wrong: accuracy: {wrong_accuracy}, total: {wrong_total};",
              f"percentage false positives: {round(100 * percentage_false_positives,1)}%;", 
              f"percentage missed: {round(100 * percentage_missed, 1)}%")
        correct_accuracies.append(correct_accuracy)
        wrong_accuracies.append(wrong_accuracy)
        cutoff += 1 if cutoff < 20 else 10 if cutoff < 300 else 100
    return correct_accuracies, wrong_accuracies, correct_assessed_as_correct, correct_assessed_as_wrong, wrong_assessed_as_correct, wrong_assessed_as_wrong

In [None]:
def plot_accuracies(correct_accuracies, wrong_accuracies):
    """ plot wrong_accuracies against correct_accuracies, with both axes running from 0 to 1 """
    _, ax = plt.subplots(figsize=(5.5, 5))
    ax.set(title="correct_accuracies vs wrong_accuracies",
           xlabel="correct_accuracies",
           ylabel="wrong_accuracies")
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    ax.plot(correct_accuracies, wrong_accuracies)

In [None]:
# evaluate the cutoff values from 0 up to 1000; the four counts are those of the last cutoff evaluated
(correct_accuracies, 
 wrong_accuracies, 
 correct_assessed_as_correct, 
 correct_assessed_as_wrong, 
 wrong_assessed_as_correct, 
 wrong_assessed_as_wrong) = evaluate_cutoff_values(1000)

In [None]:
# plot the accuracies on wrong words against the accuracies on correct words, one point per cutoff
plot_accuracies(correct_accuracies, wrong_accuracies)

In [None]:
# table with the assessment counts of the last evaluated cutoff: one row for the correct words
# and one row for the wrong words, with the row totals in the final column
pd.DataFrame([[correct_assessed_as_correct, correct_assessed_as_wrong, correct_assessed_as_correct + correct_assessed_as_wrong], 
              [wrong_assessed_as_correct, wrong_assessed_as_wrong, wrong_assessed_as_correct + wrong_assessed_as_wrong]], 
             columns=["assessed as correct", "assessed as wrong", "total"], index=["correct", "wrong"])

## 99. Tests

In [None]:
import unittest

In [None]:
class TestNotebook(unittest.TestCase):
    """ unit tests for functions of this notebook """

    def test_split_line_in_names(self):
        # the guessed names get the word counts of the correct names: 3, 1 and 2
        guessed_names = split_line_in_names("Jan Piet Klaas Marie Jozef Benjamin", ["A B C", "D", "E F"])
        self.assertEqual(guessed_names, ["Jan Piet Klaas", "Marie", "Jozef Benjamin"])

    def test_get_word_dict(self):
        # the case of the words is kept and repeated words are counted
        word_dict = get_word_dict("Jan Piet Klaas Jan")
        self.assertEqual(word_dict, {"Jan": 2, "Piet": 1, "Klaas": 1})

    def test_move_infix_tokens(self):
        # the infix token after the comma moves to the position before the final name part
        known_names = pd.DataFrame(["Jose Costa, da"], columns=["Namen"])
        fixed_names = move_infix_tokens(known_names)
        self.assertEqual(fixed_names["Namen"][0], "Jose da Costa")

In [None]:
# run the tests inside the notebook: argv=[''] replaces the command line arguments of the
# notebook kernel and exit=False keeps unittest from calling sys.exit() when it is done
unittest.main(argv=[''], verbosity=2, exit=False)