# Get names from scans

Extract the parts of the scans of the Curacao Death Registers which contain the name of the deceased. The positions of the names are based on manual annotations made by humans.

In [None]:
import numpy
import os
import pandas as pd
import random
import regex
import sys
from PIL import Image, ImageDraw
import xml.etree.ElementTree as ET
from IPython.display import clear_output

In [None]:
def squeal(text=None):
    """ clear the output of the current cell (deferred until new output arrives) and
        print text, unless text is None """
    clear_output(wait=True)
    if text is None:
        return
    print(text)

## 1. Read logfile data

In [None]:
def read_logfile_data():
    """ load the logfile with the manually checked locations of deceased names in scans;
        the file is read as csv without a header row, so columns are numbered from 0 """
    logfile_path = "../website/private/hdsc/etc/logfile"
    return pd.read_csv(logfile_path, header=None)

In [None]:
def convert_logfile_data_to_scan_data(logfile_data):
    """ reduce the logfile to one annotation per coordinates file: later rows overwrite
        earlier rows for the same file; each value is
        [row index, status, text_line_id, coords_id, ip_address] """
    latest_annotations = {}
    for row_number, log_row in logfile_data.iterrows():
        # every row must have exactly seven fields; name and date are not kept
        (file_name, status, text_line_id, coords_id,
         _deceased_name, ip_address, _date) = log_row
        latest_annotations[file_name] = [row_number, status, text_line_id, coords_id, ip_address]
    return latest_annotations

In [None]:
# load all logged annotations; the bare name on the last line shows the table as cell output
logfile_data = read_logfile_data()
logfile_data

In [None]:
# keep only the last logged annotation per coordinates file
scan_data = convert_logfile_data_to_scan_data(logfile_data)

## 2. Find best coordinates for guess of deceased name position

In [None]:
def get_coordinates_from_line(line):
    """ convert the points attribute of a Coords tag in Transkribus xml
        (format: '123,456 789,012 ...') to a list of (x, y) integer tuples """
    points = []
    for pair in line.split():
        x_text, y_text = pair.split(",")
        points.append((int(x_text), int(y_text)))
    return points

In [None]:
def find_top_left(polygon):
    """ return (smallest y, smallest x) over the points of the first element of polygon;
        both values are sys.maxsize when that element has no points
        (alternative: use polygon2rectangle) """
    first_outline = polygon[0]
    # sys.maxsize is the starting value, so it is also the result for an empty outline
    top = min([sys.maxsize] + [point[1] for point in first_outline])
    left = min([sys.maxsize] + [point[0] for point in first_outline])
    return top, left

In [None]:
def sort_polygons(polygons):
    """ sort polygons based on top left point: first by vertical position, then by horizontal position """
    extended_polygons = []
    for polygon in polygons:
        top_coordinate, left_coordinate = find_top_left(polygon)
        extended_polygons.append([top_coordinate, left_coordinate, polygon])
    return [ extended_polygon[2] 
             for extended_polygon in sorted(extended_polygons, 
                                            key=lambda ep: (ep[0], ep[1])) ]

In [None]:
# logfile row index from which the text lines of a region are sorted by position
INDEX_START_SORTING = 574
# logfile row index from which only the text region with the most lines is kept
INDEX_REMOVE_MARIGINALIA = 2307

def get_text_polygons(coordinates_file_name, index):
    """ read polygons from Transkribus file (universal version); the result is a list with
        one entry per text line, each entry a list of point lists (one per Coords tag);
        index (logfile row number) selects the variant:
        below INDEX_START_SORTING: all regions, lines in file order;
        below INDEX_REMOVE_MARIGINALIA: all regions, lines sorted per region;
        otherwise: only the sorted lines of the region with the most text lines """
    page_root = ET.parse(coordinates_file_name).getroot()
    selected_lines = []
    for region in page_root.findall(".//{*}TextRegion"):
        region_lines = [
            [ get_coordinates_from_line(coords.attrib["points"])
              for coords in line.findall("./{*}Coords") ]
            for line in region.findall("./{*}TextLine")
        ]
        if index < INDEX_START_SORTING:
            selected_lines.extend(region_lines)
        elif index < INDEX_REMOVE_MARIGINALIA:
            selected_lines.extend(sort_polygons(region_lines))
        elif len(region_lines) > len(selected_lines):
            # a region only replaces the current selection when it has strictly more lines
            selected_lines = sort_polygons(region_lines)
    return selected_lines

In [None]:
def get_text_polygons_new(coordinates_file_name):
    """ read polygons from Transkribus file, ignoring marginalia: only the sorted text
        lines of the text region with the most lines are returned (the first such region
        wins when several have the same number of lines) """
    page_root = ET.parse(coordinates_file_name).getroot()
    largest_region_lines = []
    for region in page_root.findall(".//{*}TextRegion"):
        region_lines = [
            [ get_coordinates_from_line(coords.attrib["points"])
              for coords in line.findall("./{*}Coords") ]
            for line in region.findall("./{*}TextLine")
        ]
        if len(region_lines) > len(largest_region_lines):
            largest_region_lines = sort_polygons(region_lines)
    return largest_region_lines

In [None]:
def get_text_polygons_old(coordinates_file_name):
    """ read polygons from Transkribus file, including marginalia: the text lines of
        every text region are sorted per region and appended in region order """
    page_root = ET.parse(coordinates_file_name).getroot()
    all_lines = []
    for region in page_root.findall(".//{*}TextRegion"):
        region_lines = [
            [ get_coordinates_from_line(coords.attrib["points"])
              for coords in line.findall("./{*}Coords") ]
            for line in region.findall("./{*}TextLine")
        ]
        all_lines.extend(sort_polygons(region_lines))
    return all_lines

In [None]:
def polygon2rectangle(coordinates):
    """ find the bounding rectangle (x_min, y_min, x_max, y_max) of a list of (x, y) points;
        the minima start at sys.maxsize and the maxima at 0, so an empty list gives
        (sys.maxsize, sys.maxsize, 0, 0) """
    points = list(coordinates)
    x_values = [x for x, _ in points]
    y_values = [y for _, y in points]
    return (min([sys.maxsize] + x_values),
            min([sys.maxsize] + y_values),
            max([0] + x_values),
            max([0] + y_values))

In [None]:
def encloses_point(rectangle, point):
    """ check if point (x, y) lies inside rectangle (x_min, y_min, x_max, y_max);
        points on the border count as inside """
    x, y = point[0], point[1]
    return rectangle[0] <= x <= rectangle[2] and rectangle[1] <= y <= rectangle[3]

In [None]:
def get_best_polygon_for_y(polygons, y):
    """ find the polygon whose top (smallest y of its bounding rectangle) is closest to the
        horizontal line y; returns (text_line_id, coords_id) of the first closest polygon,
        or (-1, -1) when there are no polygons """
    closest = (sys.maxsize, -1, -1)
    for line_number, line_outlines in enumerate(polygons):
        for outline_number, outline in enumerate(line_outlines):
            top = polygon2rectangle(outline)[1]
            offset = abs(y - top)
            # strict comparison: on equal distance the earlier polygon is kept
            if offset < closest[0]:
                closest = (offset, line_number, outline_number)
    return closest[1], closest[2]

In [None]:
def count_point_name_hits(best_point_x, best_point_y, best_line_y):
    """ evaluate values of best_point_x, best_point_y and best_line_y: count how often they
        predict the annotated deceased name polygon; reads the global scan_data and only
        uses entries with status "save" and logfile index of at least 600; an entry is a
        hit when the point lies in the bounding rectangle of the annotated polygon or,
        failing that, when the polygon closest to best_line_y is the annotated one """
    hits = 0
    guessed_point = (best_point_x, best_point_y)
    for file_name, annotation in scan_data.items():
        index, status, text_line_id, coords_id, ip_address = annotation
        if not (index >= 600 and status == "save"):
            continue
        polygons = get_text_polygons("../website/private/hdsc/data/page/" + file_name, index)
        name_box = polygon2rectangle(polygons[text_line_id][coords_id])
        if encloses_point(name_box, guessed_point):
            hits += 1
        elif get_best_polygon_for_y(polygons, best_line_y) == (text_line_id, coords_id):
            hits += 1
    return hits

In [None]:
def find_best_default_point(best_point_x, best_point_y, best_line_y, incr=10):
    """ find best guess for position inside deceased name frame: best_point_x, best_point_y;
        keep best_line_y constant

        hill climbing: starting from the given point, repeatedly move by incr to the first
        of the eight neighbouring points that gives strictly more hits (as counted by
        count_point_name_hits) and stop when no neighbour is better; the current state is
        printed before every round; returns the final (best_point_x, best_point_y) """
    # neighbours in the order in which they are tried: right, right-down, down, left-down,
    # left, left-up, up, right-up (y grows downwards in image coordinates)
    directions = ((1, 0), (1, 1), (0, 1), (-1, 1), (-1, 0), (-1, -1), (0, -1), (1, -1))
    hit_counts = count_point_name_hits(best_point_x, best_point_y, best_line_y)
    while True:
        print(best_point_x, best_point_y, "#", best_line_y, hit_counts, incr)
        for x_direction, y_direction in directions:
            candidate_x = best_point_x + x_direction * incr
            candidate_y = best_point_y + y_direction * incr
            candidate_hits = count_point_name_hits(candidate_x, candidate_y, best_line_y)
            if candidate_hits > hit_counts:
                # reuse the count of the accepted neighbour instead of recomputing it
                best_point_x, best_point_y = candidate_x, candidate_y
                hit_counts = candidate_hits
                break
        else:
            # no neighbour improves the hit count: local optimum reached
            break
    return best_point_x, best_point_y

In [None]:
def find_best_line_y_default(best_point_x, best_point_y, best_line_y, incr=10):
    """ find horizontal line closest to top of most deceased name boxes: best_line_y;
        keep best_point_x and best_point_y constant

        hill climbing: move best_line_y by +incr, or else by -incr, as long as that gives
        strictly more hits (as counted by count_point_name_hits); the current state is
        printed before every round; returns the final best_line_y """
    hit_counts = count_point_name_hits(best_point_x, best_point_y, best_line_y)
    while True:
        print(best_line_y, "#", best_point_x, best_point_y, hit_counts, incr)
        # try moving the line down first, then up
        for step in (incr, -incr):
            candidate_hits = count_point_name_hits(best_point_x, best_point_y, best_line_y + step)
            if candidate_hits > hit_counts:
                best_line_y += step
                hit_counts = candidate_hits
                break
        else:
            # neither direction improves the hit count: local optimum reached
            break
    return best_line_y

In [None]:
# starting values for the search: a point expected to lie inside the name frame and
# a horizontal line expected to be near the top of the name frame
X_DEFAULT = 693
Y_DEFAULT = 469
LINE_Y_DEFAULT = 510

# refine the point with decreasing step sizes; every run starts from the previous result
best_point_x, best_point_y = find_best_default_point(X_DEFAULT, Y_DEFAULT, LINE_Y_DEFAULT, incr=10)
best_point_x, best_point_y = find_best_default_point(best_point_x, best_point_y, LINE_Y_DEFAULT, incr=5)
best_point_x, best_point_y = find_best_default_point(best_point_x, best_point_y, LINE_Y_DEFAULT, incr=2)
best_point_x, best_point_y = find_best_default_point(best_point_x, best_point_y, LINE_Y_DEFAULT, incr=1)

In [None]:
# refine the horizontal line with decreasing step sizes, using the point found in the
# previous cell; every run starts from the previous result
best_line_y = find_best_line_y_default(best_point_x, best_point_y, LINE_Y_DEFAULT, incr=10)
best_line_y = find_best_line_y_default(best_point_x, best_point_y, best_line_y, incr=5)
best_line_y = find_best_line_y_default(best_point_x, best_point_y, best_line_y, incr=2)
best_line_y = find_best_line_y_default(best_point_x, best_point_y, best_line_y, incr=1)

## 3. Compute areas of identified deceased name frames

In [None]:
def compute_rectangle_area(rectangle):
    """ compute the area of a rectangle given as (x_min, y_min, x_max, y_max) """
    left, top, right, bottom = rectangle
    width = right - left
    height = bottom - top
    return width * height

In [None]:
def compute_areas(scan_data):
    """ compute areas of the bounding rectangles of the annotated polygons (status "save")
        and return (minimum, maximum, average rounded down to int);
        raises ZeroDivisionError when scan_data has no entries with status "save" """
    smallest, largest, frame_count, area_sum = sys.maxsize, 0, 0, 0
    for file_name, annotation in scan_data.items():
        index, status, text_line_id, coords_id, ip_address = annotation
        if status != "save":
            continue
        text_lines = get_text_polygons("../website/private/hdsc/data/page/" + file_name, index)
        name_box = polygon2rectangle(text_lines[text_line_id][coords_id])
        area = compute_rectangle_area(name_box)
        smallest = min(smallest, area)
        largest = max(largest, area)
        frame_count += 1
        area_sum += area
    return smallest, largest, int(area_sum/frame_count)

In [None]:
# show (minimum, maximum, average) area of the annotated name frames
compute_areas(scan_data)

## 4. Compute aspect ratios of identified deceased name frames

In [None]:
def compute_rectangle_aspect_ratio(rectangle):
    """ compute the aspect ratio (width divided by height) of a rectangle given as
        (x_min, y_min, x_max, y_max); raises ZeroDivisionError for height zero """
    left, top, right, bottom = rectangle
    width = right - left
    height = bottom - top
    return width / height

In [None]:
def compute_aspect_ratios(scan_data):
    """ compute aspect ratios of the bounding rectangles of the annotated polygons
        (status "save") and return (minimum, maximum, average); note that the average is
        truncated to int; raises ZeroDivisionError when scan_data has no entries with
        status "save" """
    smallest, largest, frame_count, ratio_sum = sys.maxsize, 0, 0, 0
    for file_name, annotation in scan_data.items():
        index, status, text_line_id, coords_id, ip_address = annotation
        if status != "save":
            continue
        text_lines = get_text_polygons("../website/private/hdsc/data/page/" + file_name, index)
        name_box = polygon2rectangle(text_lines[text_line_id][coords_id])
        ratio = compute_rectangle_aspect_ratio(name_box)
        smallest = min(smallest, ratio)
        largest = max(largest, ratio)
        frame_count += 1
        ratio_sum += ratio
    return smallest, largest, int(ratio_sum/frame_count)

In [None]:
# show (minimum, maximum, average) aspect ratio of the annotated name frames
compute_aspect_ratios(scan_data)

## 5. Cut out polygons from scans containing deceased name

In [None]:
# alpha value for pixels outside the name polygon
# NOTE(review): not referenced in the visible code, which passes a literal 0 instead
COVERED_BACKGROUND = 0
# alpha value for pixels inside the name polygon (used by mask_polygon)
TRANSPARENT_BACKGROUND = 255
# NOTE(review): presumably an RGB background color; not referenced in the visible code
FILL_COLOR = (198, 178, 125)

In [None]:
# code based on https://stackoverflow.com/questions/22588074/polygon-crop-clip-using-python-pil

def mask_polygon(image, polygon, covered_background):
    """ return an RGBA copy of image in which the alpha channel marks the polygon
        (= deceased name): TRANSPARENT_BACKGROUND inside the polygon, 0 on its outline
        and covered_background everywhere else """
    rgba_pixels = numpy.asarray(image.convert("RGBA"))
    height, width = rgba_pixels.shape[0], rgba_pixels.shape[1]
    # draw the polygon on a single-channel image of the same size to get the alpha layer
    alpha_layer = Image.new('P', (width, height), covered_background)
    ImageDraw.Draw(alpha_layer).polygon(polygon, outline=0, fill=TRANSPARENT_BACKGROUND)
    # copy the color channels and replace the alpha channel by the drawn layer
    result_pixels = numpy.empty(rgba_pixels.shape, dtype='uint8')
    result_pixels[:,:,:3] = rgba_pixels[:,:,:3]
    result_pixels[:,:,3] = numpy.array(alpha_layer)
    return Image.fromarray(result_pixels, "RGBA")

In [None]:
def make_image_file_name(coordinates_file_name):
    """ change Transkribus file name to corresponding image file name and add location of
        directory: a trailing ".xml" is replaced by ".JPG"; names without that extension
        are kept unchanged """
    xml_extension = ".xml"
    # exact suffix test: a regex with an unescaped dot would also match names like "a_xml"
    if coordinates_file_name.endswith(xml_extension):
        image_name = coordinates_file_name[:-len(xml_extension)] + ".JPG"
    else:
        image_name = coordinates_file_name
    return "../website/private/hdsc/data/" + image_name

In [None]:
def most_frequent_pixel_value(image):
    """ determine the most frequent color in image after rounding every channel down to a
        multiple of 10; pixels with a fourth (alpha) value of 0 are skipped; returns the
        color as (red, green, blue); of equally frequent colors the first seen wins """
    color_counts = {}
    for pixel in image.getdata():
        if len(pixel) > 3 and pixel[3] == 0:
            continue
        rounded_color = (int(pixel[0]*0.1) * 10,
                         int(pixel[1]*0.1) * 10,
                         int(pixel[2]*0.1) * 10)
        color_counts[rounded_color] = color_counts.get(rounded_color, 0) + 1
    # stable sort: colors with the same count stay in order of first appearance
    ranked_colors = sorted(color_counts.items(), key=lambda item: item[1], reverse=True)
    red, green, blue = ranked_colors[0][0]
    return red, green, blue

In [None]:
def expand_rectangle(rectangle, border_width):
    """ add a margin of size border_width on all four sides of rectangle
        (x_min, y_min, x_max, y_max) """
    left = rectangle[0] - border_width
    top = rectangle[1] - border_width
    right = rectangle[2] + border_width
    bottom = rectangle[3] + border_width
    return (left, top, right, bottom)

In [None]:
def randomize_pixel_value(pixel_value, spread=20):
    """ add an independent random integer between 0 and spread (inclusive) to each of the
        first three channels of pixel_value; returns a tuple of three values """
    red = pixel_value[0] + random.randint(0, spread)
    green = pixel_value[1] + random.randint(0, spread)
    blue = pixel_value[2] + random.randint(0, spread)
    return (red, green, blue)

In [None]:
def fill_background(masked_image):
    """ replace every pixel with alpha 0 by a randomized variant of the most frequent
        color of the other pixels, drop the alpha channel and return the image in
        RGB mode; the pixel data of masked_image itself is overwritten """
    pixels = masked_image.getdata()
    background_color = most_frequent_pixel_value(masked_image)
    opaque_pixels = [ randomize_pixel_value(background_color) if pixel[3] == 0 else pixel[:3]
                      for pixel in pixels ]
    masked_image.putdata(opaque_pixels)
    return masked_image.convert("RGB")

In [None]:
def get_name_frames_from_scans(scan_data):
    """ extract name frames from scans and store images in directory images

        entries are processed in order of their logfile index; for every entry with status
        "save" the annotated polygon is cut out of the scan with a margin of 10 pixels,
        the area outside the polygon is filled with background color and the result is
        saved under the base name of the scan; the index of every entry is shown as
        progress; returns the last saved name frame, or None when nothing was saved """
    last_name_frame = None
    for coordinates_file_name in sorted(scan_data, key=lambda name: scan_data[name][0]):
        index, status, text_line_id, coords_id, ip_address = scan_data[coordinates_file_name]
        if status == "save":
            polygon = get_text_polygons("../website/private/hdsc/data/page/" +
                                        coordinates_file_name, index)[text_line_id][coords_id]
            rectangle = polygon2rectangle(polygon)
            image_file_name = make_image_file_name(coordinates_file_name)
            # the masked copy is independent of the scan, so the scan file can be closed
            with Image.open(image_file_name) as image:
                masked_image = mask_polygon(image, polygon, COVERED_BACKGROUND)
            last_name_frame = fill_background(masked_image.crop(expand_rectangle(rectangle, 10)))
            last_name_frame.save("images/" + os.path.basename(image_file_name))
        squeal(index)
    return last_name_frame

In [None]:
# write all name frames to directory images; the last frame is shown as cell output
get_name_frames_from_scans(scan_data)

## 6. Check image properties

In [None]:
def compute_property_values(scan_data, property_name="ratio"):
    """ compute a property of the bounding rectangle of the annotated polygon for all
        entries of scan_data with status "save"

        property_name is one of "ratio" (width / height), "area", "height" and "length"
        (= width); returns a dictionary mapping the coordinates file name to
        [logfile index, property value, ip_address]; exits with an error message when an
        entry needs to be processed and property_name is unknown """
    property_values = {}
    for coordinates_file_name in scan_data:
        index, status, text_line_id, coords_id, ip_address = scan_data[coordinates_file_name]
        if status != "save":
            continue
        polygon = get_text_polygons("../website/private/hdsc/data/page/" +
                                    coordinates_file_name, index)[text_line_id][coords_id]
        rectangle = polygon2rectangle(polygon)
        if property_name == "ratio":
            property_value = compute_rectangle_aspect_ratio(rectangle)
        elif property_name == "area":
            property_value = compute_rectangle_area(rectangle)
        elif property_name == "height":
            property_value = rectangle[3] - rectangle[1]
        elif property_name == "length":
            property_value = rectangle[2] - rectangle[0]
        else:
            sys.exit(f"compute_property_values: unknown property name: {property_name}")
        property_values[coordinates_file_name] = [ index, property_value, ip_address ]
    return property_values

In [None]:
def show_images_by_property_values(property_values, start, end):
    """ show selected images by property values from start to end

        property_values maps a coordinates file name to [index, property value, ip_address];
        the entries are sorted by property value and for the entries start up to (not
        including) end a description line is printed and the name frame image is shown """
    ranked_items = sorted(property_values.items(), key=lambda item: item[1][1])
    for coordinates_file_name, (index, property_value, ip_address) in ranked_items[start:end]:
        print(f"{index} # {round(property_value, 1)} # {ip_address} # {coordinates_file_name}:")
        # build the path the same way as it is built when the name frames are saved
        image_path = "images/" + os.path.basename(make_image_file_name(coordinates_file_name))
        with Image.open(image_path) as image:
            display(image)

In [None]:
def show_extreme_property_values(scan_data, property_name="ratio", n=5):
    """ show the n name frames with the smallest and the n name frames with the largest
        value of the frame property property_name """
    values = compute_property_values(scan_data, property_name)
    total = len(values)
    # first the low end of the sorted values, then the high end
    for start, end in ((0, n), (total - n, total)):
        show_images_by_property_values(values, start, end)

In [None]:
# show the five name frames with the smallest and the five with the largest aspect ratio
show_extreme_property_values(scan_data, property_name="ratio", n=5)

In [None]:
# show the five name frames with the smallest and the five with the largest area
show_extreme_property_values(scan_data, property_name="area", n=5)

In [None]:
# count how often each frame length (width of the bounding rectangle) occurs
pd.DataFrame([ data[1] for data in compute_property_values(scan_data, "length").values() ]).value_counts() 

## 7. Combine images

In [None]:
# width in pixels available for name frames on one line of a combined image
X_MAX_VALUE = 1600

def fits_in_gap(gaps, combined_image, image):
    """ find smallest gap at the end of a line where the name fits; return False otherwise

        gaps is a list of [x, y] positions at which the free space at the end of a line
        starts; a gap fits when the image (width image.size[0]) does not extend beyond
        X_MAX_VALUE; of the fitting gaps the one with the least space left after placing
        the image is chosen (the first one on ties); returns (x, y, position in gaps) of
        that gap; combined_image is not used """
    image_width = image.size[0]
    smallest_gap, smallest_i = sys.maxsize, -1
    for i, (x, y) in enumerate(gaps):
        # space left at the end of the line after the image has been placed in this gap
        remaining = X_MAX_VALUE - x - image_width
        if 0 <= remaining < smallest_gap:
            smallest_gap = remaining
            smallest_i = i
    if smallest_i >= 0:
        return gaps[smallest_i][0], gaps[smallest_i][1], smallest_i
    return False

In [None]:
def make_combined_image_file_name(file_counter):
    """ make the name of the images with combined names: the counter (int or str) is
        padded with zeros to at least two characters """
    return f"combined_images/{str(file_counter).zfill(2)}.jpg"

In [None]:
def initialize_combine_images(deceased_names, file_counter):
    """ start a new combined image: add an entry with one empty line to deceased_names
        (changed in place) and return (x, y, gaps, file_counter + 1, deceased_names,
        empty white image of 1600 by 1200 pixels) with x = 0, y = 0 and no gaps """
    deceased_names.append([[]])
    blank_page = Image.new( "RGB", (1600, 1200), (255, 255, 255))
    return 0, 0, [], file_counter + 1, deceased_names, blank_page

In [None]:
def combine_images(scan_data):
    """ combine as many as possible name frames in several images

        the name frames of all entries with status "save" are read from directory images
        in order of their logfile index and pasted on pages of 1600 by 1200 pixels in
        lines of 100 pixels high, 30 pixels apart and vertically centered in the line; a
        frame is put in the best fitting gap at the end of an earlier line if there is
        one, otherwise on the current line, a new line or a new page; full pages and the
        final page are saved with make_combined_image_file_name; the number of pages is
        printed and the final page is shown; frames that cannot be opened are skipped
        with a message; the deceased names (column 4 of the global logfile_data) are
        collected per page and line in a local list """
    # NOTE(review): deceased_names is filled but neither returned nor stored - confirm intent
    x, y, gaps, file_counter, deceased_names, combined_image = initialize_combine_images([], 0)
    for coordinates_file_name in sorted(scan_data, key=lambda name: scan_data[name][0]):
        index, status, text_line_id, coords_id, ip_address = scan_data[coordinates_file_name]
        if status != "save":
            continue
        # computed outside the try block so that it is always available for the message
        image_file_name = "images/" + os.path.basename(make_image_file_name(coordinates_file_name))
        try:
            image = Image.open(image_file_name)
        except OSError:
            print(f"problem processing file {image_file_name}")
            continue
        with image:
            gap = fits_in_gap(gaps, combined_image, image)
            if gap:
                # fill the free space at the end of an earlier line
                x_gap, y_gap, i_gap = gap
                combined_image.paste(image, (x_gap, y_gap + int((100 - image.size[1]) / 2)))
                deceased_names[-1][int(0.5 + y_gap/100)].append(logfile_data.iloc[index][4])
                gaps[i_gap] = [x_gap + image.size[0] + 30, y_gap]
                continue
            if x + image.size[0] > X_MAX_VALUE:
                if y < 1100:
                    # current line is full: remember its free space and start a new line
                    gaps.append([x, y])
                    x = 0
                    y += 100
                    deceased_names[-1].append([])
                else:
                    # page is full: save it and start a new page
                    combined_image.save(make_combined_image_file_name(str(file_counter)))
                    x, y, gaps, file_counter, deceased_names, combined_image = initialize_combine_images(deceased_names, file_counter)
            combined_image.paste(image, (x, y + int((100 - image.size[1]) / 2)))
            deceased_names[-1][-1].append(logfile_data.iloc[index][4])
            x += image.size[0] + 30
    if x > 0 or y > 0:
        combined_image.save(make_combined_image_file_name(str(file_counter)))
    print(file_counter)
    display(combined_image)

In [None]:
# build and save the combined images; prints the number of images and shows the last one
combine_images(scan_data)