# Get names from scans

Extract the parts of the scans of the Curacao Death Registers that contain the name of the deceased. The positions of these parts are based on annotations made by humans.

In [None]:
import pandas as pd
import regex
import sys
from PIL import Image
import xml.etree.ElementTree as ET

## 1. Read logfile data

In [None]:
def read_logfile_data(logfile_path="../website/private/hdsc/etc/logfile"):
    """Read the logfile into a DataFrame.

    The file is parsed as comma-separated values without a header row, so
    the resulting columns are labelled 0, 1, 2, ...

    Args:
        logfile_path: Location of the logfile. Defaults to the path used by
            this notebook, so existing calls without arguments keep working.

    Returns:
        A pandas DataFrame with one row per logfile line.
    """
    return pd.read_csv(logfile_path, header=None)

In [None]:
def convert_logfile_data_to_scan_data(logfile_data):
    """Index the logfile rows by coordinates file name.

    Only the first four fields of each row are used: coordinates file name,
    status, text line id and coords id.  When a file name occurs in several
    rows, the last row wins.

    Returns:
        dict mapping file name -> [row index, status, text line id, coords id].
    """
    by_file_name = {}
    for row_label, record in logfile_data.iterrows():
        file_name, row_status, line_id, box_id = record[:4]
        by_file_name[file_name] = [row_label, row_status, line_id, box_id]
    return by_file_name

In [None]:
# Load the logfile into a DataFrame; the bare name on the last line makes the
# notebook display the table.
logfile_data = read_logfile_data()
logfile_data

In [None]:
# Index the logfile rows by coordinates file name; the functions below read
# this module-level dictionary.
scan_data = convert_logfile_data_to_scan_data(logfile_data)

## 2. Find best coordinates for guess of deceased name position

In [None]:
def get_coordinates_from_line(line):
    """Parse a whitespace-separated list of "x,y" pairs.

    Example: "1,2 30,40" -> [(1, 2), (30, 40)].

    Returns:
        A list of (x, y) integer tuples, in the order they appear in `line`.
    """
    tokens = [token.split(",") for token in line.split()]
    points = []
    for x_text, y_text in tokens:
        points.append((int(x_text), int(y_text)))
    return points

In [None]:
def find_top_left(polygon):
    """Return the smallest y and the smallest x of a text line's first outline.

    `polygon` is a list of point lists; only its first element is inspected.
    Both values default to `sys.maxsize` when that outline has no points.

    Returns:
        (top_coordinate, left_coordinate), i.e. (min y, min x).
    """
    outline = list(polygon[0])
    # Seeding with sys.maxsize reproduces the "nothing smaller found" default.
    top = min([sys.maxsize, *(point[1] for point in outline)])
    left = min([sys.maxsize, *(point[0] for point in outline)])
    return top, left

In [None]:
def sort_polygons(polygons):
    """Order text-line polygons from top to bottom, then left to right.

    The sort key is the (top, left) pair returned by `find_top_left`; the
    sort is stable, so polygons with equal keys keep their input order.

    Returns:
        A new list containing the same polygon objects in sorted order.
    """
    return sorted(polygons, key=find_top_left)

In [None]:
def get_text_polygons(coordinates_file_name):
    """Collect the outlines of all text lines in an XML coordinates file.

    For every TextRegion element (any namespace) the direct TextLine children
    are gathered, each as a list with one point list per direct Coords child.
    The lines of a region are ordered with `sort_polygons` and appended to
    the result region by region, in document order.

    Returns:
        A list indexed as result[text_line][coords] -> list of (x, y) tuples.
    """
    page_root = ET.parse(coordinates_file_name).getroot()
    ordered_lines = []
    for region in page_root.findall(".//{*}TextRegion"):
        region_lines = [
            [
                get_coordinates_from_line(coords.attrib["points"])
                for coords in text_line.findall("./{*}Coords")
            ]
            for text_line in region.findall("./{*}TextLine")
        ]
        ordered_lines.extend(sort_polygons(region_lines))
    return ordered_lines

In [None]:
def polygon2rectangle(coordinates):
    """Return the bounding box of a list of (x, y) points.

    The minima start at `sys.maxsize` and the maxima at 0, so an empty input
    yields (sys.maxsize, sys.maxsize, 0, 0) and the maxima are never negative.

    Returns:
        (x_min, y_min, x_max, y_max)
    """
    points = list(coordinates)
    xs = [x for x, _ in points]
    ys = [y for _, y in points]
    # The seed values come first so they win ties, as in a strict comparison.
    left = min([sys.maxsize, *xs])
    top = min([sys.maxsize, *ys])
    right = max([0, *xs])
    bottom = max([0, *ys])
    return left, top, right, bottom

In [None]:
def encloses_point(rectangle, point):
    """Tell whether `point` lies inside `rectangle`, borders included.

    Args:
        rectangle: (x_min, y_min, x_max, y_max).
        point: (x, y).
    """
    inside_horizontally = rectangle[0] <= point[0] <= rectangle[2]
    return inside_horizontally and rectangle[1] <= point[1] <= rectangle[3]

In [None]:
def get_best_polygon_for_y(polygons, y):
    """Find the outline whose bounding-box top edge is closest to `y`.

    Args:
        polygons: nested list indexed as polygons[text_line][coords].
        y: target vertical coordinate.

    Returns:
        (text_line_id, coords_id) of the closest outline; the first one found
        wins ties.  (-1, -1) when no outline is closer than `sys.maxsize`,
        e.g. when `polygons` is empty.
    """
    closest = (sys.maxsize, -1, -1)
    for line_index, line_outlines in enumerate(polygons):
        for box_index, outline in enumerate(line_outlines):
            top_edge = polygon2rectangle(outline)[1]
            gap = abs(y - top_edge)
            if gap < closest[0]:
                closest = (gap, line_index, box_index)
    return closest[1], closest[2]

In [None]:
def count_point_name_hits(best_point_x, best_point_y, best_line_y):
    """Count the scans for which the given defaults select the stored outline.

    Reads the module-level `scan_data` and considers only entries whose row
    index is at least 600 and whose status is "save".  A scan counts as a hit
    when the bounding box of its stored outline contains the point
    (best_point_x, best_point_y), or otherwise when the outline whose top
    edge is nearest to `best_line_y` is that same outline.

    Every call re-parses the coordinates file of each considered scan.

    Returns:
        The number of hits.
    """
    candidate_point = (best_point_x, best_point_y)
    matches = 0
    for file_name, (row_number, status, line_id, box_id) in scan_data.items():
        if not (row_number >= 600 and status == "save"):
            continue
        page_polygons = get_text_polygons("../website/private/hdsc/data/page/" +
                                          file_name)
        stored_box = polygon2rectangle(page_polygons[line_id][box_id])
        if encloses_point(stored_box, candidate_point):
            matches += 1
        # Fall back to the line-based guess only when the point misses.
        elif get_best_polygon_for_y(page_polygons, best_line_y) == (line_id, box_id):
            matches += 1
    return matches

In [None]:
def find_best_default_point(best_point_x, best_point_y, best_line_y, incr=10):
    """Hill-climb the default point to maximise `count_point_name_hits`.

    Starting from (best_point_x, best_point_y), the eight neighbours at
    distance `incr` are probed in a fixed order.  The search moves to the
    first neighbour that yields strictly more hits and starts over; it stops
    when no neighbour improves the count.  Each round prints the current
    point, line y, hit count and step size.

    Args:
        best_point_x, best_point_y: starting point.
        best_line_y: line y passed through unchanged to the hit counter.
        incr: step size of the search.

    Returns:
        (best_point_x, best_point_y) of the final point.
    """
    # Probe order: +x, +x+y, +y, -x+y, -x, -x-y, -y, +x-y.
    neighbour_offsets = (
        (incr, 0),
        (incr, incr),
        (0, incr),
        (-incr, incr),
        (-incr, 0),
        (-incr, -incr),
        (0, -incr),
        (incr, -incr),
    )
    improved = True
    while improved:
        improved = False
        hit_counts = count_point_name_hits(best_point_x, best_point_y, best_line_y)
        print(best_point_x, best_point_y, "#", best_line_y, hit_counts, incr)
        for offset_x, offset_y in neighbour_offsets:
            candidate_x = best_point_x + offset_x
            candidate_y = best_point_y + offset_y
            if count_point_name_hits(candidate_x, candidate_y, best_line_y) > hit_counts:
                best_point_x, best_point_y = candidate_x, candidate_y
                improved = True
                break
    return best_point_x, best_point_y

In [None]:
def find_best_line_y_default(best_point_x, best_point_y, best_line_y, incr=10):
    """Hill-climb the default line y to maximise `count_point_name_hits`.

    Starting from `best_line_y`, the value is moved by `incr` upwards or
    downwards as long as the move yields strictly more hits; the increasing
    direction is tried first.  Each round prints the current line y, point,
    hit count and step size.

    Args:
        best_point_x, best_point_y: point passed through unchanged to the
            hit counter.
        best_line_y: starting value of the line y.
        incr: step size of the search.

    Returns:
        The line y at which neither neighbour improves the hit count.
    """
    while True:
        hit_counts = count_point_name_hits(best_point_x, best_point_y, best_line_y)
        print(best_line_y, "#", best_point_x, best_point_y, hit_counts, incr)
        if count_point_name_hits(best_point_x, best_point_y, best_line_y + incr) > hit_counts:
            best_line_y += incr
            continue
        # Probe the smaller value before stepping to it.  Testing
        # `best_line_y + incr` here a second time could never succeed, so the
        # search would never move in the decreasing direction.
        if count_point_name_hits(best_point_x, best_point_y, best_line_y - incr) > hit_counts:
            best_line_y -= incr
            continue
        break
    return best_line_y

In [None]:
# Starting guesses for the searches: a point (x, y) and a line y value.
# NOTE(review): presumably pixel coordinates in the scans -- confirm.
X_DEFAULT = 693
Y_DEFAULT = 471
LINE_Y_DEFAULT = 509

# Refine the point with successively smaller step sizes (10, 5, 2, 1); each
# run starts from the result of the previous one while the line y stays fixed.
best_point_x, best_point_y = find_best_default_point(X_DEFAULT, Y_DEFAULT, LINE_Y_DEFAULT, incr=10)
best_point_x, best_point_y = find_best_default_point(best_point_x, best_point_y, LINE_Y_DEFAULT, incr=5)
best_point_x, best_point_y = find_best_default_point(best_point_x, best_point_y, LINE_Y_DEFAULT, incr=2)
best_point_x, best_point_y = find_best_default_point(best_point_x, best_point_y, LINE_Y_DEFAULT, incr=1)

In [None]:
# With the point fixed at the result of the previous cell, refine the line y
# with successively smaller step sizes (10, 5, 2, 1), starting from
# LINE_Y_DEFAULT and continuing from each run's result.
best_line_y = find_best_line_y_default(best_point_x, best_point_y, LINE_Y_DEFAULT, incr=10)
best_line_y = find_best_line_y_default(best_point_x, best_point_y, best_line_y, incr=5)
best_line_y = find_best_line_y_default(best_point_x, best_point_y, best_line_y, incr=2)
best_line_y = find_best_line_y_default(best_point_x, best_point_y, best_line_y, incr=1)

## 3. Compute areas of identified deceased name frames

In [None]:
def compute_rectangle_area(rectangle):
    """Return width times height of an (x_min, y_min, x_max, y_max) rectangle."""
    left, top, right, bottom = rectangle
    width = right - left
    height = bottom - top
    return width * height

In [None]:
def compute_areas():
    """Summarise the bounding-box areas of the stored outlines.

    Reads the module-level `scan_data` and considers only entries whose row
    index is at least 600 and whose status is "save".  For each of them the
    coordinates file is parsed and the area of the bounding box of the stored
    outline is computed.

    Returns:
        (smallest area, largest area, mean area truncated to an int).

    Raises:
        ZeroDivisionError: when no entry qualifies.
    """
    areas = []
    for file_name, (row_number, status, line_id, box_id) in scan_data.items():
        if row_number >= 600 and status == "save":
            page_polygons = get_text_polygons("../website/private/hdsc/data/page/" +
                                              file_name)
            bounding_box = polygon2rectangle(page_polygons[line_id][box_id])
            areas.append(compute_rectangle_area(bounding_box))
    # The seeds mirror the initial values of a running minimum and maximum.
    smallest = min([sys.maxsize, *areas])
    largest = max([0, *areas])
    return smallest, largest, int(sum(areas) / len(areas))

In [None]:
# Display the smallest, largest and truncated mean bounding-box area.
compute_areas()