# Process scans of death records of Curaçao

In [None]:
import os
import re
import pandas as pd

In [None]:
# Root directory of the scans; the checks below expect one "O.R. <year>"
# sub-directory per year inside it.
data_dir = "../../data/Overlijden"

In [None]:
def print_with_color(string, color_code=1):
    """Print `string` wrapped in an ANSI colour sequence, without a newline.

    `color_code` is the final digit of the ANSI foreground code "3<digit>"
    (the default 1 selects red). Callers add "\n" themselves when they want
    a line break.
    """
    colored = "\x1b[3{}m{}\x1b[m".format(color_code, string)
    print(colored, end="")

## 1. Check file names

In [None]:
def check_year_dir_names(data_dir):
    """Check every entry of `data_dir` and descend into valid year directories.

    An entry is reported (in colour) when it is not a directory or when its
    name is not of the form "O.R. <4 digits>". Entries that pass both checks
    are handed to check_district_dir_names() for the next level.
    """
    for year_dir_name in sorted(os.listdir(data_dir)):
        if not os.path.isdir(os.path.join(data_dir, year_dir_name)):
            print_with_color(f"\"{year_dir_name}\" is not a directory!\n")
        # Raw string: "\." and "\d" are regex escapes, not string escapes.
        elif not re.search(r"^O\.R\. \d\d\d\d$", year_dir_name):
            print_with_color(f"the name of \"{year_dir_name}\" is not shaped like a year directory name!\n")
        else:
            check_district_dir_names(data_dir, year_dir_name)

In [None]:
def check_district_dir_names(data_dir, year_dir_name):
    """Check the entries of one year directory and descend into valid districts.

    "log.txt" is skipped silently. Any other entry is reported (in colour)
    when it is not a directory or when its name matches none of the accepted
    district name shapes. Entries that pass are handed to
    check_scan_file_names().
    """
    # Raw strings: "\." and "\d" are regex escapes, not string escapes.
    district_patterns = (
        r"^O\.R\. \d\d\d\d Stad$",
        r"^O\.R\. \d\d\d\d Buiten \d\d?e distr$",
        # NOTE(review): unlike the two patterns above this one has no "$"
        # anchor, so any trailing text is accepted - confirm this is intended.
        r"^O\.R\. \d\d\d\d Buiten (Midden|Oost|West) \de distr",
    )
    year_dir = os.path.join(data_dir, year_dir_name)
    for district_dir_name in sorted(os.listdir(year_dir)):
        if district_dir_name == "log.txt":
            continue
        if not os.path.isdir(os.path.join(year_dir, district_dir_name)):
            print_with_color(f"\"{district_dir_name}\" is not a directory!\n")
        elif not any(re.search(pattern, district_dir_name) for pattern in district_patterns):
            print_with_color(f"the name of \"{district_dir_name}\" is not shaped like a district directory name! (in {year_dir_name})\n")
        else:
            check_scan_file_names(data_dir, year_dir_name, district_dir_name)

In [None]:
def check_scan_file_names(data_dir, year_dir_name, district_dir_name):
    """Check the entries of one district directory against the scan name shapes.

    An entry is reported (in colour) when it is not a regular file or when
    its name matches none of the accepted shapes, which all end in a
    three-digit number followed by ".JPG".
    """
    # Raw strings keep the regex escapes intact; the dot before "JPG" is
    # escaped so that only a literal "." is accepted there.
    scan_patterns = (
        r"^O\.R\. \d\d\d\d Stad \d\d\d\.JPG$",
        r"^O\.R\. \d\d\d\d Buiten \d\d?e distr\. \d\d\d\.JPG$",
        r"^O\.R\. \d\d\d\d Buiten (Midden|Oost|West) \de distr\. \d\d\d\.JPG$",
    )
    district_dir = os.path.join(data_dir, year_dir_name, district_dir_name)
    for scan_file_name in sorted(os.listdir(district_dir)):
        if not os.path.isfile(os.path.join(district_dir, scan_file_name)):
            print_with_color(f"\"{scan_file_name}\" is not a file!\n")
        elif not any(re.search(pattern, scan_file_name) for pattern in scan_patterns):
            print_with_color(f"the name of \"{scan_file_name}\" is not shaped like a scan file name!\n")

In [None]:
# Run the name checks on the whole tree: year directories, then district
# directories, then scan files.
check_year_dir_names(data_dir)

## 2. Check file sizes

Equal file sizes only hint at duplicate scans. Repeat this check with hashes of the file contents?

In [None]:
# Report scans within the same year that have identical file sizes, as a
# cheap hint that the same image may have been stored more than once.
for year_dir_name in sorted(os.listdir(data_dir)):
    year_dir = os.path.join(data_dir, year_dir_name)
    # Only proper year directories are inspected. Testing this before listing
    # the entry avoids calling os.listdir() on stray files in data_dir.
    if not re.search(r"^O\.R\. \d\d\d\d$", year_dir_name) or not os.path.isdir(year_dir):
        continue
    file_sizes = {}  # size in bytes -> space-separated names of the files seen with that size
    nbr_of_duplicates = 0
    for district_dir_name in sorted(os.listdir(year_dir)):
        district_dir = os.path.join(year_dir, district_dir_name)
        # Plain files next to the district directories (e.g. "log.txt")
        # cannot be listed and hold no scans.
        if not os.path.isdir(district_dir):
            continue
        for scan_file_name in sorted(os.listdir(district_dir)):
            size = os.path.getsize(os.path.join(district_dir, scan_file_name))
            if size not in file_sizes:
                file_sizes[size] = scan_file_name
            else:
                nbr_of_duplicates += 1
                print_with_color(f"  duplicate size {size} for {scan_file_name} {file_sizes[size]}\n")
                file_sizes[size] += " " + scan_file_name
    if nbr_of_duplicates > 0:
        print(year_dir_name, nbr_of_duplicates)

## 3. Count scans

In [None]:
def count_files(dir_name):
    """Return the number of entries in `dir_name` whose name ends in ".jpg".

    The extension is matched case-insensitively, so ".JPG" counts as well.
    """
    # Raw string so that "\." reaches the regex engine as an escaped dot.
    jpg_pattern = re.compile(r"\.jpg$", re.IGNORECASE)
    return sum(1 for file_name in os.listdir(dir_name) if jpg_pattern.search(file_name))

In [None]:
def get_year_from_dir_name(dir_name):
    """Return the four digits that end `dir_name` as an int.

    When the name does not end in four digits it is passed to int()
    unchanged, which raises ValueError unless the whole name is a number.
    """
    # Raw strings: "\d" and "\1" are regex syntax, not string escapes.
    return int(re.sub(r"^.*(\d{4})$", r"\1", dir_name))

In [None]:
def count_scans(dir_name):
    """Count the scans per year below `dir_name`.

    Only sub-directories named "O.R. <4 digits>" are considered. Within each
    of them, the scans of directories whose name contains "buiten" are added
    to the outside count, and those of the remaining directories whose name
    contains "stad" to the city count (both case-insensitive).

    Returns a list of (year, city_count, outside_count) tuples, in sorted
    order of the year directory names.
    """
    counts = []
    for year_dir in sorted(os.listdir(dir_name)):
        if not re.search(r"^O\.R\. \d{4}$", year_dir):
            continue
        outside_count = 0
        city_count = 0
        # Build paths from the `dir_name` argument, not from the notebook's
        # global `data_dir`, so the function works for any root directory.
        year_path = os.path.join(dir_name, year_dir)
        for scan_dir_name in os.listdir(year_path):
            scan_dir = os.path.join(year_path, scan_dir_name)
            if not os.path.isdir(scan_dir):
                continue
            # Classify on the directory's own name: a parent path that
            # happens to contain "stad" or "buiten" must not influence this.
            if re.search("buiten", scan_dir_name, re.IGNORECASE):
                outside_count += count_files(scan_dir)
            elif re.search("stad", scan_dir_name, re.IGNORECASE):
                city_count += count_files(scan_dir)
        counts.append((get_year_from_dir_name(year_dir), city_count, outside_count))
    return counts

In [None]:
def get_total(counts):
    """Return the sum of the second and third field over all rows of `counts`.

    For the rows built by count_scans() this is the total number of scans
    (city plus outside) over all years.
    """
    total = 0
    for row in counts:
        total += int(row[2]) + int(row[1])
    return total

In [None]:
# One (year, city_count, outside_count) tuple per year directory.
counts = count_scans(data_dir)

In [None]:
# Total number of scans over all years (city plus outside).
get_total(counts)

## 4. Check counts

In [None]:
def get_min_max_counts(counts):
    """Return the smallest and largest first field (the year) found in `counts`.

    Returns (None, None) when `counts` is empty.
    """
    if counts:
        all_years = [row[0] for row in counts]
        return min(all_years), max(all_years)
    return None, None

In [None]:
def check_and_fill_gaps(counts):
    """Warn about years missing from `counts` and add them with zero counts.

    Every year between the smallest and the largest year present must occur
    in `counts`; each missing one is reported (in colour) and added as
    (year, 0, 0). The list is modified in place, kept ordered by year, and
    also returned.
    """
    if not counts:
        # get_min_max_counts() returns (None, None) for an empty list, which
        # cannot be used as range() bounds; there are no gaps to fill anyway.
        return counts
    min_count, max_count = get_min_max_counts(counts)
    years = {data[0] for data in counts}
    for year in range(min_count, max_count + 1):
        if year not in years:
            print_with_color(f"warning: missing year: {year}\n")
            counts.append((year, 0, 0))
    # Filled-in years were appended at the end; restore chronological order
    # so that line plots over `counts` do not jump back in time.
    counts.sort(key=lambda data: data[0])
    return counts

In [None]:
def check_duplicates_counts(counts):
    """Warn (in colour) about every year that occurs more than once in `counts`.

    Each warning also shows the years seen so far and ends with a newline,
    so consecutive warnings are printed on separate lines.
    """
    seen = {}
    for data in counts:
        year = data[0]
        if year in seen:
            print_with_color(f"warning: duplicate year: {year} {seen}\n")
        seen[year] = True

In [None]:
def check_zero_counts(counts):
    """Warn (in colour) about years without scans for stad or for buiten.

    `counts` holds (year, stad_count, buiten_count) rows. Each warning ends
    with a newline so consecutive warnings are printed on separate lines.
    """
    for data in counts:
        if data[1] == 0:
            print_with_color(f"warning: no scans for stad in year {data[0]}\n")
        if data[2] == 0:
            print_with_color(f"warning: no scans for buiten in year {data[0]}\n")

In [None]:
def check_counts(counts):
    """Run all sanity checks on `counts` and return the gap-filled result.

    Duplicate years and zero counts are checked first, so the zero-count
    check runs before any placeholder rows for missing years are added.
    """
    check_duplicates_counts(counts)
    check_zero_counts(counts)
    return check_and_fill_gaps(counts)

In [None]:
# Print the warnings and add (year, 0, 0) rows for missing years.
counts = check_counts(counts)

## 5. Visualize counts

The year 1887 needs to be checked. The buiten districts are duplicated.

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Draw one line per series (stad first, then buiten) with the number of scans
# per year, and store the figure next to the notebook.
plt.title("scans per year")
for column, label in ((1, "stad"), (2, "buiten")):
    plt.plot([row[0] for row in counts], [row[column] for row in counts], label=label)
plt.legend()
plt.savefig("scans-per-year.png")
plt.show()

## 6. Compare scan counts with JESSY file

In [None]:
# Load the merged death-record table that the scan counts are compared with.
# low_memory=False lets pandas infer each column's type from the whole file
# instead of chunk by chunk.
data_df = pd.read_csv("../../data/Overlijden/x-misc/Overlijdensmerged.csv", low_memory=False)

In [None]:
# Number of records per value of the "Jaar" column, ordered by year; values
# that occur only once are dropped.
data_years = dict(data_df["Jaar"].value_counts())
data_years = {
    int(year): frequency
    for year, frequency in sorted(data_years.items(), key=lambda item: item[0])
    if frequency > 1
}

In [None]:
# Compare the number of scans per year with the number of records per year in
# the Jessy data, and plot their difference against a zero baseline.
scan_years = [x[0] for x in counts]
scan_totals = [x[1] + x[2] for x in counts]
# A scan year without an entry in data_years has no records to compare with;
# count it as 0 records instead of failing with a KeyError.
record_totals = [data_years.get(year, 0) for year in scan_years]
plt.title("scans per year")
plt.plot(scan_years, scan_totals, label="all scans")
plt.plot(list(data_years), list(data_years.values()), color="C3", label="Jessy data")
plt.plot(scan_years,
         [scans - records for scans, records in zip(scan_totals, record_totals)],
         color="C2", label="difference")
plt.plot(list(data_years), [0] * len(data_years), color="0")
plt.legend()
plt.savefig("scans-per-year-jessy.png")
plt.show()

In [None]:
# Rows of `counts` with more scans than Jessy records, mapped to the surplus.
# Years absent from data_years count as 0 records instead of raising KeyError.
{
    x: x[1] + x[2] - data_years.get(x[0], 0)
    for x in counts
    if x[1] + x[2] - data_years.get(x[0], 0) > 0
}