# CER tests

Experiments with computing the character error rate (CER), with the goal of evaluating parts of documents that were transcribed automatically with handwritten text recognition (HTR).

## 1. Initialization

In [None]:
from collections import Counter
import fastwer
import os
import regex
import sys
sys.path.append(os.getcwd() + '/..')
from scripts import read_transkribus_files

In [None]:
# Toy example: a reference sentence and a noisy recognition of it, used in the
# next cell to try out the CER computation.
sentence_gold = "John moved the table from the room to outside"
sentence_ocr = "John mowed zhe taole fom he broom fo ovtsioe"

In [None]:
# fastwer.score_sent expects the hypothesis (recognized text) first and the
# reference (gold text) second; the error rate is normalized by the reference,
# so the argument order matters.
fastwer.score_sent(sentence_ocr, sentence_gold, char_level=True)

In [None]:
# Directories passed to read_transkribus_files.read_files below: one for the
# gold-standard transcriptions and one for the HTR output of the same set.
data_dir_gold = "tmp/1586854/Validation_set/page"
data_dir_htr = "tmp/1749649/Validation_set_HTR_Curacao_bestModel/page"

In [None]:
# read_files returns three values per directory. The cells shown below only use
# the texts, which are looked up by file name (texts_htr[file_name]).
texts_gold, metadata_gold, textregions_gold  = read_transkribus_files.read_files(data_dir_gold)
texts_htr, metadata_htr, textregions_htr  = read_transkribus_files.read_files(data_dir_htr)

In [None]:
# Per-file CER of the HTR text against the gold text, plus an average over all
# files in which every file is weighted by the length of its longer text.
total_cer = 0
total_chars = 0
for file_name in sorted(texts_htr.keys()):
    cer = fastwer.score_sent(texts_htr[file_name], texts_gold[file_name], char_level=True)
    max_text_length = max(len(texts_htr[file_name]), len(texts_gold[file_name]))
    # A file without any characters has weight zero; skipping it keeps a
    # possibly non-finite score for two empty texts out of the total.
    # NOTE(review): what fastwer returns for an empty reference is not
    # visible here - confirm.
    if max_text_length > 0:
        total_chars += max_text_length
        total_cer += max_text_length * cer
    print(cer, file_name)
# Guard against a ZeroDivisionError when no files (or only empty files) were read.
if total_chars > 0:
    print(f"average cer: {total_cer/total_chars}")
else:
    print("average cer: n/a (no characters to evaluate)")

In [None]:
def find_best_line_match(line_htr, lines_gold, max_cer=100):
    """Find the gold line that is closest to an HTR line in terms of CER.

    Args:
        line_htr: the recognized line, used as hypothesis.
        lines_gold: sequence of reference lines to compare with.
        max_cer: exclusive upper bound on the CER of an acceptable match.
            Only candidates scoring strictly below this value are considered.
            The default of 100 is the threshold that was previously hard-coded.

    Returns:
        A tuple (best_index_gold, best_cer). When no candidate scores below
        max_cer (which includes an empty lines_gold) the result is
        (-1, max_cer). Callers should check for the -1 sentinel before
        indexing, because lines_gold[-1] silently selects the last line.
    """
    best_index_gold = -1
    best_cer = max_cer
    for index_gold, line_gold in enumerate(lines_gold):
        cer = fastwer.score_sent(line_htr, line_gold, char_level=True)
        # Strict comparison: of equally good candidates the first one wins.
        if cer < best_cer:
            best_cer = cer
            best_index_gold = index_gold
    return best_index_gold, best_cer

In [None]:
def align_lines(lines_htr, lines_gold):
    """Pair every HTR line with its best matching gold line.

    Returns a list with one tuple (index_htr, best_index_gold, best_cer) per
    HTR line, in the order of lines_htr. The last two values are whatever
    find_best_line_match reports for that line.
    """
    return [
        (position, *find_best_line_match(candidate, lines_gold))
        for position, candidate in enumerate(lines_htr)
    ]

In [None]:
def cleanup_line(line):
    """Normalize a line so that spelling and punctuation variants compare equal.

    The line is lowercased, every "y" is rewritten as "ij", the accented
    characters ç, é, ë and ó are replaced by their unaccented counterparts,
    and runs of punctuation are removed at the start and end of the line and
    wherever they touch whitespace (the run plus that whitespace character
    becomes a single space). Punctuation inside a word is kept.
    """
    # Raw strings: "\s" in a normal string literal is an invalid escape sequence.
    punctuation_run = r'[.,!?:;")(-]+'
    line = line.lower()
    line = line.replace("y", "ij")
    line = line.translate(str.maketrans("çéëó", "ceeo"))
    line = regex.sub("^" + punctuation_run, "", line)
    line = regex.sub(r"\s" + punctuation_run, " ", line)
    line = regex.sub(punctuation_run + r"\s", " ", line)
    line = regex.sub(punctuation_run + "$", "", line)
    return line

In [None]:
def cleanup_text(text):
    """Apply cleanup_line to every newline-separated line of text.

    The number of lines is preserved; the cleaned lines are joined with
    newlines again.
    """
    cleaned_lines = map(cleanup_line, text.split("\n"))
    return "\n".join(cleaned_lines)

In [None]:
def count_distances(alignments):
    """Count how often each offset (gold index minus HTR index) occurs.

    Each alignment is a sequence whose first element is the HTR index and
    whose second element is the gold index. Returns a Counter keyed by offset.
    """
    offsets = Counter()
    for index_htr, index_gold, *_ in alignments:
        offsets[index_gold - index_htr] += 1
    return offsets

In [None]:
def check_alignments_order(alignments, items_htr=None, items_gold=None):
    """Run one repair pass that makes the gold indices of alignments increase.

    The pass has two phases:

    1. Walk through the alignments and, whenever the gold index does not
       exceed the highest gold index accepted so far, mark either the current
       alignment or its predecessor for removal. The one with the higher CER
       is removed; on a tie the one whose offset (gold index minus HTR index)
       is less frequent among all alignments is removed, and on a full tie
       the predecessor is removed.
    2. Wherever two neighbouring alignments leave a gap of the same size on
       the HTR side and on the gold side, fill the gap by pairing the skipped
       items one-to-one and computing their CER.

    Args:
        alignments: list of (index_htr, index_gold, cer) tuples. The list is
            modified in place and also returned.
        items_htr: sequence the HTR indices refer to (lines or words). When
            omitted, the global lines_htr is used if a gap needs filling.
        items_gold: sequence the gold indices refer to. When omitted, the
            global lines_gold is used if a gap needs filling.

    Returns:
        A tuple (changed, alignments), where changed tells whether anything
        was removed or added in this pass.
    """
    last_gold_index = -1
    to_be_deleted = []
    distances = count_distances(alignments)
    for alignment_index, current in enumerate(alignments):
        if last_gold_index < current[1]:
            last_gold_index = current[1]
            continue
        if alignment_index == 0:
            # Only reachable for a negative gold index (the "no match" value).
            # There is no predecessor to compare with; alignment_index - 1
            # would wrap around to the last alignment, so drop this one.
            to_be_deleted.append(alignment_index)
            continue
        previous = alignments[alignment_index - 1]
        if current[2] > previous[2]:
            to_be_deleted.append(alignment_index)
        elif current[2] < previous[2]:
            to_be_deleted.append(alignment_index - 1)
            last_gold_index = current[1]
        elif distances[current[1] - current[0]] < distances[previous[1] - previous[0]]:
            to_be_deleted.append(alignment_index)
        else:
            to_be_deleted.append(alignment_index - 1)
            last_gold_index = current[1]
    # The same index can be marked twice (once as "current", once as
    # "predecessor"). Remove each marked alignment exactly once, from the back
    # so that the remaining indices stay valid.
    to_be_deleted = sorted(set(to_be_deleted), reverse=True)
    for to_be_deleted_value in to_be_deleted:
        alignments.pop(to_be_deleted_value)
    to_be_added = []
    for alignment_index in range(1, len(alignments)):
        previous = alignments[alignment_index - 1]
        current = alignments[alignment_index]
        gap_htr = current[0] - previous[0]
        gap_gold = current[1] - previous[1]
        if gap_htr == gap_gold and gap_htr != 1:
            for alignment_index_delta in range(1, gap_htr):
                to_be_added.append((alignment_index,
                                    previous[0] + alignment_index_delta,
                                    previous[1] + alignment_index_delta))
    if to_be_added:
        # Resolve the sequences only when they are needed, so that a pass
        # without gap filling does not depend on the globals.
        if items_htr is None:
            items_htr = lines_htr
        if items_gold is None:
            items_gold = lines_gold
    # Insert from the back so that earlier insert positions are not shifted.
    for insert_position, index_htr, index_gold in reversed(to_be_added):
        alignments.insert(insert_position, (index_htr,
                                            index_gold,
                                            fastwer.score_sent(items_htr[index_htr],
                                                               items_gold[index_gold],
                                                               char_level=True)))
    return bool(to_be_deleted or to_be_added), alignments

In [None]:
def check_alignments_order_wrapper(alignments):
    """Repeat check_alignments_order until a pass reports no more changes.

    Returns the alignments as returned by the last pass.
    """
    while True:
        changed, alignments = check_alignments_order(alignments)
        if not changed:
            return alignments

In [None]:
def fix_split_words(words_htr, words_gold, wrong_words, missed_words):
    """Un-flag HTR words that are a gold word split in two.

    When two neighbouring HTR words are both flagged as wrong and their
    concatenation equals a gold word flagged as missed, the recognition only
    inserted a space. In that case both HTR words are removed from wrong_words
    and the gold word is removed from missed_words.

    Args:
        words_htr: list of HTR words.
        words_gold: list of gold words.
        wrong_words: indices into words_htr of words flagged as wrong.
        missed_words: indices into words_gold of words flagged as missed.

    Returns:
        The tuple (wrong_words, missed_words). Both lists are updated in place.
    """
    # Collect positions first and rebuild the lists afterwards. Popping by
    # position while earlier pops have already shifted the lists removes the
    # wrong entries (or raises IndexError) as soon as more than one merge is
    # found.
    merged_wrong_positions = set()
    merged_missed_positions = set()
    for index_wrong in range(1, len(wrong_words)):
        if index_wrong - 1 in merged_wrong_positions:
            # The left-hand word is already part of the previous merge.
            continue
        if wrong_words[index_wrong] != wrong_words[index_wrong - 1] + 1:
            continue
        combined_word = (words_htr[wrong_words[index_wrong - 1]] +
                         words_htr[wrong_words[index_wrong]])
        for index_missed, index_gold in enumerate(missed_words):
            if (index_missed not in merged_missed_positions and
                    words_gold[index_gold] == combined_word):
                merged_wrong_positions.update((index_wrong - 1, index_wrong))
                merged_missed_positions.add(index_missed)
                break
    wrong_words[:] = [index_htr for position, index_htr in enumerate(wrong_words)
                      if position not in merged_wrong_positions]
    missed_words[:] = [index_gold for position, index_gold in enumerate(missed_words)
                       if position not in merged_missed_positions]
    return wrong_words, missed_words

In [None]:
def analyze_words(line_htr, line_gold):
    """Compare an HTR line with a gold line at word level.

    Both lines are split on whitespace, the words are aligned with
    align_lines and check_alignments_order_wrapper, and every word that is
    unaligned or differs from its aligned counterpart is flagged.

    Args:
        line_htr: the recognized line.
        line_gold: the reference line; may be empty.

    Returns:
        A tuple (wrong_words, missed_words): indices of flagged words in the
        split HTR line and indices of flagged words in the split gold line.
        Both lists are empty when the two lines are identical.
    """
    missed_words = []
    wrong_words = []
    if line_htr != line_gold:
        words_htr = line_htr.split()
        words_gold = line_gold.split()
        alignments = align_lines(words_htr, words_gold)
        # NOTE(review): only the alignments are passed on, so any CER that
        # check_alignments_order recomputes while filling gaps is not taken
        # from words_htr/words_gold - verify against that function.
        alignments = check_alignments_order_wrapper(alignments)
        index_htr = 0
        index_gold = 0
        for target_index_htr, target_index_gold, _ in alignments:
            # Everything skipped before the next aligned pair is unaligned.
            while index_htr < len(words_htr) and index_htr < target_index_htr:
                wrong_words.append(index_htr)
                index_htr += 1
            while index_gold < len(words_gold) and index_gold < target_index_gold:
                missed_words.append(index_gold)
                index_gold += 1
            if words_htr[target_index_htr] != words_gold[target_index_gold]:
                missed_words.append(target_index_gold)
                wrong_words.append(target_index_htr)
            index_htr += 1
            index_gold += 1
        # Words after the last aligned pair: flag each trailing index once.
        for index_htr_extra in range(index_htr, len(words_htr)):
            wrong_words.append(index_htr_extra)
        for index_gold_extra in range(index_gold, len(words_gold)):
            missed_words.append(index_gold_extra)
        wrong_words, missed_words = fix_split_words(words_htr, words_gold, wrong_words, missed_words)
    return wrong_words, missed_words

In [None]:
def analyze_lines(lines_htr, lines_gold, alignments):
    """Run analyze_words on every HTR line and collect the results.

    Args:
        lines_htr: list of HTR lines.
        lines_gold: list of gold lines.
        alignments: sequences whose first element is an HTR line index and
            whose second element is the gold line index it is aligned with,
            ordered by HTR index.

    Returns:
        A list with one analyze_words result per HTR line, in HTR line order,
        so that it can be indexed with an HTR line index. HTR lines without
        an alignment are analyzed against an empty gold line.
    """
    index_htr = 0
    line_analysis = []
    for alignment in alignments:
        # Unaligned HTR lines in front of this alignment, starting at
        # index_htr itself so that no line is skipped.
        for index_htr_extra in range(index_htr, alignment[0]):
            line_analysis.append(analyze_words(lines_htr[index_htr_extra], ""))
        # A negative gold index means that no gold line was found; indexing
        # with it would silently pick a line from the end of lines_gold.
        line_gold = lines_gold[alignment[1]] if alignment[1] >= 0 else ""
        line_analysis.append(analyze_words(lines_htr[alignment[0]], line_gold))
        index_htr = alignment[0] + 1
    for index_htr_extra in range(index_htr, len(lines_htr)):
        line_analysis.append(analyze_words(lines_htr[index_htr_extra], ""))
    return line_analysis

In [None]:
def show_word_analysis(line_htr, line_gold, line_analysis_line):
    """Print one HTR line with its word-level analysis.

    Every HTR word is printed with color code 1 when its index is listed in
    line_analysis_line[0] and with color code 0 otherwise. When
    line_analysis_line[1] is not empty, the gold words at those indices are
    printed after the line as a list with color code 4. The output ends with
    a newline.
    """
    gold_tokens = line_gold.split()
    for position, token in enumerate(line_htr.split()):
        flagged = position in line_analysis_line[0]
        read_transkribus_files.print_with_color(token,
                                                color_code=1 if flagged else 0,
                                                end=" ")
    missed_positions = line_analysis_line[1]
    if len(missed_positions) > 0:
        missed_tokens = [gold_tokens[position] for position in missed_positions]
        read_transkribus_files.print_with_color(missed_tokens, color_code=4, end=" ")
    print()

In [None]:
def show_line_analysis(lines_htr, lines_gold, alignments, line_analysis):
    """Print all HTR lines, with word-level analysis for the aligned ones.

    Aligned lines are shown together with their gold line and the entry of
    line_analysis at the HTR line index. HTR lines without an alignment are
    shown against an empty gold line and an empty analysis.
    """
    next_unshown = 0
    for position_htr, position_gold, *_ in alignments:
        # First show the unaligned lines that precede this alignment.
        while next_unshown < position_htr:
            show_word_analysis(lines_htr[next_unshown], "", [[], []])
            next_unshown += 1
        show_word_analysis(lines_htr[position_htr],
                           lines_gold[position_gold],
                           line_analysis[position_htr])
        next_unshown = position_htr + 1
    for leftover_line in lines_htr[next_unshown:]:
        show_word_analysis(leftover_line, "", [[], []])

In [None]:
# For every file: align the cleaned HTR lines with the cleaned gold lines and
# print a colored word-level comparison. The CER in the header is computed on
# the original (uncleaned) texts.
for file_name in sorted(texts_htr.keys()):
    # lines_htr and lines_gold are module-level names here; check_alignments_order
    # reads them as globals, so they must keep these names.
    lines_htr = cleanup_text(texts_htr[file_name]).split("\n")
    lines_gold = cleanup_text(texts_gold[file_name]).split("\n")
    alignments = align_lines(lines_htr, lines_gold)
    cer = fastwer.score_sent(texts_htr[file_name], texts_gold[file_name], char_level=True)
    read_transkribus_files.print_with_color(f"{file_name} (cer={cer:.1f}):\n", color_code=4)
    line_analysis = analyze_lines(lines_htr, lines_gold, alignments)
    show_line_analysis(lines_htr, lines_gold, alignments, line_analysis)