# USFM marker placement metric evaluation

This code, together with the accompanying constructed gold data, can be used to reproduce the evaluation of several candidate metrics for scoring USFM marker placement.

I used a copy of the Jaro similarity code rather than the library so that I could access the number of "half transposes" and evaluate that count as a potential metric.

In [5]:
import re
from pathlib import Path

import nltk
from scipy.stats import pearsonr
# import jaro

from machine.corpora import UsfmFileText
from machine.tokenization import WhitespaceTokenizer

class WhitespaceMarkerTokenizer(WhitespaceTokenizer):
    """Whitespace tokenizer that also treats the backslash and asterisk as whitespace.

    Presumably this makes the tokenizer split USFM marker syntax into separate
    tokens; verify against ``WhitespaceTokenizer``'s use of ``_is_whitespace``.
    """

    def _is_whitespace(self, c: str) -> bool:
        """Return True for anything the base class calls whitespace, plus a backslash or ``*``."""
        if super()._is_whitespace(c):
            return True
        return c in ("\\", "*")

In [None]:
'''Code from https://github.com/richmilne/JaroWinkler/blob/master/jaro/jaro.py'''

def jaro(s1, s2):
    """Compute the Jaro similarity of two sequences and count "half transposes".

    Accepts any two sized, indexable sequences whose items can be compared
    (for example strings or lists of tokens).

    Returns:
        A ``(similarity, half_transposes)`` pair. ``similarity`` is the mean of
        three ratios: matches over the shorter length, matches over the longer
        length, and non-transposed matches over all matches.
        ``half_transposes`` counts matched items in the shorter sequence whose
        counterpart at the same rank among the longer sequence's matched items
        is a different item; half of it, rounded down, is the number of
        transpositions. When either sequence is empty or nothing matches, a
        short diagnostic is printed and ``(0, 0)`` is returned.
    """
    # The shorter sequence always plays the role of the left-hand side.
    shorter, longer = (s2, s1) if len(s2) < len(s1) else (s1, s2)

    if len(shorter) == 0:
        print("empty sequence")
        print(shorter)
        print(longer)
        return 0, 0

    # An item may only match a counterpart at most `window` positions away.
    window = max((len(longer) // 2) - 1, 0)
    last_long_idx = len(longer) - 1
    short_hit = [False] * len(shorter)
    long_hit = [False] * len(longer)

    for pos, item in enumerate(shorter):
        first = max(pos - window, 0)
        last = min(pos + window, last_long_idx)
        for cand in range(first, last + 1):
            if long_hit[cand] or item != longer[cand]:
                continue
            # Greedily claim the first free, equal item inside the window.
            short_hit[pos] = long_hit[cand] = True
            break

    num_matches = sum(short_hit)
    if num_matches == 0:
        print("no matches")
        print(shorter)
        print(longer)
        return 0, 0

    # Line up the matched items of both sequences in their original order; every
    # rank at which the two disagree is one half transpose.
    matched_short = [item for item, hit in zip(shorter, short_hit) if hit]
    matched_long = [item for item, hit in zip(longer, long_hit) if hit]
    half_transposes = sum(a != b for a, b in zip(matched_short, matched_long))

    similarity = (
        num_matches / len(shorter)
        + num_matches / len(longer)
        + (num_matches - half_transposes // 2) / num_matches
    ) / 3
    return similarity, half_transposes

In [None]:
'''
Levenshtein distance
raw score: num of insertions/substitutions/deletions/transpositions
'''
def levenshtein_metrics(gs_toks, ps_toks, num_markers):
    """Score each gold/predicted sentence pair by edit distance.

    Args:
        gs_toks: Gold sentences, one sequence per sentence.
        ps_toks: Predicted sentences, paired positionally with ``gs_toks``.
        num_markers: Marker count for each sentence, in the same order.

    Returns:
        Three parallel lists: the raw edit distance per sentence (computed with
        ``transpositions=True``), that distance scaled to "per 10 gold tokens",
        and that distance divided by the sentence's marker count.
    """
    dists = [
        nltk.edit_distance(gold, pred, transpositions=True)
        for gold, pred in zip(gs_toks, ps_toks)
    ]

    # Normalise by gold sentence length (scaled to 10 tokens) and by marker count.
    dists_per_10_tokens = [
        dist * 10 / len(gold) for dist, gold in zip(dists, gs_toks)
    ]
    dists_per_marker = [
        dist / marker_count for dist, marker_count in zip(dists, num_markers)
    ]
    return dists, dists_per_10_tokens, dists_per_marker

'''
Jaro similarity
raw score:
avg of:
1. % matches in s1 (m / len(s1))
2. % matches in s2 (m / len(s2))
3. % matches not transposed ((m-t)/m), t = # "half transposes" / 2
'''
def jaro_metrics(gs_toks, ps_toks, num_markers):
    """Score each gold/predicted sentence pair with ``jaro``.

    Args:
        gs_toks: Gold sentences, one sequence per sentence.
        ps_toks: Predicted sentences, paired positionally with ``gs_toks``.
        num_markers: Marker count for each sentence, in the same order.

    Returns:
        Four parallel lists: the Jaro similarity per sentence, the half-transpose
        count per sentence, that count scaled to "per 10 gold tokens", and that
        count divided by the sentence's marker count.
    """
    results = [jaro(gold, pred) for gold, pred in zip(gs_toks, ps_toks)]
    jaro_scores = [similarity for similarity, _ in results]
    half_transposes = [half_count for _, half_count in results]

    ht_per_tok = [
        half_count * 10 / len(gold)
        for half_count, gold in zip(half_transposes, gs_toks)
    ]
    ht_per_mark = [
        half_count / marker_count
        for half_count, marker_count in zip(half_transposes, num_markers)
    ]
    return jaro_scores, half_transposes, ht_per_tok, ht_per_mark

In [None]:
# --- configuration -----------------------------------------------------------
# Name shared by the data directory and its files.
# Available test sets: 41MAT_1_eng 23ISA_1_spa 43LUK_3_aps 04NUM_1_npi
test_prefix = "41MAT_1_eng"
# Number of prediction files (<prefix>_pred_1.SFM .. <prefix>_pred_n.SFM).
n = 5
# Read by the Pearson correlation further down; presumably needs one human
# score per prediction file filled in here before running -- TODO confirm.
human_eval_scores = []

# custom tokenizer to handle markers
tokenizer = WhitespaceMarkerTokenizer()

gold_file_path = Path(f"{test_prefix}/{test_prefix}_gold.SFM")
# Characters 2-4 of the prefix (e.g. "MAT") are passed as the third argument;
# presumably the book ID -- verify against UsfmFileText.
gold_file_text = UsfmFileText(
    "usfm.sty",
    "utf-8-sig",
    test_prefix[2:5],
    gold_file_path,
    include_markers=True,
    include_all_text=True,
)

pred_file_paths = [Path(f"{test_prefix}/{test_prefix}_pred_{i+1}.SFM") for i in range(n)]

# --- collect the gold sentences that contain at least one marker -------------
sent_idxs = []       # position of each kept sentence within the gold file
vrefs = []           # reference of each kept sentence
markers = []         # marker names found in each kept sentence
gold_sent_toks = []  # tokenized (or raw, if no tokenizer) kept sentences
for sent_idx, gold_row in enumerate(gold_file_text):
    gold_text = gold_row.text
    if "\\" not in gold_text:
        # No backslash means no marker; such sentences are not scored.
        continue

    sent_idxs.append(sent_idx)
    vrefs.append(gold_row.ref)
    # A marker name is whatever follows a backslash, up to the next whitespace,
    # backslash or asterisk.
    markers.append(re.findall(r"(?<=\\)[^\s\\\*]*", gold_text))

    gold_sent_toks.append(list(tokenizer.tokenize(gold_text)) if tokenizer else gold_text)

num_markers = list(map(len, markers))

'''get scores'''
# One entry per prediction file; each entry is a list with one value per scored sentence.
edit_dists, edit_dists_per_10_tokens, edit_dists_per_marker = [], [], []
jaro_scores, half_transposes, hts_per_10_tokens, hts_per_marker = [], [], [], []
# Never filled in this cell; kept because the printing code iterates over it.
scaled_jaro = []

# Index the gold markers by sentence position once, so deciding whether a predicted
# sentence is scored (and fetching its gold markers) is a dict lookup rather than two
# linear scans of sent_idxs per predicted sentence.
gold_markers_by_idx = dict(zip(sent_idxs, markers))
# Same marker pattern as used on the gold text: the name following each backslash.
marker_pattern = re.compile(r"(?<=\\)[^\s\\\*]*")

for pred_file_path in pred_file_paths:
    pred_file_text = UsfmFileText("usfm.sty", "utf-8-sig", test_prefix[2:5], pred_file_path, include_markers=True, include_all_text=True)
    pred_sent_toks = []
    for i, ps in enumerate(pred_file_text):
        if i not in gold_markers_by_idx:
            continue

        # add in any markers that didn't get read in (from being at the end of the verse, only happens to paragraph markers)
        # Start from a copy of the gold markers and cross off each one found in the
        # prediction; what remains is appended to the predicted text.
        gold_markers = gold_markers_by_idx[i].copy()
        for marker in marker_pattern.findall(ps.text):
            try:
                gold_markers.remove(marker)
            except ValueError:
                # Same exception type as before, but say where the mismatch is.
                raise ValueError(
                    f"{pred_file_path}: sentence {i} contains marker \\{marker} "
                    "that is missing from, or occurs more often than in, the gold sentence"
                ) from None

        text = ps.text + "".join(f" \\{m}" for m in gold_markers)

        if tokenizer:
            pred_sent_toks.append(list(tokenizer.tokenize(text)))
        else:
            pred_sent_toks.append(text)

    dists, dists_per_10_tokens, dists_per_marker = levenshtein_metrics(gold_sent_toks, pred_sent_toks, num_markers)
    edit_dists.append(dists)
    edit_dists_per_10_tokens.append(dists_per_10_tokens)
    edit_dists_per_marker.append(dists_per_marker)

    scores, hts, ht_per_tok, ht_per_mark = jaro_metrics(gold_sent_toks, pred_sent_toks, num_markers)
    jaro_scores.append(scores)
    half_transposes.append(hts)
    hts_per_10_tokens.append(ht_per_tok)
    hts_per_marker.append(ht_per_mark)



'''calculate summary stats'''
# Denominators shared by every statistic below.
num_scored_sents = len(vrefs)
total_gold_tokens = sum(len(gs) for gs in gold_sent_toks)
total_gold_markers = sum(num_markers)

# "avg_*" values are means of per-sentence scores; "overall_*" values divide the
# summed raw score by the total token count (scaled to 10 tokens) or marker count.
# Each list holds one value per prediction file.
avg_dist = [sum(per_sent) / num_scored_sents for per_sent in edit_dists]
avg_dist_per_10_tokens = [
    sum(per_sent) / num_scored_sents for per_sent in edit_dists_per_10_tokens
]
overall_avg_dist_per_10_tokens = [
    sum(per_sent) * 10 / total_gold_tokens for per_sent in edit_dists
]
avg_dist_per_marker = [
    sum(per_sent) / num_scored_sents for per_sent in edit_dists_per_marker
]
overall_avg_dist_per_marker = [
    sum(per_sent) / total_gold_markers for per_sent in edit_dists
]

avg_jaro = [sum(per_sent) / num_scored_sents for per_sent in jaro_scores]
avg_transposes = [sum(per_sent) / num_scored_sents for per_sent in half_transposes]
avg_ts_per_10_tokens = [
    sum(per_sent) / num_scored_sents for per_sent in hts_per_10_tokens
]
overall_avg_ts_per_10_tokens = [
    sum(per_sent) * 10 / total_gold_tokens for per_sent in half_transposes
]
avg_ts_per_marker = [sum(per_sent) / num_scored_sents for per_sent in hts_per_marker]
overall_avg_ts_per_marker = [
    sum(per_sent) / total_gold_markers for per_sent in half_transposes
]

'''calculate Pearson's correlation coefficient'''
# Correlate the human evaluation scores with each summary statistic in turn,
# keeping the coefficient (*_pearson) and its p-value (*_p).

# Edit-distance statistics.
res = pearsonr(human_eval_scores, avg_dist)
avg_dist_pearson, avg_dist_p = res.statistic, res.pvalue

res = pearsonr(human_eval_scores, avg_dist_per_10_tokens)
avg_dist_per_10_tokens_pearson, avg_dist_per_10_tokens_p = res.statistic, res.pvalue

res = pearsonr(human_eval_scores, overall_avg_dist_per_10_tokens)
overall_avg_dist_per_10_tokens_pearson, overall_avg_dist_per_10_tokens_p = res.statistic, res.pvalue

res = pearsonr(human_eval_scores, avg_dist_per_marker)
avg_dist_per_marker_pearson, avg_dist_per_marker_p = res.statistic, res.pvalue

res = pearsonr(human_eval_scores, overall_avg_dist_per_marker)
overall_avg_dist_per_marker_pearson, overall_avg_dist_per_marker_p = res.statistic, res.pvalue

# Jaro similarity and half-transpose statistics.
res = pearsonr(human_eval_scores, avg_jaro)
avg_jaro_pearson, avg_jaro_p = res.statistic, res.pvalue

res = pearsonr(human_eval_scores, avg_transposes)
avg_transposes_pearson, avg_transposes_p = res.statistic, res.pvalue

res = pearsonr(human_eval_scores, avg_ts_per_10_tokens)
avg_ts_per_10_tokens_pearson, avg_ts_per_10_tokens_p = res.statistic, res.pvalue

res = pearsonr(human_eval_scores, overall_avg_ts_per_10_tokens)
overall_avg_ts_per_10_tokens_pearson, overall_avg_ts_per_10_tokens_p = res.statistic, res.pvalue

res = pearsonr(human_eval_scores, avg_ts_per_marker)
avg_ts_per_marker_pearson, avg_ts_per_marker_p = res.statistic, res.pvalue

res = pearsonr(human_eval_scores, overall_avg_ts_per_marker)
overall_avg_ts_per_marker_pearson, overall_avg_ts_per_marker_p = res.statistic, res.pvalue

In [None]:
# Basic facts about the scored gold sentences.
gold_sent_lens = list(map(len, gold_sent_toks))
print("number of markers", num_markers)
print("sent lengths", gold_sent_lens)
print("avg markers", sum(num_markers) / len(vrefs))
print("avg sent len", sum(gold_sent_lens) / len(vrefs))
print("all markers", markers)
print("\n")

# Per-sentence edit-distance scores: a label, then one line per prediction file.
print("Edit distance")
for label, per_file_rows in (
    ("edit distance", edit_dists),
    ("dist per 10 tokens", edit_dists_per_10_tokens),
    ("dist per marker", edit_dists_per_marker),
):
    print(label)
    for row in per_file_rows:
        print(row)
print("\n")

# Per-sentence Jaro scores: a label, then one line per prediction file.
print("Jaro similarity")
for label, per_file_rows in (
    ("jaro", jaro_scores),
    ("scaled jaro", scaled_jaro),
    ("half transposes", half_transposes),
    ("transposes per 10 tokens", hts_per_10_tokens),
    ("transposes per marker", hts_per_marker),
):
    print(label)
    for row in per_file_rows:
        print(row)
print("\n")

# Edit-distance averages: one header line, then one row per prediction file.
# The backslash continuations sit inside the string literals, so the indentation
# of the continued lines is part of the printed text.
print("avg dist \
      avg dist per 10 tokens \
      overall avg dist per 10 tokens \
      avg dist per marker \
      overall avg dist per marker")
# Row i holds the values for pred_file_paths[i], in the header's column order.
for i in range(len(pred_file_paths)):
    print(f"{avg_dist[i]}\t \
        {avg_dist_per_10_tokens[i]}\t \
        {overall_avg_dist_per_10_tokens[i]}\t \
        {avg_dist_per_marker[i]}\t \
        {overall_avg_dist_per_marker[i]}")
print("\n")

# Jaro averages: one header line, then one row per prediction file.
# The header names exactly the six values printed in each row; no scaled-Jaro
# average is printed, so the header carries no column for it.
# The backslash continuations sit inside the string literals, so the indentation
# of the continued lines is part of the printed text.
print("avg jaro \
      avg transposes \
      avg t's per 10 tokens \
      overall avg t's per 10 tokens \
      avg t's per marker \
      overall avg t's per marker")
# Row i holds the values for pred_file_paths[i], in the header's column order.
for i in range(len(pred_file_paths)):
    print(f"{avg_jaro[i]}\t \
        {avg_transposes[i]}\t \
        {avg_ts_per_10_tokens[i]}\t \
        {overall_avg_ts_per_10_tokens[i]}\t \
        {avg_ts_per_marker[i]}\t \
        {overall_avg_ts_per_marker[i]}")
print("\n")

# All summary statistics side by side: one tab-separated row per prediction file,
# the five edit-distance averages first, then the six Jaro/half-transpose averages.
# The indentation of the continued lines is inside the f-string and is printed.
print("summary stats") # {avg_scaled_jaro[i]}\t \
for i in range(len(pred_file_paths)):
    print(f"{avg_dist[i]}\t \
        {avg_dist_per_10_tokens[i]}\t \
        {overall_avg_dist_per_10_tokens[i]}\t \
        {avg_dist_per_marker[i]}\t \
        {overall_avg_dist_per_marker[i]}\t \
        {avg_jaro[i]}\t \
        {avg_transposes[i]}\t \
        {avg_ts_per_10_tokens[i]}\t \
        {overall_avg_ts_per_10_tokens[i]}\t \
        {avg_ts_per_marker[i]}\t \
        {overall_avg_ts_per_marker[i]}")

# Correlation of each summary statistic with the human evaluation scores.
# Two tab-separated rows in the same column order as the summary-stats rows:
# first the Pearson coefficients, then the matching p-values.
# The indentation of the continued lines is inside the f-strings and is printed.
print("Pearson correlation coefficients and p-values") # {avg_scaled_jaro_pearson}\t \ # {avg_scaled_jaro_p}\t \
# Coefficients.
print(f"{avg_dist_pearson}\t \
      {avg_dist_per_10_tokens_pearson}\t \
      {overall_avg_dist_per_10_tokens_pearson}\t \
      {avg_dist_per_marker_pearson}\t \
      {overall_avg_dist_per_marker_pearson}\t \
      {avg_jaro_pearson}\t \
      {avg_transposes_pearson}\t \
      {avg_ts_per_10_tokens_pearson}\t \
      {overall_avg_ts_per_10_tokens_pearson}\t \
      {avg_ts_per_marker_pearson}\t \
      {overall_avg_ts_per_marker_pearson}")
# P-values.
print(f"{avg_dist_p}\t \
      {avg_dist_per_10_tokens_p}\t \
      {overall_avg_dist_per_10_tokens_p}\t \
      {avg_dist_per_marker_p}\t \
      {overall_avg_dist_per_marker_p}\t \
      {avg_jaro_p}\t \
      {avg_transposes_p}\t \
      {avg_ts_per_10_tokens_p}\t \
      {overall_avg_ts_per_10_tokens_p}\t \
      {avg_ts_per_marker_p}\t \
      {overall_avg_ts_per_marker_p}")