# Post-processing of the JSON files
- Remove articles in false order
- Linking cross references

In [1]:
from tqdm.notebook import tqdm
from utils.paths import *
from utils import json_helpers as jh
import regex as re
import bisect
import random
import copy

# Paths of the two edition files that are read, post-processed and written back.
e1 = f'{ENCYCLOPEDIAS_JSON_FOLDER}/e1'
e2 = f'{ENCYCLOPEDIAS_JSON_FOLDER}/e2'

# Output paths for the samples of removed (out-of-order) entries to be annotated manually.
e1_eval_order = f'{ORDER_TEST_FOLDER}/e1_test_order'
e2_eval_order = f'{ORDER_TEST_FOLDER}/e2_test_order'

# Output paths for the samples of entries used to annotate cross-reference links manually.
e1_test_links_json = f'{CROSS_TEST_FOLDER}/e1_test_links'
e2_test_links_json = f'{CROSS_TEST_FOLDER}/e2_test_links'

# Output paths for the per-edition linking statistics (article count, number of links).
e1_stats_links_json = f'{CROSS_STATS_FOLDER}/e1_links_recall'
e2_stats_links_json = f'{CROSS_STATS_FOLDER}/e2_links_recall'

## Removing articles in false order
True articles are ordered alphabetically in the editions. Outliers that disrupt this order are not articles and must be removed.

- Method: Loop through entries. Remove entry based on comparison with previous and next words. Context can be of any odd size but 5 seems to be enough.
- It is safer to compare fewer characters. We found the first 3 to work best.
- The encyclopedia orders words by letter but ignores characters which are not letters.

In [None]:
# Number of leading characters of each normalised headword that are compared
# when checking alphabetical order.
NR_CHAR_COMPARE = 3

# Letters the encyclopedia files under another letter ('w' is treated as 'v',
# 'ü' as 'u'), mapped to the letter they are sorted as.
_ORDER_EQUIVALENTS = str.maketrans({'Ü': 'U', 'ü': 'u', 'W': 'V', 'w': 'v'})


def replace_uncommon_chars(text: str) -> str:
    """Normalise a headword for alphabetical comparison.

    Every character that is not a Unicode letter (digits, whitespace,
    punctuation, apostrophes, combining marks, ...) is dropped, because the
    encyclopedia orders words by letters only. The remaining letters are then
    mapped through ``_ORDER_EQUIVALENTS``.

    Args:
        text: The raw headword.

    Returns:
        The headword reduced to letters, with 'Ü'/'ü' replaced by 'U'/'u' and
        'W'/'w' replaced by 'V'/'v'. Case is otherwise preserved.
    """
    # str.isalpha() is true exactly for the Unicode "Letter" categories, i.e.
    # the same set the original pattern [^\p{L}] kept.
    letters_only = ''.join(ch for ch in text if ch.isalpha())
    return letters_only.translate(_ORDER_EQUIVALENTS)

def nr_chars_match(word1: str, word2: str) -> int:
    """Score two words by equal characters per position plus their length gap.

    Characters are compared index by index up to the length of the shorter
    word. The absolute difference between the two lengths is then added to the
    number of matching positions.
    """
    same_position = sum(1 for left, right in zip(word1, word2) if left == right)
    length_gap = abs(len(word1) - len(word2))
    return same_position + length_gap

def remove_condition(words: list[str], insert_back: int=0, insert_front: int=0) -> bool:
    """Decide whether the word at the centre of a context window is out of order.

    Each word is normalised with ``replace_uncommon_chars``, lower-cased and
    cut to its first ``NR_CHAR_COMPARE`` characters. The centre word is then
    compared with every neighbour: a preceding neighbour approves the centre
    word when it is <= the centre word, a following neighbour when it is >= the
    centre word.

    Args:
        words: Headwords of the window, in file order.
        insert_back: Number of minimal sentinels to put before ``words``, used
            when the target word is too close to the start of the list to be
            at the centre otherwise.
        insert_front: Number of maximal sentinels to put after ``words``, used
            when the target word is too close to the end of the list.

    Returns:
        True if the centre word should be removed, i.e. if no more than
        ``len(window) // 2`` neighbours approve it.
    """
    # "" compares lower than every non-empty string.
    smallest = ""
    # The highest code point compares greater than any letter. The previous
    # sentinel "Ø" (U+00D8) is lower than lower-cased letters such as "å",
    # "ä", "ö" and "ø", so words starting with those were never approved by it.
    largest = "\U0010FFFF"

    normalised = [
        replace_uncommon_chars(word).lower()[:NR_CHAR_COMPARE] for word in words
    ]
    window = [smallest] * insert_back + normalised + [largest] * insert_front

    centre = len(window) // 2
    test_word = window[centre]
    back_approved = sum(1 for other in window[:centre] if test_word >= other)
    front_approved = sum(1 for other in window[centre + 1:] if test_word <= other)
    return (back_approved + front_approved) <= centre

def nr_back_front_inserts(idx: int, context_size: int, list_size: int) -> tuple[int, int]:
    """Return how many padding items a window around ``idx`` lacks on each side.

    The first value is the number of entries missing before ``idx`` to reach
    ``context_size`` predecessors, the second the number missing after ``idx``
    to reach ``context_size`` successors. Both are 0 when enough real entries
    exist.
    """
    entries_after = list_size - 1 - idx
    missing_before = context_size - idx
    missing_after = context_size - entries_after
    return (
        missing_before if missing_before > 0 else 0,
        missing_after if missing_after > 0 else 0,
    )

def remove_unordered(edition_name: str, context_size: int=2):
    """Split the entries of an edition into in-order and out-of-order entries.

    The entries are read with ``jh.read_items``. For every entry a window of up
    to ``context_size`` headwords on each side is passed to
    ``remove_condition``, padded at the list boundaries as computed by
    ``nr_back_front_inserts``.

    Args:
        edition_name: Path of the edition file to read.
        context_size: Number of neighbouring headwords considered on each side.

    Returns:
        A tuple ``(kept, removed)``: the entries that passed the order check
        (same objects as read) and deep copies of the entries that failed it.
    """
    entries = jh.read_items(edition_name)
    total = len(entries)
    kept = []
    dropped = []
    for pos in tqdm(range(total), desc="Removing unordered entries"):
        entry = entries[pos]
        # NOTE(review): entries whose text starts with "<b>" are skipped here and
        # therefore end up in neither returned list - confirm this is intended.
        if entry['text'].startswith("<b>"):
            continue
        pad_back, pad_front = nr_back_front_inserts(pos, context_size, total)
        window_start = max(0, pos - context_size)
        window_end = pos + context_size + 1
        neighbourhood = [item.get('headword') for item in entries[window_start:window_end]]
        if remove_condition(neighbourhood, pad_back, pad_front):
            dropped.append(copy.deepcopy(entry))
            continue
        kept.append(entry)

    print(f"Number of entries removed: {len(dropped)}")
    return kept, dropped

In [None]:
# Run the order check on both editions; each call returns (kept entries, removed entries).
e1_ordered, e1_removed = remove_unordered(e1)
e2_ordered, e2_removed = remove_unordered(e2)

## Make evaluation data of removed entries
* This data has to be manually annotated as a valid removal (1) or an invalid removal (0).
* This step can be skipped (go to "Linking cross references") if these statistics are not of interest.

In [None]:
# Make data sets for manual annotation.
# random.sample raises ValueError when asked for more items than the population
# holds, so the sample size is capped at the number of removed entries.
e1_removed_data = random.sample(e1_removed, min(50, len(e1_removed)))
e2_removed_data = random.sample(e2_removed, min(50, len(e2_removed)))

# Pre-fill the annotation field with 0; annotators set it to 1 for valid removals.
for entry in e1_removed_data + e2_removed_data:
    entry['valid_removal'] = 0

jh.write_items(e1_removed_data, e1_eval_order)
jh.write_items(e2_removed_data, e2_eval_order)

## Linking cross references

#### Cross reference linking functions

In [None]:
def is_cross_ref(text: str):
    """Return True if ``text`` is shorter than 60 characters and contains " Se "."""
    if len(text) >= 60:
        return False
    return " Se " in text

def get_ref_word(text: str) -> str:
    """Extract the referenced word of a cross reference.

    The word is whatever follows the first "Se" plus whitespace, up to (not
    including) the next full stop, with surrounding whitespace stripped.
    An empty string is returned when the text has no such pattern.
    """
    found = re.search(r'Se\s+([^.]+)\.', text)
    return found.group(1).strip() if found else ""

def binary_search(arr, target):
    """Return the index of ``target`` in the sorted sequence ``arr``, or -1.

    Uses ``bisect.bisect_left``, so for repeated values the leftmost index is
    returned. The result is only meaningful when ``arr`` is sorted.
    """
    pos = bisect.bisect_left(arr, target)
    found = pos < len(arr) and arr[pos] == target
    return pos if found else -1

def get_index(lst: list, item) -> int:
    """Return the position of the first occurrence of ``item`` in ``lst``, or -1 if absent."""
    try:
        position = lst.index(item)
    except ValueError:
        # list.index signals a missing item with ValueError.
        return -1
    return position

def cross_link(entries: list[dict]):
    """Link cross-reference entries to the entry they refer to.

    For every entry whose ``'text'`` is recognised by ``is_cross_ref``, the
    referenced word is extracted with ``get_ref_word`` and looked up among the
    headwords of all entries. On a hit the entry gets
    ``'cross_ref_key'`` set to the ``'entryid'`` of the first entry with that
    headword; otherwise ``'cross_ref_key'`` is set to ``"-"``. Entries that are
    not cross references are left untouched.

    Args:
        entries: Entry dicts with at least ``'headword'``, ``'text'`` and
            ``'entryid'`` keys. The dicts are modified in place.

    Returns:
        A tuple ``(entries, nr_linked)`` with the same list object and the
        number of cross references that were linked.
    """
    # Headword -> index of its first occurrence. A dict gives constant-time
    # lookups without requiring the entries to be sorted; setdefault keeps the
    # earliest index for repeated headwords.
    # NOTE(review): assumes headwords are hashable (strings) - confirm.
    first_index = {}
    for idx, entry in enumerate(entries):
        first_index.setdefault(entry['headword'], idx)

    nr_linked = 0
    for entry in tqdm(entries, desc="Finding cross references"):
        if not is_cross_ref(entry['text']):
            continue
        ref_word = get_ref_word(entry['text'])
        # An empty ref_word means no target could be parsed from the text, so
        # it must never be matched against an entry with an empty headword.
        idx = first_index.get(ref_word, -1) if ref_word else -1
        if idx == -1:
            entry['cross_ref_key'] = "-"
        else:
            entry['cross_ref_key'] = entries[idx]['entryid']
            nr_linked += 1
    print(f"Number of cross references linked: {nr_linked}")
    return entries, nr_linked

#### Running methods for each edition

In [None]:
# Read both editions from disk.
# NOTE(review): this rebinds e1_ordered / e2_ordered, so the filtered lists
# returned by remove_unordered above are not what gets cross-linked and written
# back - confirm whether the out-of-order removal is meant to be applied here.
e1_ordered = jh.read_items(e1)
e2_ordered = jh.read_items(e2)

# Add 'cross_ref_key' to cross-reference entries (modifies the entries in place).
e1_final, e1_nr_linked = cross_link(e1_ordered)
e2_final, e2_nr_linked = cross_link(e2_ordered)

# Write the linked entries back to the same paths they were read from.
jh.write_items(e1_final, e1)
jh.write_items(e2_final, e2)


## Make evaluation data of linked entries
* This data has to be manually annotated as a valid link (1) or an invalid link (0).
* This step can be skipped if these statistics are not of interest.

In [None]:
# Make data sets for manual annotation.
# The sample size is capped because random.sample raises ValueError when asked
# for more items than the population holds. Each sampled entry is copied with
# the annotation field 'is_cross_ref' pre-filled with 0, so the entries in
# e1_final / e2_final are not modified.
e1_linked_data = [
    {**entry, 'is_cross_ref': 0}
    for entry in random.sample(e1_final, min(100, len(e1_final)))
]
e2_linked_data = [
    {**entry, 'is_cross_ref': 0}
    for entry in random.sample(e2_final, min(100, len(e2_final)))
]

# Per-edition linking statistics. These must be built before they are written;
# previously the write came first and failed with a NameError on a fresh run.
item_e1 = [{
    "Nr_articles": len(e1_final),
    "Nr_cross_ref_linked": e1_nr_linked,
}]

item_e2 = [{
    "Nr_articles": len(e2_final),
    "Nr_cross_ref_linked": e2_nr_linked,
}]

jh.write_items(item_e1, e1_stats_links_json)
jh.write_items(item_e2, e2_stats_links_json)

jh.write_items(e1_linked_data, e1_test_links_json)
jh.write_items(e2_linked_data, e2_test_links_json)