# Post-processing of the JSON files
- Remove articles in false order
- Linking cross references

In [9]:
from tqdm.notebook import tqdm
import regex as re
import json
import bisect

# Base names of the edition files; get_entries/dump_entries append ".json".
edition_test = 'cross_test'  # edition passed to get_entries and cross_link in the cells below
e1 = 'e1'  # NOTE(review): not referenced in the visible cells; presumably the first edition, confirm
e2 = 'e2'  # NOTE(review): not referenced in the visible cells; presumably the second edition, confirm

In [2]:
# Get entries from json file
def get_entries(edition_name: str) -> list[dict]:
    """Load and return the parsed content of ``<edition_name>.json``.

    The file is read as UTF-8. ``edition_name`` is the path without the
    ``.json`` extension. The content is expected to be a list of entry
    dicts, but whatever JSON value the file holds is returned unchanged.
    """
    source_path = f"{edition_name}.json"
    with open(source_path, encoding='utf-8') as source:
        return json.load(source)

# Write entries to json file
def dump_entries(entries: list[dict], edition_name: str) -> None:
    """Serialize *entries* to ``<edition_name>.json``, replacing the file.

    The output is UTF-8 with non-ASCII characters written verbatim
    (``ensure_ascii=False``) and a 4-space indent.
    """
    target_path = f"{edition_name}.json"
    with open(target_path, mode='w', encoding='utf-8') as sink:
        json.dump(entries, sink, indent=4, ensure_ascii=False)

## Removing articles in false order
True articles are ordered alphabetically in the editions. Outliers that disrupt this order are not articles and must be removed.

- Method: Loop through the entries. Remove an entry if its headword sorts before the headword of the previous entry, or after the headword of the next entry.

In [21]:
def strip_non_letters(text: str) -> str:
    """Return *text* with every non-letter character removed.

    "Letter" means the Unicode general category L. The pattern uses a
    Unicode property class, which requires the ``regex`` module that this
    file imports under the name ``re``.
    """
    non_letter = r'[^\p{L}]'
    return re.sub(non_letter, '', text)

def remove_condition(window: list[str]) -> bool:
    """Decide whether the middle headword of *window* is out of order.

    Every word is first reduced to its letters with ``strip_non_letters``.
    The word at index ``len(window) // 2`` is the candidate. Each earlier
    word that is ``<=`` the candidate and each later word that is ``>=``
    the candidate counts as one approval. Comparison is ordinary Python
    string comparison (case-sensitive, by code point).

    Returns True (the candidate should be removed) when the number of
    approvals does not exceed ``len(window) // 2``. An empty window raises
    IndexError.
    """
    cleaned = [strip_non_letters(word) for word in window]
    mid = len(cleaned) // 2
    candidate = cleaned[mid]

    # Neighbours before the candidate must not sort after it ...
    ordered_before = sum(1 for earlier in cleaned[:mid] if earlier <= candidate)
    # ... and neighbours after it must not sort before it.
    ordered_after = sum(1 for later in cleaned[mid + 1:] if candidate <= later)

    return ordered_before + ordered_after <= mid

In [None]:
# Load the test edition for the out-of-order check.
# NOTE(review): no visible cell applies remove_condition to these entries;
# the removal step looks unfinished. Confirm.
entries = get_entries(edition_test)


## Linking cross references

#### Cross reference linking functions

In [None]:
# Get reference word (what is after 'Se ')
def get_ref_word(text: str) -> str:
    """Extract the headword referred to by a ``Se <word>.`` phrase.

    Returns the text between the first ``Se`` followed by whitespace and
    the next full stop, with surrounding whitespace stripped. Returns an
    empty string when *text* contains no such phrase.
    """
    found = re.search(r'Se\s+([^.]+)\.', text)
    return found.group(1).strip() if found else ""

# Find index of headword
def binary_search(arr, target):
    """Return the index of the first occurrence of *target* in *arr*, or -1.

    Uses ``bisect.bisect_left``, so *arr* must already be sorted in
    ascending order; on unsorted input the result is unreliable.
    """
    pos = bisect.bisect_left(arr, target)
    found = pos < len(arr) and arr[pos] == target
    return pos if found else -1

# Cross reference linking logic
def cross_link(edition_name: str) -> None:
    """Resolve ``Se <headword>.`` cross references and store the target ids.

    Loads ``<edition_name>.json``, sets ``entry['cross_id']`` on every
    entry and writes the result back to the same file.

    * For an entry flagged ``is_cross_ref`` whose reference word equals the
      headword of some entry, ``cross_id`` becomes the ``entryid`` of the
      first entry with that headword.
    * For every other entry (not a cross reference, no reference word
      found, or no matching headword) ``cross_id`` becomes ``''``, so the
      key is always present in the output.
    """
    entries = get_entries(edition_name)

    # Map each headword to the position of its first occurrence. A dict
    # lookup works regardless of the order of the entries, whereas a
    # bisect-based search silently misses targets if the headwords are not
    # sorted in code-point order.
    first_index: dict[str, int] = {}
    for position, entry in enumerate(tqdm(entries, desc="Retrieving Headwords")):
        first_index.setdefault(entry['headword'], position)

    # Assign cross references
    for entry in tqdm(entries, desc="Finding cross references"):
        target_id = ''
        if entry['is_cross_ref']:
            ref_word = get_ref_word(entry['text'])
            # An empty ref_word means no 'Se ...' phrase was found; never
            # let it match an entry that happens to have an empty headword.
            target_idx = first_index.get(ref_word) if ref_word else None
            if target_idx is not None:
                target_id = entries[target_idx]['entryid']
        # Always assign, so unresolved cross references also carry the key.
        entry['cross_id'] = target_id

    dump_entries(entries, edition_name)

#### Running methods for each edition

In [None]:
# Link cross references in the test edition; cross_link writes the result
# back to the same JSON file it was read from.
cross_link(edition_test)