# Segmentation of the text files
Segmentation consists of 3 steps in sequential order. A line is marked as an entry if one of the following holds:
1. Line begins with a bold delimiter \<b>, followed by the current look-up letter.
2. A substring from the start of the line matches an index in the index list.
3. The logistic regression classification model classifies the line as an entry.

In [1]:
import json_helpers as jh
import joblib
import regex as re
import numpy as np
import mlp_classifier_utils as mcu # i hate marvel cinematic universe
import regex_utils as ru
from tqdm.notebook import tqdm
from scraping_and_segmenting_helpers import *

In [2]:
# Relative edit distance threshold for index matching: a line prefix matches
# an index word only when its edit distance divided by the index word's
# length is strictly below this value.
INDEX_SEGMENTER_THRESHOLD = 0.15

### Functions for entry segmentation using the index

In [None]:
def index_string_to_list(index: str) -> list[str]:
    """Split a page's index string into its individual index words.

    The string is split on the separator " - ". The piece before the first
    separator is discarded and every remaining piece is stripped of
    surrounding whitespace.

    Args:
        index: Raw index text taken from a page header line.

    Returns:
        The stripped index words, in their original order. Empty when the
        string contains no " - " separator.
    """
    # str.split always yields at least one piece, so the unpacking is safe.
    _, *entries = index.split(" - ")
    return [entry.strip() for entry in entries]

def clean_text_and_index(text_word: str, index_word: str) -> tuple[str, str]:
    """Normalise a text line and an index word so they can be compared.

    Bold and italic markup is removed from ``text_word`` (via the project
    helper ``clean_html_markup``), and bracketed ``[...]`` and parenthesised
    ``(...)`` spans, together with any whitespace directly before them, are
    removed from both strings.

    Args:
        text_word: Line (or line prefix) taken from the volume text.
        index_word: Index word taken from the page index.

    Returns:
        A ``(text_word, index_word)`` tuple holding the cleaned strings.
    """
    # Each pair is [tag, replacement].
    # NOTE(review): presumably clean_html_markup replaces every tag by its
    # paired replacement - confirm against the helper's definition.
    tags = [
        ["<b>", ""],
        ["</b>", ""],
        ["<i>", ""],
        ["</i>", ""],
    ]
    text_word = clean_html_markup(text_word, tags)

    # Strip square-bracketed spans first, then parenthesised spans, from both
    # strings so that annotations present on only one side do not inflate the
    # edit distance between them.
    for pattern in (r'\s*\[(.*?)\]', r'\s*\((.*?)\)'):
        text_word = re.sub(pattern, '', text_word)
        index_word = re.sub(pattern, '', index_word)

    return text_word, index_word

def edit_distance(text_word: str, index_word: str) -> float:
    """Return the edit (Levenshtein) distance between two strings.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn ``text_word`` into
    ``index_word``, computed with the full dynamic-programming matrix.

    Args:
        text_word: First string.
        index_word: Second string.

    Returns:
        The distance as an integral-valued NumPy ``float64`` (a ``float``
        subclass) read directly from the last cell of the matrix.
    """
    rows = len(text_word) + 1
    cols = len(index_word) + 1

    # distances[i][j] is the distance between text_word[:i] and index_word[:j].
    distances = np.zeros((rows, cols))
    # Against an empty string the distance equals the prefix length.
    distances[:, 0] = np.arange(rows)
    distances[0, :] = np.arange(cols)

    for t1 in range(1, rows):
        for t2 in range(1, cols):
            if text_word[t1 - 1] == index_word[t2 - 1]:
                # Matching characters cost nothing extra.
                distances[t1][t2] = distances[t1 - 1][t2 - 1]
            else:
                # One edit on top of the cheapest of insertion, deletion
                # and substitution.
                distances[t1][t2] = 1 + min(
                    distances[t1][t2 - 1],
                    distances[t1 - 1][t2],
                    distances[t1 - 1][t2 - 1],
                )

    return distances[rows - 1][cols - 1]

def print_distances(distances, token1_length, token2_length):
    """Print the top-left corner of a distance matrix, one row per line.

    Rows ``0..token1_length`` and columns ``0..token2_length`` (inclusive)
    are printed. Each cell is truncated to ``int`` and followed by a single
    space.

    Args:
        distances: Two-dimensional structure indexable as ``distances[r][c]``.
        token1_length: Index of the last row to print.
        token2_length: Index of the last column to print.
    """
    for row in range(token1_length + 1):
        # Build the whole row first; every cell keeps its trailing space.
        cells = "".join(
            f"{int(distances[row][col])} " for col in range(token2_length + 1)
        )
        print(cells)

def relative_edit_distance(text_word: str, index_word: str) -> float:
    """Return the edit distance between the strings relative to ``index_word``.

    Args:
        text_word: String taken from the text.
        index_word: Reference string; its length normalises the distance.

    Returns:
        ``edit_distance(text_word, index_word) / len(index_word)``. When
        ``index_word`` is empty the ratio is undefined and ``inf`` is
        returned, so an empty reference never falls below a match threshold.
    """
    if not index_word:
        # Avoid dividing by zero; an empty reference must never count as a
        # match.
        return float("inf")
    return edit_distance(text_word, index_word) / len(index_word)

def segment(folder: str, volumes: list[str], edition_nbr: int) -> None:
    """Segment the volume text files of one edition into entries and store them.

    Every ``{volume}.txt`` file under ``folder`` is read line by line. Lines
    containing ``page_number=<digits>`` are treated as page headers: they
    update the current page number, reset the per-page entry counter and
    supply the page's index words. Every other non-empty line is tested, in
    order, by:

    1. bold matching: the line starts with ``<b>`` followed by a character
       from ``ALPHABET``;
    2. index matching: a prefix of the line is within
       ``INDEX_SEGMENTER_THRESHOLD`` relative edit distance of an index word;
    3. classifier matching: the loaded model predicts class 1 for the line.

    Steps 2 and 3 are only attempted for lines whose first character is in
    the current first-letter list and which are longer than 50 characters or
    contain " Se ". Step 3 is used instead of step 2 when the page has no
    index or any index word contains "...".

    The collected entries are written with ``jh.write_items`` and hit counts
    per method are printed.

    Args:
        folder: Path prefix the volume file names are appended to (plain
            string concatenation, so it must end with a path separator).
        volumes: Volume names; each one is read from ``folder + "<volume>.txt"``.
        edition_nbr: Edition number. 1 selects ``edition1_volume_letters``,
            any other value selects ``edition2_volume_letters``; it is also
            used in entry ids and in the output file name.
    """
    # Load the pre-trained line classifier from disk.
    # NOTE(review): the file is named 'mlp_model.pkl' while the notebook text
    # calls the model logistic regression - confirm which one it is.
    model = joblib.load('mlp_model.pkl')

    
    if edition_nbr == 1:
        volume_letters = edition1_volume_letters
    else:
        volume_letters = edition2_volume_letters

    data = []
    entry_nbr = 0  # running entry counter; not reset between volumes
    page_entry_nbr = 0  # entry counter within the current page
    index = []  # index words of the current page that are still unmatched
    is_entry = False
    bold_hits = 0
    index_hits = 0
    classifier_hits = 0
    first_letter_list: list[str] = []
    classifier_type = 0 # 0 = bold, 1 = index, 2 = neural network/logistic regression classifier
    for volume in tqdm(volumes):
        # volume_letters[volume] is indexed as a sequence of items whose [0]
        # is a first-letter list and whose [1] is a page-number boundary.
        # NOTE(review): presumably the last page the letter list applies to - confirm.
        first_letter_boundary = 0
        volume_letters_index = -1
        page_nbr = 0
        with open(folder + f"{volume}.txt", "r", encoding='utf-8') as f:
            for line in f:
                # Built from the counters as they are before this line is
                # processed; only used when this line turns out to be an entry.
                entryid = f"e{edition_nbr}_{entry_nbr}_{volume}_{page_nbr}_{page_entry_nbr}"
                pagenbr_matches = re.search(r'page_number=(\d+)', line)
                if pagenbr_matches:
                    # Page header line: update page state and the page index.
                    page_nbr = int(pagenbr_matches.group(1))
                    if page_nbr > first_letter_boundary:
                        # Passed the current boundary: move to the next letter range.
                        volume_letters_index += 1
                        first_letter_list = volume_letters[volume][volume_letters_index][0]
                        first_letter_boundary = volume_letters[volume][volume_letters_index][1]
                    page_entry_nbr = 0
                    start_index = line.find(INDEX_STRING)
                    index = index_string_to_list(line[start_index + len(INDEX_STRING):]) # identical for all lines on the same page
                    # Longest index words first, so that a word is tried before
                    # any shorter word that is its prefix (e.g. Armadillo before Arm).
                    index = sorted(index, key=len, reverse=True)
                    # print(f"page_nbr: {page_nbr}: ", index)
                else:
                    # Only the first MAX_ENTRY_LENGTH characters are kept
                    # (noted as 200 by the original author).
                    line = line.rstrip()[:MAX_ENTRY_LENGTH]
                    if line and not "Ord, som saknas under" in line:
                        # --- BOLD MATCHING ---
                        # Uses the full ALPHABET rather than first_letter_list
                        # (the narrower variant was disabled by the author).
                        if line.startswith(tuple([f"<b>{l}" for l in ALPHABET])):
                            is_entry = True
                            classifier_type = 0
                            headword = ""
                            matches = re.findall(r'<b>(.*?)<\/b>', line)
                            bold_hits += 1
                            if matches:
                                # First bold span, minus one trailing comma or period.
                                headword = re.sub(r'[,.]$', '', matches[0])
                            else: 
                                # No closing </b> on the (possibly truncated) line.
                                headword = ru.get_headword_no_closing_bold_tag(line)

                        # Candidate non-bold line: starts with a current first
                        # letter and is either long or contains " Se ".
                        elif line[0] in first_letter_list and (len(line) > 50 or " Se " in line):

                            # --- INDEX MATCHING ---    
                            # Only when the page has an index and no index word contains "...".
                            if index and not any("..." in s for s in index):
                                for i, index_word in enumerate(index):
                                    temp_line, temp_index = clean_text_and_index(line, index_word)
                                    # Compare the line prefix of the same length as the cleaned index word.
                                    if relative_edit_distance(temp_line[:len(temp_index)], temp_index) < INDEX_SEGMENTER_THRESHOLD: 
                                        headword = ru.get_headword_from_index(index_word)
                                        is_entry = True
                                        classifier_type = 1
                                        # Remove the matched word so it cannot match another
                                        # line on this page; safe because the loop ends right after.
                                        index.pop(i)
                                        index_hits += 1
                                        # print(f"Line = {line[:20]}, Index_word: {headword}")
                                        break
                                if not is_entry:
                                    # No index word matched: the line is not an entry.
                                    # print(f"NOT FOUND IN INDEX: {line[:20]}")
                                    pass
                            
                            # --- CLASSIFIER MATCHING ---
                            # Fallback when the page index is missing or contains "...".
                            else: 
                                x = mcu.transform_sentence(line)
                                if model.predict(x)[0] == 1:
                                    is_entry = True
                                    classifier_type = 2
                                    classifier_hits += 1
                                    headword = ru.get_headword_from_text(line)
                                # else:
                                    # print(f"NON-ENTRY ACCORDING TO CLASSIFIER: {line[:20]}")

                            
                        if is_entry:
                            # Apart from headword, entryid, text and classifier_type,
                            # the fields are written with placeholder values.
                            item = {
                                "headword": headword,
                                "entryid": entryid,
                                "text": line,
                                "classifier_type": classifier_type,
                                "class": 0,
                                "qid": "0",
                                "e2_key": "",
                                "e4_key": "",
                                # "is_cross_ref": 0,
                                "cross_ref_key": "",
                                "latitude": None,
                                "longitude": None,
                            }
                            data.append(item)
                            page_entry_nbr += 1
                            entry_nbr += 1
                            # Reset the flag for the next line.
                            is_entry = False

    jh.write_items(data, f"{ENCYCLOPEDIAS_JSONS_FOLDER}e{edition_nbr}")

    print(f"Edition {edition_nbr} stats\n------------")
    print(f"Bold hits: {bold_hits}")
    print(f"Index hits: {index_hits}")
    print(f"Classifier hits: {classifier_hits}")

### Segmentation of the text files

In [None]:
# Run the segmentation for both editions; each call writes its own output
# file and prints its own hit statistics.
segment(folder_edition1, edition1_volumes, 1)
segment(folder_edition2, edition2_volumes, 2)