In [None]:
from urllib.request import urlopen
import regex as re
import json
import numpy as np

# Large sentinel value standing in for an "infinite" distance.
INF = 10**9
# Lines are truncated to this many characters before they are stored as entry text.
MAX_ENTRY_LENGTH = 200
# A line is matched to an index word when their relative edit distance is
# strictly below this value.
INDEX_SEGMENTER_THRESHOLD = 0.15
# Root folder of the scraped volumes (one .txt file per volume).
ENCYCLOPEDIAS_FOLDER = "encyclopedias/"
# Prefixes used in the per-page header line that scrape_volume writes
# ("page_number=<n>, index=<index text>").
PAGE_NUMBER_STRING = "page_number="
INDEX_STRING = "index="

# Delimiters around the per-page index inside the downloaded HTML.
INDEX_BEGIN = "<b>On this page / på denna sida</b>\n"
INDEX_END = "<p>"

# Delimiters around the page's body text inside the downloaded HTML.
DELIM_BEGIN = "<!-- mode=normal -->"
DELIM_END = "<!-- NEWIMAGE2 -->"

# [old, new] replacement pairs that clean() applies to scraped text, in this
# order. Besides HTML entities the list also strips <br> and the "sp" span.
# "&amp;" is decoded last, so text such as "&amp;lt;" ends up as "&lt;" and is
# not decoded a second time.
html_entities = [
                ["&quot;", "\""],
                ["&rsquo;", "\'"],
                ["&lsquo;", "\'"],
                ["&ndash;", "-"],
                ["<br>", ""],
                ['<span class="sp">', ""],
                ["</span>", ""],
                ["&lt;", "<"],
                ["&gt;", ">"],
                ["&nbsp;", " "],
                ["&amp;", "&"],
                ]

# Common prefix of every volume URL. The scraping cells append the two-letter
# volume code and "/" to it; create_url then adds the zero-padded page number.
base_url = "https://runeberg.org/nf"
# base_url = "http://runeberg.org/download.pl?mode=ocrtext&work=nf"

# Volume codes consist of two letters. In the dicts below the key is the first
# letter and the value lists every second letter that is combined with it.

edition1_url_range = {
    'a': "abcdefghijklmnopqr",
}

edition2_url_range = {
    'b': "abcdefghijklmnopqrst",
    'c': "abcdefghijklmn",
}

# First edition: volume code -> (first page number, last page number).
# Both numbers are passed to scrape_volume, which treats the range as inclusive.
# NOTE(review): "aa" ends at 1579 while every other volume ends near 800 —
# confirm that this value is intended.
edition1_volume_start_end = {
    "aa": (9, 1579),
    "ab": (9, 800),
    "ac": (7, 798),
    "ad": (7, 797),
    "ae": (7, 798),
    "af": (5, 795),
    "ag": (7, 798),
    "ah": (5, 800),
    "ai": (7, 798),
    "aj": (7, 798),
    "ak": (7, 798),
    "al": (7, 798),
    "am": (7, 798),
    "an": (7, 798),
    "ao": (7, 798),
    "ap": (7, 826),
    "aq": (5, 804),
    "ar": (3, 430),
}

# Second edition: volume code -> (first page number, last page number).
# Both numbers are passed to scrape_volume, which treats the range as inclusive.
edition2_volume_start_end = {
    "ba": (13, 824),
    "bb": (13, 798),
    "bc": (17, 808),
    "bd": (17, 814),
    "be": (17, 800),
    "bf": (17, 814),
    "bg": (17, 802),
    "bh": (17, 806),
    "bi": (17, 782),
    "bj": (17, 804),
    "bk": (17, 784),
    "bl": (17, 816),
    "bm": (17, 784),
    "bn": (17, 784),
    "bo": (17, 788),
    "bp": (17, 812),
    "bq": (17, 785),
    "br": (17, 779),
    "bs": (17, 820),
    "bt": (17, 796),
    "ca": (17, 812),
    "cb": (17, 778),
    "cc": (17, 817),
    "cd": (17, 784),
    "ce": (17, 794),
    "cf": (17, 820),
    "cg": (17, 806),
    "ch": (17, 688),
    "ci": (17, 458),
    "cj": (17, 719),
    "ck": (17, 688),
    "cl": (17, 686),
    "cm": (17, 685),
    "cn": (17, 180),
}

# Folders the scraped volumes are written to as <volume code>.txt, one
# sub-folder of ENCYCLOPEDIAS_FOLDER per edition.
folder_edition1 = ENCYCLOPEDIAS_FOLDER + "first/"
folder_edition2 = ENCYCLOPEDIAS_FOLDER + "second/"


In [None]:
def get_substring_between_delimiters(s: str, start: str, end: str):
    """Return the part of ``s`` enclosed by ``start`` and ``end``.

    The first occurrence of ``start`` is used, followed by the first
    occurrence of ``end`` located after it. Neither delimiter is included in
    the result.

    Returns:
        The enclosed text (possibly empty), or ``None`` when ``start`` is not
        found or no ``end`` follows it.
    """
    try:
        content_from = s.index(start) + len(start)
        content_to = s.index(end, content_from)
    except ValueError:
        # str.index signals "not found" by raising instead of returning -1.
        return None
    return s[content_from:content_to]

def clean(s: str, html_entities: list):
    """Apply a list of literal replacements to ``s`` and return the result.

    Args:
        s: Text to clean.
        html_entities: Sequence of ``[old, new]`` pairs. The pairs are applied
            one after another in list order, so a later pair also sees the
            output of the earlier ones.
    """
    cleaned = s
    for rule in html_entities:
        old, new = rule[0], rule[1]
        cleaned = cleaned.replace(old, new)
    return cleaned

def remove_single_newline(s: str):
    """Replace isolated newlines with a space and shrink newline runs.

    A newline that has no newline directly before or after it becomes a
    space. In a run of two or more newlines, everything except the final
    newline is replaced by a single space, so ``"a\\n\\nb"`` turns into
    ``"a \\nb"``.
    """
    newline_pattern = re.compile(r'(?<!\n)\n(?!\n)|(\n+)(?=\n)')
    return newline_pattern.sub(' ', s)

def scrape_page(url: str) -> tuple:
    """Download one page and extract its body text and its index.

    Args:
        url: Address of the page to download.

    Returns:
        A ``(text, index)`` tuple. ``text`` is the part of the page between
        DELIM_BEGIN and DELIM_END and ``index`` the part between INDEX_BEGIN
        and INDEX_END, both passed through ``clean`` and
        ``remove_single_newline``. Either element is ``None`` when its
        delimiters are not found; both are ``None`` when the page cannot be
        opened.
    """
    try:
        page = urlopen(url)
    except Exception:
        # Best effort: a page that cannot be opened is reported as missing.
        # Catching Exception rather than using a bare ``except`` keeps
        # KeyboardInterrupt working, so a long scrape can still be stopped.
        return None, None

    # Close the HTTP response as soon as the body has been read.
    with page:
        html = page.read().decode("utf-8")

    index = get_substring_between_delimiters(html, INDEX_BEGIN, INDEX_END)
    html = get_substring_between_delimiters(html, DELIM_BEGIN, DELIM_END)
    if index is not None:
        index = remove_single_newline(clean(index, html_entities))
    if html is not None:
        html = remove_single_newline(clean(html, html_entities))
    return html, index

def create_url(partial_url: str, i: int):
    """Build a page URL: ``partial_url`` + page number + ``.html``.

    The page number ``i`` is zero-padded to at least four digits.
    """
    page_file = format(i, "04d") + ".html"
    return partial_url + page_file

def scrape_volume(base_url: str, volume_start_number: int, volume_end_number: int = 9999999):
    """Scrape a range of pages and return them as one string.

    Args:
        base_url: URL prefix of the volume; the page number is appended by
            ``create_url``.
        volume_start_number: First page number to fetch.
        volume_end_number: Last page number to fetch (inclusive).

    Returns:
        The concatenation of all pages that could be scraped. Each page is
        written as a header line ``page_number=<n>, index=<index>`` followed
        by the page text. Pages for which ``scrape_page`` yields no text or no
        index are skipped.
    """
    pages = []
    for i in range(volume_start_number, volume_end_number + 1):
        text, index = scrape_page(create_url(base_url, i))
        if text is None or index is None:
            continue
        page_str = f"{PAGE_NUMBER_STRING}{i}, {INDEX_STRING}{index}\n{text}"
        pages.append(page_str)
        # Progress output: the last characters scraped so far. The header
        # alone is longer than 10 characters, so the tail of this page equals
        # the tail of the whole volume text.
        print(f"i = {i}: {page_str[-10:]}")
    # Join once at the end instead of growing a string page by page.
    return "".join(pages)

## Getting the first edition (1800-tals utgåvan)

In [None]:
# --- SCRAPING ---
# Download every volume of the first edition and store each one as
# <folder_edition1>/<volume code>.txt.
first_letter = 'a'
for second_letter in edition1_url_range[first_letter]:
    volume_index = first_letter + second_letter
    volume_url = base_url + volume_index + "/"
    print(volume_url)
    first_page, last_page = edition1_volume_start_end[volume_index]
    volume_text = scrape_volume(volume_url,
        volume_start_number=first_page,
        volume_end_number=last_page)
    # The file is opened only after the scrape has finished, so a failed
    # scrape does not truncate an existing file. UTF-8 is set explicitly
    # because the files are read back as UTF-8 later in the notebook; `with`
    # closes the file even when writing fails.
    with open(folder_edition1 + volume_index + ".txt", "w", encoding="utf-8") as f:
        f.write(volume_text)
    print(f"volume index: {volume_index}")

## Getting the second edition (ugglan)

In [None]:
# --- SCRAPING ---
# Download every volume of the second edition and store each one as
# <folder_edition2>/<volume code>.txt.
for first_letter in ('b', 'c'):
    for second_letter in edition2_url_range[first_letter]:
        volume_index = first_letter + second_letter
        volume_url = base_url + volume_index + "/"
        first_page, last_page = edition2_volume_start_end[volume_index]
        volume_text = scrape_volume(volume_url,
            volume_start_number=first_page,
            volume_end_number=last_page)
        # The file is opened only after the scrape has finished, so a failed
        # scrape does not truncate an existing file. UTF-8 is set explicitly
        # because the files are read back as UTF-8 later in the notebook;
        # `with` closes the file even when writing fails.
        with open(folder_edition2 + volume_index + ".txt", "w", encoding="utf-8") as f:
            f.write(volume_text)
        print(f"volume index: {volume_index}")
        print(volume_url)

# volume_index = 'bo'
# f = open(folder + volume_index + ".txt", "w", encoding='utf-8')
# print(f"volume index: {volume_index}")
# volume_url = base_url + volume_index + "/"
# print(volume_url)
# if volume_index in ["ba", "bb"]:
#     text = scrape_volume(volume_url, volume_start_number_ba_bb)
#     print(text)
#     f.write(text)
# else:
#     f.write(scrape_volume(volume_url, volume_start_number))




In [None]:
def prep_index(index: str):
    """Split a page's index text into a list of index words.

    The text is split on ``" - "``; the first segment is dropped and the
    remaining segments are returned with surrounding whitespace stripped.
    """
    segments = index.split(" - ")
    return [segment.strip() for segment in segments[1:]]

def pre_dist_clean(text_word: str, index_word: str) -> tuple:
    """Normalise a text line and an index word before comparing them.

    Bold and italic tags are removed from ``text_word``. After that every
    ``[...]`` and ``(...)`` group, together with the whitespace directly in
    front of it, is removed from both strings.

    Returns:
        ``(text_word, index_word)`` after cleaning.
    """
    tags = [
        ["<b>", ""],
        ["</b>", ""],
        ["<i>", ""],
        ["</i>", ""],
    ]
    text_word = clean(text_word, tags)

    # Square brackets first, then parentheses, applied to both strings.
    # (Skipping this step when the index word itself contains a bracket was
    # considered in the original code but left disabled.)
    for bracketed in (r'\s*\[(.*?)\]', r'\s*\((.*?)\)'):
        text_word = re.sub(bracketed, '', text_word)
        index_word = re.sub(bracketed, '', index_word)

    return text_word, index_word

def edit_distance(text_word: str, index_word: str) -> float:
    """Compute the edit (Levenshtein) distance between two strings.

    Insertions, deletions and substitutions each cost 1.

    Returns:
        The distance as a NumPy float, because the table is a float array.
        The value is always a whole number.
    """
    n_rows = len(text_word) + 1
    n_cols = len(index_word) + 1

    # distances[r, c] = distance between text_word[:r] and index_word[:c].
    distances = np.zeros((n_rows, n_cols))
    # Turning a prefix into the empty string costs one edit per character.
    distances[:, 0] = np.arange(n_rows)
    distances[0, :] = np.arange(n_cols)

    for t1 in range(1, n_rows):
        for t2 in range(1, n_cols):
            if text_word[t1 - 1] == index_word[t2 - 1]:
                # Equal characters: nothing to pay on top of the prefixes.
                distances[t1, t2] = distances[t1 - 1, t2 - 1]
            else:
                # Cheapest of insertion, deletion and substitution, plus one.
                distances[t1, t2] = 1 + min(
                    distances[t1, t2 - 1],
                    distances[t1 - 1, t2],
                    distances[t1 - 1, t2 - 1],
                )

    return distances[-1, -1]

def printDistances(distances, token1Length, token2Length):
    """Print a distance table, one row per line.

    Rows ``0..token1Length`` and columns ``0..token2Length`` are printed.
    Every value is converted to ``int`` and followed by a single space.
    """
    for row in range(token1Length + 1):
        cells = [f"{int(distances[row][col])} " for col in range(token2Length + 1)]
        print("".join(cells))

def relative_edit_distance(text_word: str, index_word: str) -> float:
    """Return the edit distance divided by the length of ``index_word``.

    An empty ``index_word`` has no meaningful relative distance. In that case
    ``inf`` is returned, so any "distance below threshold" check fails
    without a division by zero.
    """
    if not index_word:
        return float("inf")
    return edit_distance(text_word, index_word) / len(index_word)

def line_contains_index(line: str, index_word) -> bool:
    """Tell whether ``line`` starts with something close to ``index_word``.

    The first ``len(index_word)`` characters of ``line`` are compared with
    ``index_word``; the result is true when their edit distance is below 5.
    """
    prefix = line[:len(index_word)]
    distance = edit_distance(prefix, index_word)
    return distance < 5

In [None]:
# Segment the scraped volume "bo" into entries and save them to nf.json.
#
# Loop through the whole file. If a line has bold tags, we take the text up
# to the next newline or the first 200 characters, whichever comes first, and
# save it to a JSON file. Lines without a leading bold tag are compared with
# the words of the current page's index instead.

#TEMPORARY
edition_nbr = 2
volume_nbr = 1

data = []
entry_nbr = 0
page_nbr = 0
index = []
is_entry = False
bold_hits = 0
index_hits = 0
classifier_hits = 0

with open(folder_edition2 + "bo.txt", "r", encoding='utf-8') as volume:
    for line in volume:
        pagenbr_matches = re.search(r'page_number=(\d+)', line)
        if pagenbr_matches:
            # Page header line: switch to the new page and load its index.
            page_nbr = int(pagenbr_matches.group(1))
            entry_nbr = 0
            start_index = line.find(INDEX_STRING)
            index = prep_index(line[start_index + len(INDEX_STRING):]) #Identical for lines on same page
            # Longest index words first, presumably so that a short word does
            # not claim the line of a longer word it is a prefix of
            # (Arm, Armadillo).
            index = sorted(index, key=len, reverse=True)
            print(f"page_nbr: {page_nbr}: ", index)
            continue

        line = line.rstrip()[:MAX_ENTRY_LENGTH] # :200

        # --- BOLD MATCHING ---
        if line.startswith("<b>"):
            is_entry = True
            headword = ""
            matches = re.findall(r'<b>(.*?)<\/b>', line)
            bold_hits += 1
            if matches:
                # First bold span, without a trailing comma or period.
                headword = re.sub(r'[,.]$', '', matches[0])

        elif len(line) > 40 or " Se " in line: # Removing special case

            # --- INDEX MATCHING ---
            if index and "..." not in index[0]:
                for i, index_word in enumerate(index):
                    temp_line, temp_index = pre_dist_clean(line, index_word)
                    if relative_edit_distance(temp_line[:len(temp_index)], temp_index) < INDEX_SEGMENTER_THRESHOLD:
                        headword = index_word
                        is_entry = True
                        # Each index word is used at most once per page.
                        # Popping is safe here because the loop stops right
                        # after it.
                        index.pop(i)
                        index_hits += 1
                        print(f"Line = {line[:20]}, Index_word: {headword}")
                        break

            # --- CLASSIFIER MATCHING ---
            # elif line[0] is not a capital and not the current entry letter
            # else: use index or neural network

        if is_entry:
            item = {
                "headword": headword,
                "entryid": f"e{edition_nbr}_v{volume_nbr}_{page_nbr}_{entry_nbr}",
                "text": line,
                "type": 0,
                "qid": "0",
                "first_edition_key": "",
                "fourth_edition_key": ""
            }
            data.append(item)
            entry_nbr += 1
            is_entry = False
        else:
            print(f"NOT FOUND FOR: line = {line[:20]}")

# Open in write mode: appending a second JSON array on a re-run of this cell
# would leave nf.json unparseable.
with open("nf.json", 'w', encoding='utf-8') as json_file:
    json.dump(data, json_file, ensure_ascii=False, indent=4)
print(f"Bold hits: {bold_hits}")
print(f"Index hits: {index_hits}")
print(f"Classifier hits: {classifier_hits}")

## Initial test for creating annotated training and test data

In [None]:
# Take some real data and create regexes that can extract some features.
#
# Feature ideas:
# - Comma or period immediately after the first word
# - Comma or period immediately after a [...] or (...) group explaining
#   pronunciation or something else
# - Category word (mus. , bygnk. , kem. ) after the first comma or period

classifier_remove_tags = [
    ["<b>", ""],
    ["</b>", ""],
]

labeled_data = []

page_nbr = 143
with open(folder_edition2 + "bo.txt", "r", encoding='utf-8') as volume:
    # Only lines 1751..1900 of the file (1-based) are used as the sample.
    for i, line in enumerate(volume, start=1):
        if i > 1900:
            break
        if i <= 1750:
            continue

        pagenbr_matches = re.search(r'page_number=(\d+)', line)
        if pagenbr_matches:
            page_nbr = int(pagenbr_matches.group(1))
            print(f"page_nbr: {page_nbr}: ")
            continue

        line = line.rstrip()[:MAX_ENTRY_LENGTH] # maybe remove?

        # --- BOLD MATCHING --- create ground truth
        # A line starting with a bold tag is labelled 1 and stored without
        # its bold tags; every other line is labelled 0.
        starts_bold = line.startswith("<b>")
        if starts_bold:
            line = clean(line, classifier_remove_tags)
        labeled_data.append({"class": 1 if starts_bold else 0, "text": line})

with open('training_data.txt', 'w', encoding='utf-8') as outfile:
    print("MAKE TO JSON")
    # Dump the list exactly once. Dumping it inside a loop over its own items
    # wrote one full copy per item and produced an unparseable file.
    json.dump(labeled_data, outfile, ensure_ascii=False, indent=4)
