In [None]:
from urllib.request import urlopen
import regex as re
import json
import numpy as np

# --- Matching parameters -------------------------------------------------
# Start value for the "smallest distance so far" search in the matching loop.
INF = 10**9
# Text lines are truncated to this many characters before being matched/stored.
MAX_ENTRY_LENGTH = 200
# A line matches an index word when their relative edit distance is below this.
INDEX_SEGMENTER_THRESHOLD = 0.15

# --- File layout ---------------------------------------------------------
# Root folder for the scraped volumes; edition sub-folders are appended to it.
ENCYCLOPEDIAS_FOLDER = "encyclopedias/"
# Prefixes of the two fields written on the header line that precedes each page.
PAGE_NUMBER_STRING = "page_number="
INDEX_STRING = "index="

# --- Delimiters inside the downloaded HTML -------------------------------
# The page index is the text between these two markers.
INDEX_BEGIN = "<b>On this page / på denna sida</b>\n"
INDEX_END = "<p>"

# The page body is the text between these two markers.
DELIM_BEGIN = "<!-- mode=normal -->"
DELIM_END = "<!-- NEWIMAGE2 -->"

# [needle, replacement] pairs applied, in this order, to the scraped text.
html_entities = [
    ["&quot;", "\""],
    ["&rsquo;", "\'"],
    ["&lsquo;", "\'"],
    ["&ndash;", "-"],
    ["<br>", ""],
    ['<span class="sp">', ""],
    ["</span>", ""],
]

In [None]:
def get_substring_between_delimiters(s: str, start: str, end: str):
    """Return the text found between the first `start` and the next `end`.

    The search for `end` begins right after the first occurrence of `start`.

    Args:
        s: The string to search in.
        start: Opening delimiter (not included in the result).
        end: Closing delimiter (not included in the result).

    Returns:
        The enclosed substring, or None when either delimiter is missing.
    """
    opening = s.find(start)
    if opening < 0:
        return None

    content_from = opening + len(start)
    closing = s.find(end, content_from)
    return None if closing < 0 else s[content_from:closing]

def clean(s: str, tag: str, new: str = ""):
    """Replace every occurrence of `tag` in `s` with `new` (removed by default)."""
    cleaned = s.replace(tag, new)
    return cleaned

def remove_single_newline(s: str):
    """Fold line breaks so that only paragraph breaks survive.

    A newline with no newline next to it becomes a space. In a run of two or
    more newlines everything except the last newline is replaced by a single
    space, so the run ends up as " \\n".
    """
    lone_newline = r'(?<!\n)\n(?!\n)'
    surplus_newlines = r'(\n+)(?=\n)'
    return re.sub(lone_newline + '|' + surplus_newlines, ' ', s)

def _normalize_fragment(fragment):
    """Apply the html_entities replacements and newline folding; None passes through."""
    if fragment is None:
        return None
    for needle, replacement in html_entities:
        fragment = clean(fragment, needle, replacement)
    return remove_single_newline(fragment)


def scrape_page(url: str) -> "tuple[str | None, str | None]":
    """Download one page and extract its body text and its index.

    Args:
        url: Address of the page to download.

    Returns:
        A `(text, index)` pair. `text` is the part of the page between
        DELIM_BEGIN and DELIM_END, `index` the part between INDEX_BEGIN and
        INDEX_END, both with the html_entities replacements applied and line
        breaks folded. An element is None when its delimiters are missing.
        `(None, None)` is returned when the page cannot be opened, so callers
        can always unpack the result.
    """
    try:
        page = urlopen(url)
    except Exception:
        # Best effort: an unreachable or malformed URL simply yields "no page".
        # Always return a pair - a bare None would break `text, index = ...`.
        return None, None

    # Close the response once it has been read instead of leaking the connection.
    with page:
        html = page.read().decode("utf-8")

    index = get_substring_between_delimiters(html, INDEX_BEGIN, INDEX_END)
    text = get_substring_between_delimiters(html, DELIM_BEGIN, DELIM_END)
    return _normalize_fragment(text), _normalize_fragment(index)

def create_url(partial_url: str, i: int):
    """Build a page URL: `partial_url` + page number zero-padded to 4 digits + ".html"."""
    page_number = format(i, "04d")
    return "".join((partial_url, page_number, ".html"))

def scrape_volume(base_url: str, volume_start_number: int, volume_end_number: int = 9999999):
    """Scrape consecutive pages of one volume into a single string.

    Pages are fetched starting at `volume_start_number`. Scraping stops at the
    first page that cannot be downloaded or lacks either the body text or the
    index, or once `volume_end_number` has been scraped.

    Args:
        base_url: URL prefix to which the page number and ".html" are appended.
        volume_start_number: Number of the first page to fetch.
        volume_end_number: Number of the last page to fetch (inclusive).

    Returns:
        The concatenation, for every scraped page, of a header line
        "page_number=<n>, index=<index>" followed by the page text.
    """
    pages = []
    i = volume_start_number
    while True:
        scraped = scrape_page(create_url(base_url, i))
        # A failed download may be reported as a bare None rather than a
        # pair; treat that as the end of the volume instead of crashing on
        # the unpacking below.
        if scraped is None:
            break
        text, index = scraped
        if text is None or index is None:
            break

        # NOTE(review): nothing is inserted between a page's text and the next
        # page header - confirm that the scraped text always ends in a newline.
        page = f"{PAGE_NUMBER_STRING}{i}, {INDEX_STRING}{index}\n{text}"
        pages.append(page)
        print(f"i = {i}: {page[-10:]}")

        i += 1
        if i > volume_end_number:
            break
    # Join once at the end instead of growing one big string page by page.
    return "".join(pages)

## Getting the second edition (ugglan)

In [None]:
base_url = "https://runeberg.org/nf"
# base_url = "http://runeberg.org/download.pl?mode=ocrtext&work=nf"

# Second letters of the volume identifiers for each first letter; the URLs
# have a slightly odd format (volume id = first letter + second letter).
uggla_url_range = {
    'b': "abcdefghijklmnopqrst",
    'c': "abcdefghijklmn",
}

# The first two volumes (ba, bb) start on this page number
volume_start_number_ba_bb = 795 #13

# The remaining volumes start on this one
volume_start_number = 192 #17

# Folder to save the .txt files in
folder = ENCYCLOPEDIAS_FOLDER + "second/"

# Main loop over all volumes (disabled; only volume 'ba' is scraped below)
# for first_letter in ('b', 'c'):
#     for second_letter in uggla_url_range[first_letter]:
#         volume_index = first_letter + second_letter
#         f = open(folder + volume_index + ".txt", "w")
#         idx_file = open(folder + volume_index + "_idx.txt")
#         print(f"volume index: {volume_index}")
#         volume_url = base_url + volume_index + "/"
#         print(volume_url)
#         if volume_index in ["ba", "bb"]:
#             f.write(scrape_volume(volume_url, volume_start_number_ba_bb)) # index handling should be added here as well
#         else:
#             f.write(scrape_volume(volume_url, volume_start_number))
#         f.close()
#         idx_file.close()

volume_index = 'ba'
print(f"volume index: {volume_index}")
volume_url = base_url + volume_index + "/"
print(volume_url)

# The with-block closes (and thereby flushes) the file, so a later cell that
# reads the volume back from disk sees the complete text.
with open(folder + volume_index + ".txt", "w", encoding='utf-8') as f:
    if volume_index in ["ba", "bb"]:
        text = scrape_volume(volume_url, volume_start_number_ba_bb)
        print(text)
        f.write(text)
    else:
        f.write(scrape_volume(volume_url, volume_start_number, volume_end_number=200))




In [None]:
def prep_index(index: str):
    return [query.strip() for query in index.split(" - ")][1:]

def pre_dist_clean(text_word: str, index_word: str) -> "tuple[str, str]":
    """Normalise a text line and an index word before comparing them.

    Bold/italic tags are removed from `text_word` only. After that, every
    "[...]" group and then every "(...)" group is removed, together with the
    whitespace in front of it, from both strings.

    Args:
        text_word: A line of scraped page text.
        index_word: An entry of the page index.

    Returns:
        The pair `(cleaned_text_word, cleaned_index_word)`. Note that this is a
        tuple, so slice the individual strings, not the return value.
    """
    for tag in ("<b>", "</b>", "<i>", "</i>"):
        text_word = text_word.replace(tag, "")

    # Square brackets first, then parentheses; non-greedy so that each group
    # is removed on its own.
    for group_pattern in (r'\s*\[(.*?)\]', r'\s*\((.*?)\)'):
        text_word = re.sub(group_pattern, '', text_word)
        index_word = re.sub(group_pattern, '', index_word)

    return text_word, index_word

def edit_distance(text_word: str, index_word: str) -> int:
    """Edit distance between two strings (insert, delete, substitute; cost 1 each).

    The distance is computed with a full dynamic-programming table and read
    from its bottom-right cell. The table is a float NumPy array, so the value
    comes back as a NumPy float scalar holding a whole number.
    """
    n_rows = len(text_word) + 1
    n_cols = len(index_word) + 1

    # First column / first row: distance to and from the empty prefix.
    table = np.zeros((n_rows, n_cols))
    table[:, 0] = np.arange(n_rows)
    table[0, :] = np.arange(n_cols)

    for row, text_char in enumerate(text_word, start=1):
        for col, index_char in enumerate(index_word, start=1):
            diagonal = table[row - 1][col - 1]
            if text_char == index_char:
                # Matching characters cost nothing extra.
                table[row][col] = diagonal
            else:
                # Cheapest of the three neighbouring cells, plus one edit.
                left = table[row][col - 1]
                above = table[row - 1][col]
                table[row][col] = min(left, above, diagonal) + 1

    return table[n_rows - 1][n_cols - 1]

def printDistances(distances, token1Length, token2Length):
    """Print the top-left (token1Length+1) x (token2Length+1) part of a distance table.

    Each value is converted to int and followed by a space; every row ends
    with a newline.
    """
    for row in range(token1Length + 1):
        rendered = "".join(
            f"{int(distances[row][col])} " for col in range(token2Length + 1)
        )
        print(rendered)

def relative_edit_distance(text_word: str, index_word: str) -> float:
    """Edit distance between the two strings divided by the length of `index_word`.

    Args:
        text_word: Text to compare.
        index_word: Index entry to compare against; its length is the divisor.

    Returns:
        The relative distance. For an empty `index_word` there is nothing to
        normalise by, so positive infinity is returned; such an entry can then
        never fall below a match threshold.
    """
    if not index_word:
        # Explicit guard instead of dividing by zero (which previously only
        # avoided an exception because the distance is a NumPy scalar, at the
        # price of a RuntimeWarning and an inf/nan result).
        return float("inf")
    return edit_distance(text_word, index_word) / len(index_word)

def line_contains_index(line: str, index_word) -> bool:
    """Tell whether `line` starts with something close to `index_word`.

    The first len(index_word) characters of `line` are compared with
    `index_word`; fewer than 5 edits counts as a match.
    """
    line_prefix = line[:len(index_word)]
    edits_needed = edit_distance(line_prefix, index_word)
    return edits_needed < 5
    
# def is_match_head_index(text_word: str, index_word: str) -> bool:
#     edit_dist = edit_distance(text_word, index_word)
#     return relative_edit_distance(edit_dist, len(index_word)) < 0.1 # Need to test


In [None]:
# Text that ought to match but does not.

# Original note: this one could not be matched because index_word contains '(',
# in which case "(...)" was not removed. The text contains two "(...)" sequences,
# the last of which is also part of the index word. Possible solution: remove
# the parentheses from both strings; that solves the problem unless two index
# words differ only by a parenthesised part.
text_word1 = "<b>Argyroid</b> l. Argyrofan (af grek. <i>argyros</i>, silfver, och <i>eidos</i>, utseende, l. <i>fainesthai</i>, synas), namn på vissa slag af nysilfver (se d. o.). "
index_word1 = "Argyroid l. Argyrofan, namn på vissa slag af nysilfver (se d. o.)"
# pre_dist_clean returns a (text, index) pair; slicing that pair is not a
# truncation of the text, so the text is truncated explicitly below.
temp_line1, temp_index1 = pre_dist_clean(text_word1, index_word1)
print(temp_line1[:len(temp_index1)])
print(temp_index1)
print(relative_edit_distance(temp_line1[:len(temp_index1)], temp_index1))

In [None]:
# Relative distance when the text is longer than the index word.
text_word  = "XXXXXXXX"
index_word = "XXXXX"
print(relative_edit_distance(text_word, index_word)) # Armadillon

#Armadillo
# Arm
# Arma
# 1: Arm - Armadillo , OOPS these matched perfectly because we cut them to the same length
# 2: Arm - Arm, OOPS now we can no longer match these.
# 3: It has to prefer matches with longer index words
# 3: Den måste föredra matchningar med längre index ord


In [None]:
# Scratch check: order strings from longest to shortest.
test = ["a", "ac", "abc"]
test = list(test)
test.sort(key=len, reverse=True)
print(test)

In [None]:
# Loop through the whole volume file. Header lines ("page_number=..., index=...")
# switch to a new page and load that page's index words. Every other line is cut
# at the first MAX_ENTRY_LENGTH characters (lines already end at the newline) and
# compared with the remaining index words; a line that matches an index word is
# saved as an entry in the JSON file.

#TEMPORARY
edition_nbr = 2
volume_nbr = 1

data = []
entry_nbr = 0
page_nbr = 0
index = []
is_entry = False
headword = ""
index_hits = 0

with open(folder + "ba.txt", "r", encoding='utf-8') as volume:
    for line in volume:
        pagenbr_matches = re.search(r'page_number=(\d+)', line)
        if pagenbr_matches:
            page_nbr = int(pagenbr_matches.group(1))
            if page_nbr == 796: # REMOVE
                break           # REMOVE
            entry_nbr = 0
            start_index = line.find(INDEX_STRING)
            index = prep_index(line[start_index + len(INDEX_STRING):]) #Identical for lines on same page
            index = sorted(index, key=len, reverse=True) # To solve problem (Arm, Armadillo)
            print(f"page_nbr: {page_nbr}: ", index)
            continue

        line = line.rstrip()[:MAX_ENTRY_LENGTH] # :200
        # if line.startswith("<b>"):
        #     is_entry = True
        #     headword = ""
        #     matches = re.findall(r'<b>(.*?)<\/b>', line)
        #     if matches:
        #         headword = re.sub(r'[,.]$', '', matches[0])

        # `index` is empty before the first page header and again once every
        # index word of the page has been matched, so check before reading
        # index[0].
        if index and "..." not in index[0]:
            smallest_dist = INF
            smallest_index = -1
            print(len(index))
            for i, index_word in enumerate(index):
                temp_line, temp_index = pre_dist_clean(line, index_word)
                # dist = (relative_edit_distance(temp_line, index_word))
                # if dist < smallest_dist:
                #     smallest_dist = dist
                #     smallest_index = i
                if relative_edit_distance(temp_line[:len(temp_index)], temp_index) < INDEX_SEGMENTER_THRESHOLD:
                    smallest_dist = 0
                    smallest_index = i
                    print(f"smallest_dist = {smallest_dist}, Index example: line = {line[:20]}, index_word: {index[smallest_index]}")
                    # Each index word is used for at most one line.
                    index.pop(smallest_index)
                    index_hits += 1
                    # Mark the line as an entry; without this the block below
                    # never runs and the JSON file stays empty.
                    is_entry = True
                    headword = index_word
                    break
            if smallest_index == -1:
                print(f"NOT FOUND FOR: line = {line[:20]}")

        if is_entry:
            item = {
                "headword": headword,
                "entryid": f"e{edition_nbr}_v{volume_nbr}_{page_nbr}_{entry_nbr}",
                "text": line,
                "type": 0,
                "qid": "0",
                "first_edition_key": "",
                "fourth_edition_key": ""
            }
            data.append(item)
            entry_nbr += 1
            is_entry = False

        #else: use index or neural network

# Write (not append): appending a second JSON array on a re-run would leave a
# file that is no longer valid JSON.
with open("nf.json", 'w', encoding='utf-8') as json_file:
    json.dump(data, json_file, ensure_ascii=False, indent=4)
print(f"index hits: {index_hits}")

In [None]:
# Scratch check: relative distance between a short text fragment and a much
# longer index entry, compared without truncating either string.
relative_edit_distance("Argus IV, en ", "Argus-fjärilen, äfven kallad Allmänna blåvingen, Lycæna argus, zool.")

In [None]:
import random

# Scratch check: removing the element at position 2 shifts the later ones down.
characters = ['a', 'b', 'c', 'd', 'f']
del characters[2]
print(characters)
