# Scraper

This notebook is for scraping and segmenting the first and second editions of Nordisk Familjebok.

In [1]:
from urllib.request import urlopen
import regex as re
import json
import numpy as np
from tqdm import tqdm
import pickle
import joblib
import regex_utils as ru
import classification_utils as cu

# Sentinel "infinity": used as the open-ended page boundary in the
# *_volume_letters tables and as the initial smallest distance when searching
# the page index.
INF = 10**9
# Lines read back from the volume files are cut to this many characters.
MAX_ENTRY_LENGTH = 200
INDEX_SEGMENTER_THRESHOLD = 0.15 #relative edit distance threshold
# Root folder for the scraped volume text files.
ENCYCLOPEDIAS_FOLDER = "encyclopedias/"
# Markers written on the header line that precedes every scraped page.
PAGE_NUMBER_STRING = "page_number="
INDEX_STRING = "index="
# Upper-case letters a line may start with; used when selecting non-entry
# lines for the training data.
ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZÅÄÖÜ"

# Delimiters around the per-page index in the downloaded HTML.
INDEX_BEGIN = "<b>On this page / på denna sida</b>\n"
INDEX_END = "<p>"

# Delimiters around the page text in the downloaded HTML.
DELIM_BEGIN = "<!-- mode=normal -->"
DELIM_END = "<!-- NEWIMAGE2 -->"

# [search, replacement] pairs applied in order to the scraped HTML.
# "&amp;" is replaced last, so an escaped entity such as "&amp;lt;" is decoded
# only once (to "&lt;") instead of twice.
html_entities = [
                ["&quot;", "\""],
                ["&rsquo;", "\'"],
                ["&lsquo;", "\'"],
                ["&ndash;", "-"],
                ["<br>", ""],
                ['<span class="sp">', ""],
                ['<span class="overline">', ""],
                ['<span class="sc">', ""],
                ["</span>", ""],
                ["&lt;", "<"],
                ["&gt;", ">"],
                ["&nbsp;", " "],
                ["&amp;", "&"],
                ]

# Tags stripped from bold-matched lines before they are used as training
# examples: only the bold markers, italics are kept.
classifier_remove_tags = [
    ["<b>", ""],
    ["</b>", ""],
]

# Tags stripped before counting words: bold and italic markers.
word_frequencies_remove_tags = [
    ["<b>", ""],
    ["</b>", ""],
    ["<i>", ""],
    ["</i>", ""],
]

# Common URL prefix; a volume URL is base_url + two-letter volume key + "/".
base_url = "https://runeberg.org/nf"
# base_url = "http://runeberg.org/download.pl?mode=ocrtext&work=nf"

# Volume keys are two letters: each dict key is the first letter and its value
# lists the second letters that exist for it (e.g. 'a' + 'b' -> volume "ab").

edition1_url_range = {
    'a': "abcdefghijklmnopqr",
}

edition2_url_range = {
    'b': "abcdefghijklmnopqrst",
    'c': "abcdefghijklmn",
}

# First-edition volumes: volume key -> (first page, last page). The pair is
# passed to scrape_volume as volume_start_number / volume_end_number.
edition1_volume_start_end = {
    "aa": (9, 1579),
    "ab": (9, 800),
    "ac": (7, 798),
    "ad": (7, 797),
    "ae": (7, 798),
    "af": (5, 795),
    "ag": (7, 798),
    "ah": (5, 799),
    "ai": (7, 798),
    "aj": (7, 798),
    "ak": (7, 798),
    "al": (7, 798),
    "am": (7, 798),
    "an": (7, 798),
    "ao": (7, 798),
    "ap": (7, 826),
    "aq": (5, 804),
    "ar": (3, 403),
}

# First-edition volumes: volume key -> list of (initial letters, boundary page).
# INF marks a range that runs to the end of the volume.
# NOTE(review): presumably read the same way as edition2_volume_letters (letters
# apply up to and including the boundary page) -- no code in this notebook
# consumes this table yet, so confirm when it is wired in.
edition1_volume_letters = {
    "aa": [(["A"], 1383), (["B"], INF)],
    "ab": [(["B"], 751), (["C"], INF)], 
    "ac": [(["C"], 369), (["D"], INF)],
    "ad": [(["D"], 58), (["E"], 464), (["F"], INF)],
    "ae": [(["F"], 380), (["G"], INF)],
    "af": [(["G"], 220), (["H"], INF)],
    "ag": [(["H"], 196), (["I"], 489), (["J"], 778), (["K"], INF)],
    "ah": [(["K"], INF)], 
    "ai": [(["K"], 232), (["L"], INF)], 
    "aj": [(["L"], 255), (["M"], INF)],
    "ak": [(["M"], 380), (["N"], INF)],
    "al": [(["N"], 30), (["O"], 277), (["P"], INF)],
    "am": [(["P"], 262), (["Q"], 306), (["R"], INF)],
    "an": [(["R"], 147), (["S"], INF)],
    "ao": [(["S"], 641), (["T"], INF)],
    "ap": [(["T"], 625), (["U", "Ü"], INF)], #special case for Ü
    "aq": [(["V", "W"], INF)], #special case with W
    "ar": [(["V", "W"], 35), (["X"], 42), (["Y"], 78), (["Z"], 178), (["Å"], 243), (["Ä"], 277), (["Ö"], INF)] #special case with W
}

# Second-edition volumes: volume key -> (first page, last page). The pair is
# passed to scrape_volume as volume_start_number / volume_end_number.
edition2_volume_start_end = {
    "ba": (13, 824),
    "bb": (13, 798),
    "bc": (17, 808),
    "bd": (17, 814),
    "be": (17, 800),
    "bf": (17, 814),
    "bg": (17, 802),
    "bh": (17, 806),
    "bi": (17, 782),
    "bj": (17, 804),
    "bk": (17, 784),
    "bl": (17, 816),
    "bm": (17, 784),
    "bn": (17, 784),
    "bo": (17, 788),
    "bp": (17, 812),
    "bq": (17, 785),
    "br": (17, 779),
    "bs": (17, 820),
    "bt": (17, 796),
    "ca": (17, 812),
    "cb": (17, 778),
    "cc": (17, 817),
    "cd": (17, 784),
    "ce": (17, 794),
    "cf": (17, 820),
    "cg": (17, 806),
    "ch": (17, 688),
    "ci": (17, 458),
    "cj": (17, 719),
    "ck": (17, 688),
    "cl": (17, 686),
    "cm": (17, 685),
    "cn": (17, 180),
}

# Second-edition volumes: volume key -> list of (initial letters, boundary page).
# The segmentation loops move on to the next tuple once the page number exceeds
# the boundary, so the letters apply up to and including that page. INF marks a
# range that runs to the end of the volume.
edition2_volume_letters = {
    "ba": [(["A"], INF)],
    "bb": [(["A"], 310), (["B"], INF)],
    "bc": [(["B"], INF)],
    "bd": [(["B"], 519), (["C"], INF)],
    "be": [(["C"], 558), (["D"], INF)],
    "bf": [(["D"], 678), (["E"], INF)],
    "bg": [(["E"], 651), (["F"], INF)],
    "bh": [(["F"], INF)],
    "bi": [(["F"], 281), (["G"], INF)],
    "bj": [(["G"], 506), (["H"], INF)],
    "bk": [(["H"], INF)],
    "bl": [(["H"], 180), (["I"], 611), (["J"], INF)],
    "bm": [(["J"], 275), (["K"], INF)],
    "bn": [(["K"], INF)],
    "bo": [(["K"], 385), (["L"], INF)],
    "bp": [(["L"],  INF)],
    "bq": [(["L"], 180), (["M"], INF)],
    "br": [(["M"], INF)],
    "bs": [(["M"], 213), (["N"], INF)],
    "bt": [(["N"], 213), (["O"], 641), (["P"], INF)],
    "ca": [(["P"], INF)],
    "cb": [(["P"], 385), (["Q"], 418), (["R"], INF)],
    "cc": [(["R"], INF)],
    "cd": [(["R"], 136), (["S"], INF)],
    "ce": [(["S"], INF)],
    "cf": [(["S"], INF)],
    "cg": [(["S"], INF)],
    "ch": [(["S"], 138), (["T"], INF)],
    "ci": [(["T"], INF)],
    "cj": [(["T"], 441), (["U"], INF)],
    "ck": [(["U"], 116), (["V"], INF)],
    "cl": [(["V", "W"], INF)], #Special case with W
    "cm": [(["V", "W"], 281), (["X"], 291), (["Y"], 357), (["Z"], 488), (["Å"], 619), (["Ä"], INF)], #special case with W?
    "cn": [(["Ö"], INF)]
}

# Keys view over all second-edition volume keys, in table order.
edition2_volumes = edition2_volume_start_end.keys()

#folder to save the .txt files in
folder_edition1 = ENCYCLOPEDIAS_FOLDER + "first/"
folder_edition2 = ENCYCLOPEDIAS_FOLDER + "second/"


In [2]:
def get_substring_between_delimiters(s: str, start: str, end: str) -> str:
    """Return the text between the first `start` and the next `end` in `s`.

    The search for `end` begins right after `start`. Returns None when either
    delimiter cannot be found.
    """
    try:
        content_begin = s.index(start) + len(start)
        content_stop = s.index(end, content_begin)
    except ValueError:
        # str.index raises exactly where str.find would have returned -1.
        return None
    return s[content_begin:content_stop]

def clean_html_markup(s: str, html_entities: list) -> str:
    """Apply every [search, replacement] pair of `html_entities` to `s`, in order.

    Later pairs see the output of earlier ones, so the order of the list matters.
    """
    cleaned = s
    for entry in html_entities:
        needle, substitute = entry[0], entry[1]
        cleaned = cleaned.replace(needle, substitute)
    return cleaned

def remove_single_newline(s: str) -> str:
    """Join hard-wrapped lines and collapse blank-line runs.

    A newline with no newline neighbour becomes a space. In a run of two or
    more newlines, everything but the last newline is replaced by one space,
    so a paragraph break ends up as " \\n".
    """
    lone_newline = r'(?<!\n)\n(?!\n)'
    all_but_last_of_run = r'(\n+)(?=\n)'
    return re.sub(lone_newline + '|' + all_but_last_of_run, ' ', s)

def scrape_page_text_and_index(url: str) -> tuple[str, str]:
    """Download one page and return its cleaned (text, index).

    The index is the part of the HTML between INDEX_BEGIN and INDEX_END, the
    text the part between DELIM_BEGIN and DELIM_END. Both are passed through
    clean_html_markup and remove_single_newline.

    Returns (None, None) when the page cannot be opened. If the page opens but
    a delimiter pair is missing, the corresponding element is None.
    """
    try:
        page = urlopen(url)
    except Exception:
        # Best effort: a page that cannot be opened is reported as missing.
        # Unlike a bare `except:`, this lets KeyboardInterrupt and SystemExit
        # through, so a long scrape can still be interrupted.
        return None, None
    # Close the response once it is read instead of leaking the connection.
    with page:
        html = page.read().decode("utf-8")

    index = get_substring_between_delimiters(html, INDEX_BEGIN, INDEX_END)
    text = get_substring_between_delimiters(html, DELIM_BEGIN, DELIM_END)
    if index is not None:
        index = remove_single_newline(clean_html_markup(index, html_entities))
    if text is not None:
        text = remove_single_newline(clean_html_markup(text, html_entities))
    return text, index

def create_url(partial_url: str, i: int) -> str:
    """Build a page URL: `partial_url`, then `i` zero-padded to 4 digits, then ".html"."""
    page_name = format(i, "04d") + ".html"
    return partial_url + page_name

def scrape_volume(base_url: str, volume_start_number: int, volume_end_number: int = 9999999) -> str:
    """Scrape pages volume_start_number..volume_end_number (inclusive) of a volume.

    Each successfully scraped page contributes a header line
    "page_number=<i>, index=<index>" followed by the page text. Pages whose
    text or index could not be extracted are skipped.

    NOTE(review): the page text is appended as returned by the scraper, with no
    newline added after it, so the next header follows it directly.

    Returns the concatenation of all pages; "" when the range is empty.
    """
    # Collect the pieces and join once; repeated `+=` on a growing string
    # copies the whole volume for every page.
    pieces: list[str] = []
    for i in range(volume_start_number, volume_end_number + 1):
        text, index = scrape_page_text_and_index(create_url(base_url, i))
        if text is None or index is None:
            continue
        header = f"{PAGE_NUMBER_STRING}{i}, {INDEX_STRING}{index}\n"
        pieces.append(header)
        pieces.append(text)
        # Progress output: the tail of what has been collected so far.
        print(f"i = {i}: {(header + text)[-10:]}")
    return "".join(pieces)

## Getting the first edition (1800-tals utgåvan)

In [None]:
# --- SCRAPING ---
# Download every first-edition volume into folder_edition1/<volume>.txt.
first_letter_list = 'a'
for second_letter in edition1_url_range[first_letter_list]:
    volume_index = first_letter_list + second_letter
    volume_url = base_url + volume_index + "/"
    first_page, last_page = edition1_volume_start_end[volume_index]
    # Explicit UTF-8 (as for the second edition) so the Swedish text does not
    # depend on the platform default encoding; `with` closes the file even if
    # the scrape raises.
    with open(folder_edition1 + volume_index + ".txt", "w", encoding='utf-8') as f:
        print(volume_url)
        f.write(scrape_volume(volume_url,
            volume_start_number=first_page,
            volume_end_number=last_page))
    print(f"volume index: {volume_index}")

## Getting the second edition (ugglan)

In [None]:
# --- SCRAPING ---
# Download every second-edition volume into folder_edition2/<volume>.txt.
for first_letter_list in ('b', 'c'):
    for second_letter in edition2_url_range[first_letter_list]:
        volume_index = first_letter_list + second_letter
        volume_url = base_url + volume_index + "/"
        first_page, last_page = edition2_volume_start_end[volume_index]
        # `with` closes the file even if the scrape raises part-way through.
        with open(folder_edition2 + volume_index + ".txt", "w", encoding='utf-8') as f:
            print(volume_url)
            f.write(scrape_volume(volume_url,
                volume_start_number=first_page,
                volume_end_number=last_page))
        print(f"volume index: {volume_index}")

# volume_index = 'bo'
# f = open(folder + volume_index + ".txt", "w", encoding='utf-8')
# print(f"volume index: {volume_index}")
# volume_url = base_url + volume_index + "/"
# print(volume_url)
# if volume_index in ["ba", "bb"]:
#     text = scrape_volume(volume_url, volume_start_number_ba_bb)
#     print(text)
#     f.write(text)
# else:
#     f.write(scrape_volume(volume_url, volume_start_number))




## Functions for entry classification using the index

In [3]:
def index_string_to_list(index: str) -> list[str]:
    """Split a page index on " - " and return the stripped items after the first.

    The first item (whatever precedes the first " - ") is dropped.
    """
    items_after_first = index.split(" - ")[1:]
    return [item.strip() for item in items_after_first]

def clean_text_and_index(text_word: str, index_word: str) -> tuple[str, str]:
    """Normalise a text line and an index item so they can be compared.

    Bold and italic tags are removed from `text_word`. Then, in both strings,
    every "[...]" group and afterwards every "(...)" group is removed together
    with the whitespace in front of it.

    Returns the pair (cleaned text_word, cleaned index_word).
    """
    tags = [
        ["<b>", ""],
        ["</b>", ""],
        ["<i>", ""],
        ["</i>", ""],
    ]
    text_word = clean_html_markup(text_word, tags)

    # Square brackets first, then parentheses -- same order for both strings.
    for bracketed in (r'\s*\[(.*?)\]', r'\s*\((.*?)\)'):
        text_word = re.sub(bracketed, '', text_word)
        index_word = re.sub(bracketed, '', index_word)

    return text_word, index_word

def edit_distance(text_word: str, index_word: str) -> float:
    """Return the Levenshtein distance between `text_word` and `index_word`.

    Insertions, deletions and substitutions all cost 1. The value is read from
    a NumPy float matrix, so it is a NumPy float (a `float` subclass), not an
    `int`.
    """
    rows = len(text_word) + 1
    cols = len(index_word) + 1

    # distances[i][j] = distance between text_word[:i] and index_word[:j].
    distances = np.zeros((rows, cols))
    distances[:, 0] = np.arange(rows)
    distances[0, :] = np.arange(cols)

    for t1 in range(1, rows):
        for t2 in range(1, cols):
            if text_word[t1 - 1] == index_word[t2 - 1]:
                # Matching characters cost nothing.
                distances[t1][t2] = distances[t1 - 1][t2 - 1]
            else:
                # Cheapest of insertion, deletion and substitution, plus one.
                distances[t1][t2] = 1 + min(
                    distances[t1][t2 - 1],
                    distances[t1 - 1][t2],
                    distances[t1 - 1][t2 - 1],
                )

    return distances[rows - 1][cols - 1]

def print_distances(distances, token1_length, token2_length):
    """Print the first token1_length + 1 rows and token2_length + 1 columns of `distances`.

    Every value is converted to int and followed by one space; each row ends
    with a newline.
    """
    column_count = token2_length + 1
    for row in range(token1_length + 1):
        cells = (f"{int(distances[row][col])} " for col in range(column_count))
        print("".join(cells))

def relative_edit_distance(text_word: str, index_word: str) -> float:
    """Return the edit distance divided by the length of `index_word`.

    An empty `index_word` (for example an index item that was nothing but a
    bracketed group before cleaning) yields infinity, so it never falls below
    a matching threshold. Previously this case divided by zero and only
    avoided an exception because the numerator was a NumPy float.
    """
    if not index_word:
        return float("inf")
    return edit_distance(text_word, index_word) / len(index_word)

## Classification

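Each line is classified in up to three steps. First, if the line starts with a bold word beginning with one of the letters expected on the current page, we treat it as an entry.

If not, and the page has a usable index (non-empty and with no truncated "..." items), we look for an index item that matches the start of the line.

If the page has no usable index, we fall back to a logistic regression classifier.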

In [4]:
# Segment every second-edition volume into entries and write them to nf.json.
# Each text line is tested with bold matching, then index matching or the
# classifier, as described above.

# Classifier model
# Load the pre-trained logistic regression model from disk
model = joblib.load('logistic_regression_model.pkl')

#TODO: remove this; it is really ugly and should not be done this way
frequencies, max_frequency = cu.get_word_frequencies()

#TEMPORARY
edition_nbr = 2

data = []             # accumulated entry dicts, dumped to nf.json at the end
entry_nbr = 0         # running entry number within the current page
page_nbr = 0          # page number taken from the latest header line
index = []            # index items of the current page that are still unmatched
is_entry = False      # set when the current line was recognised as an entry
bold_hits = 0
index_hits = 0
classifier_hits = 0
first_letter_list: list[str] = []   # initial letters expected on the current page
first_letter_boundary = 0           # last page for which first_letter_list applies
volume_letters_index = -1           # position in edition2_volume_letters[volume]
for volume in edition2_volumes:
    # Restart the letter-range tracking for every volume.
    first_letter_boundary = 0
    volume_letters_index = -1
    page_nbr = 0
    with open(folder_edition2 + f"{volume}.txt", "r", encoding='utf-8') as f:
        for line in f:
            # Built before the line is inspected; on text lines page_nbr and
            # entry_nbr already describe the current page.
            entryid = f"e{edition_nbr}_{volume}_{page_nbr}_{entry_nbr}"
            pagenbr_matches = re.search(r'page_number=(\d+)', line)
            if pagenbr_matches:
                # Header line: new page, possibly a new letter range, new index.
                page_nbr = int(pagenbr_matches.group(1))
                if page_nbr > first_letter_boundary:
                    volume_letters_index += 1
                    first_letter_list = edition2_volume_letters[volume][volume_letters_index][0]
                    first_letter_boundary = edition2_volume_letters[volume][volume_letters_index][1]
                entry_nbr = 0
                start_index = line.find(INDEX_STRING)
                index = index_string_to_list(line[start_index + len(INDEX_STRING):]) #Identical for lines on same page
                # Longest items first, so a short item cannot win over a longer
                # one that starts with it.
                index = sorted(index, key=len, reverse=True) # To solve problem (Arm, Armadillo)
                print(f"page_nbr: {page_nbr}: ", index)
            else:
                line = line.rstrip()[:MAX_ENTRY_LENGTH] # :200
                if line:
                    # --- BOLD MATCHING ---
                    # The line starts with a bold word beginning with an expected letter.
                    if line.startswith(tuple([f"<b>{l}" for l in first_letter_list])):
                        is_entry = True
                        headword = ""
                        matches = re.findall(r'<b>(.*?)<\/b>', line)
                        bold_hits += 1
                        if matches:
                            # First bold span, without a trailing comma or period.
                            headword = re.sub(r'[,.]$', '', matches[0])

                    # Plain-text candidate: expected first letter, more than 40
                    # characters, and not a short (< 75 characters) line
                    # containing ". Se ".
                    elif line and line[0] in first_letter_list and (len(line) > 40) and not (len(line) < 75 and line.find(". Se ") != -1): # Removing special case

                        # --- INDEX MATCHING ---    
                        # Used only when the page has an index and none of its items contains "...".
                        if index and not any("..." in s for s in index): # index 
                            smallest_dist = INF
                            smallest_index = -1
                            for i, index_word in enumerate(index):
                                temp_line, temp_index = clean_text_and_index(line, index_word)
                                # Compare the index item with an equally long prefix of the line.
                                if relative_edit_distance(temp_line[:len(temp_index)], temp_index) < INDEX_SEGMENTER_THRESHOLD: 
                                    headword = index_word
                                    is_entry = True
                                    # Each index item may be matched once; popping is safe
                                    # because the loop ends right after.
                                    index.pop(i)
                                    index_hits += 1
                                    # print(f"Line = {line[:20]}, Index_word: {headword}")
                                    break
                            if not is_entry:
                                print(f"NOT FOUND IN INDEX: {line[:20]}")
                        
                        # --- CLASSIFIER MATCHING ---
                        else: 
                            x = cu.line_to_datapoint(line, frequencies, max_frequency)
                            if model.predict(x)[0] == 1:
                                is_entry = True
                                classifier_hits += 1
                                # NOTE(review): assumes words_in_line returns at least one word -- confirm.
                                headword = ru.words_in_line(line)[0]
                            else:
                                print(f"NON-ENTRY ACCORDING TO CLASSIFIER: {line[:20]}")

                        
                    if is_entry:
                        item = {
                            "headword": headword,
                            "entryid": entryid,
                            "text": line,
                            "type": 0,
                            "qid": "0",
                            "first_edition_key": "",
                            "fourth_edition_key": ""
                        }
                        data.append(item)
                        entry_nbr += 1
                        is_entry = False

        
with open("nf.json", 'w', encoding='utf-8') as json_file:
    json.dump(data, json_file, ensure_ascii=False, indent=4)

print(f"Bold hits: {bold_hits}")
print(f"Index hits: {index_hits}")
print(f"Classifier hits: {classifier_hits}")

page_nbr: 13:  ['A är den första bokstafven']
page_nbr: 14:  ['Aachen. 1. Regeringsområde i preussiska Rhenprovinsen', 'Aabenraa, ty. Apenrade, stad i Schleswig (Slesvig)', 'A. a. C., förkortning för Anno ante Christum natum', 'a. a., förkortning af lat. ad acta. Se Acta.', 'A A A., förkortning för Amalgam (se d. o.).', 'Aachen. 2. Hufvudort i nyssnämnda område', 'A, latinsk preposition. Se Ab.', 'À, fr. (lat. ad), för, till', 'Aa, namn på en mängd floder', 'A är den första bokstafven', 'aa l. aa, på recept', 'Aach. Se Aa.']
NOT FOUND IN INDEX: Af ännu större betyd
page_nbr: 15:  ['A. a. C. n., förkortning för Anno ante Christum natum', 'A. ær. vulg., förkortning för Anno ærae vulgaris', 'Aachen. 2. Hufvudort i nyssnämnda område', 'Aakirkeby l. Åkirkeby, stad på Bornholm', 'Aalbæk-bukten, på Jyllands östra kust', 'Aagesön, Svend (lat. Sveno Agonis)', 'Aalborg, stad i norra Jylland', 'Aagaard, Karl Frederik', 'Aalberg, Ida Emilia', 'Aachen, Hans von', 'Aagesen, Andreas', 'Aak']
page_nbr

AttributeError: 'str' object has no attribute 'close'

## Getting word frequencies

In [None]:
#volume = open(folder_edition2 + "bo.txt", "r", encoding='utf-8')

# Count how often every lower-cased word occurs in the second-edition volumes
# (looking only at the first MAX_ENTRY_LENGTH characters of each line) and
# pickle the counts to word_frequencies.pickle.

volumes = edition2_volume_start_end.keys()

occurences = {}   # lower-cased word -> number of occurrences

total_words = 0

page_nbr = 0
for volume in volumes:
    with open(folder_edition2 + f"{volume}.txt", "r", encoding='utf-8') as f:
        for line in tqdm(f):
            pagenbr_matches = re.search(r'page_number=(\d+)', line)
            if pagenbr_matches:
                # Header line: remember the page number, count nothing.
                page_nbr = int(pagenbr_matches.group(1))
                continue

            line = line.rstrip()[:MAX_ENTRY_LENGTH]
            if not line:
                continue

            line = clean_html_markup(line, word_frequencies_remove_tags)
            words = [word.lower() for word in ru.words_in_line(line)]
            total_words += len(words)
            for word in words:
                occurences[word] = occurences.get(word, 0) + 1

occurences_file = 'word_frequencies.pickle'

with open(occurences_file, 'wb') as handle:
    pickle.dump(occurences, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Sanity-check output. The probe words may be missing and the corpus may be
# empty, so use .get() and a non-zero denominator instead of failing after the
# counts have already been written.
ett_count = occurences.get('ett', 0)
monopolsystemet_count = occurences.get('monopolsystemet', 0)
denominator = total_words or 1
print(f"total_words: {total_words}, occurences[\"ett\"] = {ett_count}, frequency = {ett_count / denominator}, freq(monopolsystemet) = {monopolsystemet_count / denominator} ")

## Creating (automatically) annotated training and test data

In [None]:
# Build automatically labelled examples for the entry classifier and write
# them to training_data.json. Bold lines that start with an expected letter
# are labelled 1; lines that start with a different upper-case letter are
# labelled 0.
volumes = edition2_volume_start_end.keys()

labeled_data = []

frequencies, max_frequency = cu.get_word_frequencies()
page_nbr = 0
is_entry = False      # here: "the current line received a label"
first_letter_list: list[str] = []   # initial letters expected on the current page
first_letter_boundary = 0           # last page for which first_letter_list applies
volume_letters_index = -1           # position in edition2_volume_letters[volume]
for volume in volumes:
    # Restart the letter-range tracking for every volume.
    first_letter_boundary = 0
    volume_letters_index = -1
    page_nbr = 0
    with open(folder_edition2 + f"{volume}.txt", "r", encoding='utf-8') as f:
    
        for line in tqdm(f):
            pagenbr_matches = re.search(r'page_number=(\d+)', line)
            if pagenbr_matches:
                # Header line: update the page number and, past the boundary,
                # move on to the next letter range.
                page_nbr = int(pagenbr_matches.group(1))
                if page_nbr > first_letter_boundary:
                    volume_letters_index += 1
                    first_letter_list = edition2_volume_letters[volume][volume_letters_index][0]
                    # try:
                    first_letter_boundary = edition2_volume_letters[volume][volume_letters_index][1]
                    # except:
                    #     print(f"volume = {volume}, volume_letters_index = {volume_letters_index}, page_nbr: {page_nbr}, ")
                    #     break
            else:
                line = line.rstrip()[:MAX_ENTRY_LENGTH]
                # Keep lines longer than 40 characters, except those of at most
                # 75 characters that contain ". Se ".
                if line and (len(line) > 40) and (len(line) > 75 or line.find(". Se ") == -1):
                    item = {}
                    # --- BOLD MATCHING --- create ground truth
                    if line.startswith(tuple([f"<b>{l}" for l in first_letter_list])):
                        # Positive example; the bold tags are removed so they
                        # cannot leak into the features.
                        line = clean_html_markup(line, classifier_remove_tags)
                        item["class"] = 1
                        is_entry = True


                    # Negative example: starts with an upper-case letter that is
                    # not expected on this page, excluding two known line prefixes.
                    elif line and (not line.startswith("Fig. ")) and (not line.startswith("Ord, som saknas under K")) and (not (line[0] in first_letter_list)) and line[0] in ALPHABET:
                        item["class"] = 0
                        is_entry = True
                    
                    if is_entry:
                        # regexes
                        item["punctuation_after_first_word"] = ru.punctuation_after_first_word(line)
                        item["square_bracket"] = ru.square_bracket(line)
                        item["square_bracket_with_punctuation"] = ru.square_bracket_with_punctuation(line)
                        item["parentheses"] = ru.parentheses(line)
                        item["parentheses_with_punctuation"] = ru.parentheses_with_punctuation(line)
                        item["category_word"] = ru.category_word(line)
                        
                        #first word frequency
                        # Bold and italic tags are stripped only after the regex
                        # features above have been computed.
                        line = clean_html_markup(line, word_frequencies_remove_tags)
                        # NOTE(review): assumes words_in_line returns at least one word -- confirm.
                        word = ru.words_in_line(line)[0].lower()
                        item["first_word_frequency"] = cu.relative_word_frequency(word, frequencies, max_frequency)                    
                        item["text"] = line #this one should be last

                        labeled_data.append(item)
                        is_entry = False

with open('training_data.json', 'w', encoding='utf-8') as outfile:
    print("MAKE TO JSON")
    json.dump(labeled_data, outfile, ensure_ascii=False, indent=4)