# Scraper

This notebook is for scraping and segmenting the first and second editions of Nordisk Familjebok.

In [1]:
from urllib.request import urlopen
import regex as re
import json
import numpy as np
from tqdm import tqdm

# Sentinel "no upper bound" page number; used as the boundary of the last
# letter range in the *_volume_letters tables below.
INF = 10**9
# Lines read from a scraped volume are truncated to this many characters
# before they are matched and stored.
MAX_ENTRY_LENGTH = 200
# Maximum relative edit distance (edit distance / length of the index word)
# for a line prefix to count as a match against an index word.
INDEX_SEGMENTER_THRESHOLD = 0.15
# Root folder of the scraped volume text files.
ENCYCLOPEDIAS_FOLDER = "encyclopedias/"
# Markers written by scrape_volume at the start of every scraped page:
# "page_number=<n>, index=<index text>".
PAGE_NUMBER_STRING = "page_number="
INDEX_STRING = "index="

# Delimiters around the per-page index in the downloaded HTML.
INDEX_BEGIN = "<b>On this page / på denna sida</b>\n"
INDEX_END = "<p>"

# Delimiters around the page text in the downloaded HTML.
DELIM_BEGIN = "<!-- mode=normal -->"
DELIM_END = "<!-- NEWIMAGE2 -->"

# [search, replacement] pairs used to strip markup from and decode entities in
# the scraped HTML. clean_html_markup applies them in list order, so "&amp;"
# is deliberately last: text such as "&amp;lt;" decodes to "&lt;" and is not
# decoded a second time into "<".
html_entities = [
                ["&quot;", "\""],
                ["&rsquo;", "\'"],
                ["&lsquo;", "\'"],
                ["&ndash;", "-"],
                ["<br>", ""],
                ['<span class="sp">', ""],
                ['<span class="overline">', ""],
                ['<span class="sc">', ""],
                ["</span>", ""],
                ["&lt;", "<"],
                ["&gt;", ">"],
                ["&nbsp;", " "],
                ["&amp;", "&"],
                ]

# Common URL prefix of every volume. The scraping cells append the two-letter
# volume code and "/", and create_url appends the zero-padded page number and
# ".html".
base_url = "https://runeberg.org/nf"
# base_url = "http://runeberg.org/download.pl?mode=ocrtext&work=nf"

# Two-letter volume codes: each key is the first letter of the code and the
# value lists the valid second letters (e.g. 'a' + 'b' -> volume "ab").

# First edition: volumes "aa" through "ar".
edition1_url_range = {
    'a': "abcdefghijklmnopqr",
}

# Second edition: volumes "ba" through "bt" and "ca" through "cn".
edition2_url_range = {
    'b': "abcdefghijklmnopqrst",
    'c': "abcdefghijklmn",
}

# First edition: volume code -> (first page number, last page number).
# The scraping cell passes these as volume_start_number / volume_end_number
# to scrape_volume, which formats each number into the page URL.
edition1_volume_start_end = {
    "aa": (9, 1579),
    "ab": (9, 800),
    "ac": (7, 798),
    "ad": (7, 797),
    "ae": (7, 798),
    "af": (5, 795),
    "ag": (7, 798),
    "ah": (5, 799),
    "ai": (7, 798),
    "aj": (7, 798),
    "ak": (7, 798),
    "al": (7, 798),
    "am": (7, 798),
    "an": (7, 798),
    "ao": (7, 798),
    "ap": (7, 826),
    "aq": (5, 804),
    "ar": (3, 403),
}

# First edition: volume code -> ordered list of (initial letters, boundary
# page) pairs describing where the headword initial changes within a volume.
# INF marks the last range of a volume.
# NOTE(review): no consumer of this table is visible in this notebook;
# presumably it is read the same way as edition2_volume_letters (letters
# apply up to and including the boundary page) - confirm before relying on it.
edition1_volume_letters = {
    "aa": [(["A"], 1383), (["B"], INF)],
    "ab": [(["B"], 751), (["C"], INF)], 
    "ac": [(["C"], 369), (["D"], INF)],
    "ad": [(["D"], 58), (["E"], 464), (["F"], INF)],
    "ae": [(["F"], 380), (["G"], INF)],
    "af": [(["G"], 220), (["H"], INF)],
    "ag": [(["H"], 196), (["I"], 489), (["J"], 778), (["K"], INF)],
    "ah": [(["K"], INF)], 
    "ai": [(["K"], 232), (["L"], INF)], 
    "aj": [(["L"], 255), (["M"], INF)],
    "ak": [(["M"], 380), (["N"], INF)],
    "al": [(["N"], 30), (["O"], 277), (["P"], INF)],
    "am": [(["P"], 262), (["Q"], 306), (["R"], INF)],
    "an": [(["R"], 147), (["S"], INF)],
    "ao": [(["S"], 641), (["T"], INF)],
    "ap": [(["T"], 625), (["U", "Ü"], INF)], # two accepted initials: U and Ü
    "aq": [(["V", "W"], INF)], # two accepted initials: V and W
    "ar": [(["V", "W"], 35), (["X"], 42), (["Y"], 78), (["Z"], 178), (["Å"], 243), (["Ä"], 277), (["Ö"], INF)] # V and W accepted together in the first range
}

# Second edition: volume code -> (first page number, last page number).
# The scraping cell passes these as volume_start_number / volume_end_number
# to scrape_volume, which formats each number into the page URL.
edition2_volume_start_end = {
    "ba": (13, 824),
    "bb": (13, 798),
    "bc": (17, 808),
    "bd": (17, 814),
    "be": (17, 800),
    "bf": (17, 814),
    "bg": (17, 802),
    "bh": (17, 806),
    "bi": (17, 782),
    "bj": (17, 804),
    "bk": (17, 784),
    "bl": (17, 816),
    "bm": (17, 784),
    "bn": (17, 784),
    "bo": (17, 788),
    "bp": (17, 812),
    "bq": (17, 785),
    "br": (17, 779),
    "bs": (17, 820),
    "bt": (17, 796),
    "ca": (17, 812),
    "cb": (17, 778),
    "cc": (17, 817),
    "cd": (17, 784),
    "ce": (17, 794),
    "cf": (17, 820),
    "cg": (17, 806),
    "ch": (17, 688),
    "ci": (17, 458),
    "cj": (17, 719),
    "ck": (17, 688),
    "cl": (17, 686),
    "cm": (17, 685),
    "cn": (17, 180),
}

# Second edition: volume code -> ordered list of (initial letters, boundary
# page) pairs. The segmentation cell starts with the first pair and moves on
# to the next one as soon as a page number is greater than the current
# boundary, so each letter list applies up to and including its boundary page.
# INF marks the last range of a volume. Every element must be a 2-tuple,
# because the segmenter reads element [0] (letters) and [1] (boundary).
edition2_volume_letters = {
    "ba": [(["A"], INF)],
    "bb": [(["A"], 310), (["B"], INF)],
    "bc": [(["B"], INF)],
    "bd": [(["B"], 519), (["C"], INF)],
    "be": [(["C"], 558), (["D"], INF)],
    "bf": [(["D"], 678), (["E"], INF)],
    "bg": [(["E"], 651), (["F"], INF)],
    "bh": [(["F"], INF)],
    "bi": [(["F"], 281), (["G"], INF)],
    "bj": [(["G"], 506), (["H"], INF)],
    "bk": [(["H"], INF)],
    "bl": [(["H"], 180), (["I"], 611), (["J"], INF)],
    "bm": [(["J"], 275), (["K"], INF)],
    "bn": [(["K"], INF)],
    "bo": [(["K"], 385), (["L"], INF)],
    "bp": [(["L"], INF)],
    "bq": [(["L"], 180), (["M"], INF)],
    "br": [(["M"], INF)],
    "bs": [(["M"], 213), (["N"], INF)],
    "bt": [(["N"], 213), (["O"], 641), (["P"], INF)],
    "ca": [(["P"], INF)],
    "cb": [(["P"], 385), (["Q"], 418), (["R"], INF)],
    "cc": [(["R"], INF)],
    # Fixed: the closing parenthesis used to sit before INF, which turned this
    # entry into [tuple, list, int] and broke the segmenter on volume "cd".
    "cd": [(["R"], 136), (["S"], INF)],
    "ce": [(["S"], INF)],
    "cf": [(["S"], INF)],
    "cg": [(["S"], INF)],
    "ch": [(["S"], 138), (["T"], INF)],
    "ci": [(["T"], INF)],
    "cj": [(["T"], 441), (["U"], INF)],
    "ck": [(["U"], 116), (["V"], INF)],
    "cl": [(["V", "W"], INF)], # two accepted initials: V and W
    "cm": [(["V", "W"], 281), (["X"], 291), (["Y"], 357), (["Z"], 488), (["Å"], 619), (["Ä"], INF)], # special case with W?
    "cn": [(["Ö"], INF)]
}

# Folders holding one "<volume code>.txt" file per scraped volume.
# The scraping cells open files here directly for writing, so the folders
# have to exist before those cells are run.
folder_edition1 = ENCYCLOPEDIAS_FOLDER + "first/"
folder_edition2 = ENCYCLOPEDIAS_FOLDER + "second/"


In [2]:
def get_substring_between_delimiters(s: str, start: str, end: str) -> str:
    """Return the part of ``s`` enclosed by ``start`` and ``end``.

    The first occurrence of ``start`` is located, then the first occurrence
    of ``end`` that follows it; the text strictly between the two is
    returned. ``None`` is returned when either delimiter is missing.
    """
    try:
        content_begin = s.index(start) + len(start)
        content_stop = s.index(end, content_begin)
    except ValueError:
        # str.index raises exactly where str.find would report "not found".
        return None
    return s[content_begin:content_stop]

def clean_html_markup(s: str, html_entities: list) -> str:
    """Apply every ``[search, replacement]`` pair of ``html_entities`` to ``s``.

    The pairs are applied one after another in list order, each on the result
    of the previous replacement, and the final string is returned.
    """
    cleaned = s
    for entry in html_entities:
        needle, substitute = entry[0], entry[1]
        cleaned = cleaned.replace(needle, substitute)
    return cleaned

def remove_single_newline(s: str) -> str:
    """Join wrapped lines and collapse blank-line runs to one line break.

    A newline that has no newline next to it becomes a space. In a run of
    several newlines, everything except the last newline becomes one space,
    so ``"a\\n\\nb"`` turns into ``"a \\nb"``.
    """
    newline_pattern = re.compile(r'(?<!\n)\n(?!\n)|(\n+)(?=\n)')
    return newline_pattern.sub(' ', s)

def scrape_page_text_and_index(url: str) -> tuple[str, str]:
    """Download one page and return its cleaned ``(text, index)`` pair.

    The index is the HTML between INDEX_BEGIN and INDEX_END and the text is
    the HTML between DELIM_BEGIN and DELIM_END. Both are passed through
    clean_html_markup (with the module-level ``html_entities``) and
    remove_single_newline.

    Returns ``(None, None)`` when the URL cannot be opened. Either element is
    ``None`` on its own when its delimiters are not found in the page.
    """
    try:
        page = urlopen(url)
    except Exception:
        # Deliberate best-effort: a page that cannot be fetched is skipped by
        # the caller. Catching Exception (not a bare ``except``) keeps
        # Ctrl-C / SystemExit able to stop a long scraping run.
        return None, None
    # Close the HTTP response as soon as the body has been read.
    with page:
        html = page.read().decode("utf-8")

    index = get_substring_between_delimiters(html, INDEX_BEGIN, INDEX_END)
    html = get_substring_between_delimiters(html, DELIM_BEGIN, DELIM_END)
    if index is not None:
        index = clean_html_markup(index, html_entities)
        index = remove_single_newline(index)
    if html is not None:
        html = clean_html_markup(html, html_entities)
        html = remove_single_newline(html)
    return html, index

def create_url(partial_url: str, i: int) -> str:
    """Return the URL of page ``i``: ``partial_url`` + 4-digit page + ".html".

    The page number is zero-padded to at least four digits.
    """
    page_file = "{:04d}.html".format(i)
    return partial_url + page_file

def scrape_volume(base_url: str, volume_start_number: int, volume_end_number: int = 9999999) -> str:
    """Scrape pages ``volume_start_number..volume_end_number`` of one volume.

    Every page is fetched with scrape_page_text_and_index. A page for which
    either the text or the index is missing is skipped. Each remaining page
    contributes::

        page_number=<i>, index=<index>\\n<text>

    and the concatenation of all those blocks is returned.

    Args:
        base_url: URL prefix of the volume, handed to create_url.
        volume_start_number: first page number to fetch (inclusive).
        volume_end_number: last page number to fetch (inclusive). Failed
            pages are skipped rather than ending the loop, so the default
            upper bound means millions of requests; pass the real last page.
    """
    # Collect the page blocks and join once instead of growing one string
    # with += for every page.
    pages: list[str] = []
    for i in range(volume_start_number, volume_end_number + 1):
        text, index = scrape_page_text_and_index(create_url(base_url, i))
        if text is None or index is None:
            continue
        page_block = f"{PAGE_NUMBER_STRING}{i}, {INDEX_STRING}{index}\n{text}"
        pages.append(page_block)
        # Progress output: the last characters scraped so far.
        print(f"i = {i}: {page_block[-10:]}")
    return "".join(pages)

## Getting the first edition (1800-tals utgåvan)

In [None]:
# --- SCRAPING ---
# Scrape every first-edition volume ("aa".."ar") into
# folder_edition1/<volume code>.txt.
first_letter_list = 'a'
for second_letter in edition1_url_range[first_letter_list]:
    volume_index = first_letter_list + second_letter
    volume_url = base_url + volume_index + "/"
    print(volume_url)
    start_page, end_page = edition1_volume_start_end[volume_index]
    # Write UTF-8 explicitly: the text contains Swedish characters and the
    # segmentation cells read these files back with encoding='utf-8'.
    # The with-block also closes the file if scraping raises.
    with open(folder_edition1 + volume_index + ".txt", "w", encoding="utf-8") as f:
        f.write(scrape_volume(volume_url,
            volume_start_number=start_page,
            volume_end_number=end_page))
    print(f"volume index: {volume_index}")

## Getting the second edition (ugglan)

In [None]:
# --- SCRAPING ---
# Scrape every second-edition volume ("ba".."bt", "ca".."cn") into
# folder_edition2/<volume code>.txt.
for first_letter_list in ('b', 'c'):
    for second_letter in edition2_url_range[first_letter_list]:
        volume_index = first_letter_list + second_letter
        volume_url = base_url + volume_index + "/"
        start_page, end_page = edition2_volume_start_end[volume_index]
        # Write UTF-8 explicitly: the text contains Swedish characters and
        # the segmentation cells read these files back with encoding='utf-8'.
        # The with-block also closes the file if scraping raises.
        with open(folder_edition2 + volume_index + ".txt", "w", encoding="utf-8") as f:
            f.write(scrape_volume(volume_url,
                volume_start_number=start_page,
                volume_end_number=end_page))
        print(f"volume index: {volume_index}")
        print(volume_url)

# volume_index = 'bo'
# f = open(folder + volume_index + ".txt", "w", encoding='utf-8')
# print(f"volume index: {volume_index}")
# volume_url = base_url + volume_index + "/"
# print(volume_url)
# if volume_index in ["ba", "bb"]:
#     text = scrape_volume(volume_url, volume_start_number_ba_bb)
#     print(text)
#     f.write(text)
# else:
#     f.write(scrape_volume(volume_url, volume_start_number))




## Functions for entry classification using the index

In [3]:
def index_string_to_list(index: str) -> list[str]:
    """Split a page index string on " - " and return the stripped entries.

    The piece in front of the first " - " separator is discarded.
    """
    _, *entries = index.split(" - ")
    return [entry.strip() for entry in entries]

def clean_text_and_index(text_word: str, index_word: str) -> tuple[str, str]:
    """Normalise a text line and an index word before they are compared.

    ``<b>``, ``</b>``, ``<i>`` and ``</i>`` are removed from ``text_word``.
    After that, every ``[...]`` and ``(...)`` group, together with the
    whitespace in front of it, is removed from both strings.

    Returns:
        The cleaned ``(text_word, index_word)`` pair.
    """
    tags = [
        ["<b>", ""],
        ["</b>", ""],
        ["<i>", ""],
        ["</i>", ""],
    ]
    text_word = clean_html_markup(text_word, tags)

    # Square brackets first, then parentheses - the same order for both strings.
    for pattern in (r'\s*\[(.*?)\]', r'\s*\((.*?)\)'):
        text_word = re.sub(pattern, '', text_word)
        index_word = re.sub(pattern, '', index_word)

    return text_word, index_word

def edit_distance(text_word: str, index_word: str) -> int:
    """Return the edit distance between ``text_word`` and ``index_word``.

    Insertions, deletions and substitutions each cost 1. The distance table
    is a NumPy float array, so the result is a whole number returned as a
    NumPy float.
    """
    rows = len(text_word) + 1
    cols = len(index_word) + 1

    # table[r][c] = distance between text_word[:r] and index_word[:c].
    table = np.zeros((rows, cols))
    table[:, 0] = np.arange(rows)
    table[0, :] = np.arange(cols)

    for r, text_char in enumerate(text_word, start=1):
        for c, index_char in enumerate(index_word, start=1):
            diagonal = table[r - 1][c - 1]
            if text_char == index_char:
                # Matching characters cost nothing.
                table[r][c] = diagonal
            else:
                # Cheapest of insertion, deletion and substitution, plus one.
                table[r][c] = min(table[r][c - 1], table[r - 1][c], diagonal) + 1

    return table[rows - 1][cols - 1]

def print_distances(distances, token1_length, token2_length):
    """Print the top-left part of a distance table, one row per line.

    Rows ``0..token1_length`` and columns ``0..token2_length`` are printed;
    every value is converted with ``int`` and followed by a single space.
    """
    for row in range(token1_length + 1):
        cells = [f"{int(distances[row][col])} " for col in range(token2_length + 1)]
        print("".join(cells))

def relative_edit_distance(text_word: str, index_word: str) -> float:
    """Return the edit distance divided by the length of ``index_word``.

    An empty ``index_word`` has no meaningful relative distance. Infinity is
    returned for it, so it can never fall below a matching threshold and no
    division by zero is attempted.
    """
    if not index_word:
        return float("inf")
    return edit_distance(text_word, index_word) / len(index_word)

In [None]:
# Segment one scraped second-edition volume into entries and append them to
# nf.json. An entry is detected either by a bold headword at the start of a
# line or by matching the line against the page index.
volume_index = "bo"

#TEMPORARY
edition_nbr = 2
volume_nbr = 1

data = []
entry_nbr = 0   # running entry number within the current page
page_nbr = 0
index = []      # index words of the current page that are still unmatched
is_entry = False
bold_hits = 0
index_hits = 0
classifier_hits = 0
first_letter_list: list[str] = []   # headword initials valid on the current page
first_letter_boundary = 0           # last page on which first_letter_list applies
volume_letters_index = -1

# The with-block closes the volume file even if segmentation raises.
with open(folder_edition2 + f"{volume_index}.txt", "r", encoding='utf-8') as volume:
    for line in tqdm(volume):
        pagenbr_matches = re.search(r'page_number=(\d+)', line)
        if pagenbr_matches:
            # Page header line: update the page state and read the page index.
            page_nbr = int(pagenbr_matches.group(1))
            if page_nbr > first_letter_boundary:
                volume_letters_index += 1
                first_letter_list, first_letter_boundary = edition2_volume_letters[volume_index][volume_letters_index]
            entry_nbr = 0
            start_index = line.find(INDEX_STRING)
            index = index_string_to_list(line[start_index + len(INDEX_STRING):]) #Identical for lines on same page
            index = sorted(index, key=len, reverse=True) # To solve problem (Arm, Armadillo)
            print(f"page_nbr: {page_nbr}: ", index)
            continue

        line = line.rstrip()[:MAX_ENTRY_LENGTH]
        if not line:
            continue

        # --- BOLD MATCHING ---
        if line.startswith(tuple(f"<b>{l}" for l in first_letter_list)):
            is_entry = True
            headword = ""
            matches = re.findall(r'<b>(.*?)<\/b>', line)
            bold_hits += 1
            if matches:
                headword = re.sub(r'[,.]$', '', matches[0])

        elif line[0] in first_letter_list and (len(line) > 40 or " Se " in line): # Removing special case

            # --- INDEX MATCHING ---
            if index and "..." not in index[0]:
                for i, index_word in enumerate(index):
                    temp_line, temp_index = clean_text_and_index(line, index_word)
                    if relative_edit_distance(temp_line[:len(temp_index)], temp_index) < INDEX_SEGMENTER_THRESHOLD:
                        headword = index_word
                        is_entry = True
                        # Each index word may be matched once per page; popping
                        # is safe here because the loop stops right after.
                        index.pop(i)
                        index_hits += 1
                        print(f"Line = {line[:20]}, Index_word: {headword}")
                        break

            # --- CLASSIFIER MATCHING ---
            # elif line[0] not capital and not current uppslagsbokstav
            #else: use index or neural network

        if is_entry:
            item = {
                "headword": headword,
                "entryid": f"e{edition_nbr}_v{volume_nbr}_{page_nbr}_{entry_nbr}",
                "text": line,
                "type": 0,
                "qid": "0",
                "first_edition_key": "",
                "fourth_edition_key": ""
            }
            data.append(item)
            entry_nbr += 1
            is_entry = False
        else:
            print(f"NOT FOUND FOR: line = {line[:20]}")

# NOTE(review): append mode is kept from the original. Running this cell more
# than once writes several top-level JSON arrays into nf.json, which is not a
# valid JSON document - confirm whether "w" is what is wanted.
with open("nf.json", 'a', encoding='utf-8') as json_file:
    json.dump(data, json_file, ensure_ascii=False, indent=4)
print(f"Bold hits: {bold_hits}")
print(f"Index hits: {index_hits}")
print(f"Classifier hits: {classifier_hits}")

## Initial test for creating annotated training and test data

In [11]:
#take some real data, create regexes that can extract some features

#Comma or period immediately after first word
# r"^[\p{L}'\-]+(?=,|\.)"

#[ character within first 40 characters
# r"^.{0,40}\["

#[ character within first 40 characters, comma or period after ]
# r"^.{0,40}\[.{1,20}?\](?=\,|\.)"

#( character within first 40 characters
# r"^.{0,40}\("

#( character within first 40 characters, comma or period after )
# r"^.{0,40}\(.{1,20}?\)(?=\,|\.)"

#Category word (mus. , bygnk. , kem. ) after first comma or period
# r"^.{1,70}, \p{L}{1,11}\."

#Match first sentence and: . (Capital letter)
# r"^(.*?)\.\s\p{Lu}"

def extract_feature_with_regex(line:str, regex: str) -> int:
    """Return 1 when ``regex`` matches somewhere in ``line``, otherwise 0."""
    found = re.search(regex, line) is not None
    return int(found)

# Splits a line into its sentences; used to create negative classifications.
# Append ". X" to the line in the call to also get the last sentence.
def extract_sentences_from_line(line:str, sentences: list) -> list[str]:
    """Append the leading sentences of ``line`` to ``sentences`` and return it.

    A sentence is the shortest prefix that ends in a period plus one
    whitespace character and is followed by an uppercase letter. Sentences
    are taken off the front of the line until no such prefix is left; the
    remaining tail is not included.
    """
    remaining = line
    while True:
        boundary = re.search(r'^(.*?\.\s)\p{Lu}', remaining)
        if boundary is None:
            return sentences
        sentence = boundary.group(1)
        sentences.append(sentence)
        remaining = remaining[len(sentence):]

# Tags stripped from bold (positive) lines before their features are extracted.
classifier_remove_tags = [
    ["<b>", ""],
    ["</b>", ""],
]


def _extract_features(text: str) -> dict:
    """Return the regex-based 0/1 features of ``text`` keyed by feature name."""
    return {
        "punctuation_after_first_word": extract_feature_with_regex(text, r"^[\p{L}'\-]+(?=,|\.)"),
        "square_bracket": extract_feature_with_regex(text, r"^.{0,40}\["),
        "square_bracket_with_punctuation": extract_feature_with_regex(text, r"^.{0,40}\[.{1,20}?\](?=\,|\.)"),
        "parentheses": extract_feature_with_regex(text, r"^.{0,40}\("),
        "parentheses_with_punctuation": extract_feature_with_regex(text, r"^.{0,40}\(.{1,20}?\)(?=\,|\.)"),
        "category_word": extract_feature_with_regex(text, r"^.{1,70},\s+\p{L}{1,11}\."),
    }


# Volume files to build labelled data from; each is opened only while it is read.
volume_paths = [
    folder_edition2 + "bo.txt",
]

labeled_data = []
first_letter_list = ["K", "L"]

page_nbr = 143
for volume_path in volume_paths:
    with open(volume_path, "r", encoding='utf-8') as volume:
        for line in tqdm(volume):
            pagenbr_matches = re.search(r'page_number=(\d+)', line)
            if pagenbr_matches:
                page_nbr = int(pagenbr_matches.group(1))
                continue

            line = line.rstrip()[:MAX_ENTRY_LENGTH]
            if not line:
                continue

            # --- BOLD MATCHING --- create ground truth
            # A line starting in bold is a positive sample (class 1). A long
            # line, or one containing " Se ", that starts with one of the
            # current initials is a negative sample (class 0).
            line_class = None
            if line.startswith("<b>"):
                line = clean_html_markup(line, classifier_remove_tags)
                line_class = 1
            elif line[0] in first_letter_list and (len(line) > 40 or " Se " in line):
                line_class = 0

            if line_class is not None:
                # "class" first and "text" last, as in the JSON written before.
                labeled_data.append({"class": line_class, **_extract_features(line), "text": line})

            # Drop the first sentence and use the remaining ones as negative
            # samples. The uppercase letter is only looked ahead at: consuming
            # it (as the previous pattern did) cut the first character off the
            # next sentence.
            # NOTE(review): a line without any sentence boundary is left whole
            # here, so it is also emitted as a class-0 sample even when it was
            # labelled class 1 above - confirm whether that is intended.
            line = re.sub(r'^.*?\.\s(?=\p{Lu})', '', line)
            for sentence in extract_sentences_from_line(line + ". X", []):
                labeled_data.append({"class": 0, **_extract_features(sentence), "text": sentence})

with open('training_data.json', 'w', encoding='utf-8') as outfile:
    print("MAKE TO JSON")
    json.dump(labeled_data, outfile, ensure_ascii=False, indent=4)

9632it [00:01, 6968.85it/s]


MAKE TO JSON
