In [34]:
from urllib.request import urlopen
import regex as re
import json

# --- Output layout -----------------------------------------------------------
# Upper bound on the number of characters kept from a line when building entries.
MAX_ENTRY_LENGTH = 200
# Root folder for the scraped encyclopedia text files.
ENCYCLOPEDIAS_FOLDER = "encyclopedias/"
# Markers written in front of the page number and the page index in the volume files.
PAGE_NUMBER_STRING = "page_number="
INDEX_STRING = "index="

# --- Delimiters in the scraped HTML ------------------------------------------
# The per-page index sits between these two markers.
INDEX_BEGIN = "<b>On this page / på denna sida</b>\n"
INDEX_END = "<p>"

# The page body sits between these two markers.
DELIM_BEGIN = "<!-- mode=normal -->"
DELIM_END = "<!-- NEWIMAGE2 -->"

# [needle, replacement] pairs applied, in order, to the scraped fragments.
html_entities = [
    ["&quot;", "\""],
    ["&rsquo;", "\'"],
    ["&lsquo;", "\'"],
    ["&ndash;", "-"],
    ["<br>", ""],
    ['<span class="sp">', ""],
    ["</span>", ""],
]

In [35]:
def get_substring_between_delimiters(s: str, start: str, end: str):
    """Return the text of ``s`` found between ``start`` and the next ``end``.

    The search for ``end`` begins right after the first occurrence of
    ``start``. Returns ``None`` when either delimiter cannot be found.
    """
    try:
        content_from = s.index(start) + len(start)
        content_to = s.index(end, content_from)
    except ValueError:
        # str.index raises where str.find would have returned -1
        return None
    return s[content_from:content_to]

def clean(s: str, tag: str, new: str = ""):
    """Return ``s`` with every occurrence of ``tag`` swapped for ``new``.

    With the default ``new`` the tag is simply removed.
    """
    cleaned = s.replace(tag, new)
    return cleaned

def remove_single_newline(s: str):
    """Replace line breaks inside paragraphs with spaces.

    A newline that stands alone becomes a single space. In a run of two or
    more newlines, everything but the last newline is replaced by one space,
    so a paragraph break ends up as a space followed by a single newline.
    """
    newline_pattern = r'(?<!\n)\n(?!\n)|(\n+)(?=\n)'
    return re.sub(newline_pattern, ' ', s)

def _strip_markup(fragment: str) -> str:
    """Apply every ``html_entities`` replacement to ``fragment`` and collapse its newlines."""
    for entity, replacement in html_entities:
        fragment = clean(fragment, entity, replacement)
    return remove_single_newline(fragment)


def scrape_page(url: str) -> tuple:
    """Download one page and extract its body text and its index.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``(html, index)`` pair. ``html`` is the text between ``DELIM_BEGIN``
        and ``DELIM_END``; ``index`` is the text between ``INDEX_BEGIN`` and
        ``INDEX_END``. Both are cleaned with ``html_entities`` and have their
        newlines collapsed. An element is ``None`` when its delimiters are
        missing from the page, and the pair is ``(None, None)`` when the page
        could not be opened at all.
    """
    try:
        page = urlopen(url)
    except (OSError, ValueError):
        # URLError/HTTPError are OSError subclasses; a malformed URL raises
        # ValueError. Always hand back a pair so that callers can unpack the
        # result, and let KeyboardInterrupt through instead of swallowing it.
        return None, None
    with page:  # make sure the connection is released
        html = page.read().decode("utf-8")

    index = get_substring_between_delimiters(html, INDEX_BEGIN, INDEX_END)
    html = get_substring_between_delimiters(html, DELIM_BEGIN, DELIM_END)
    if index is not None:
        index = _strip_markup(index)
    if html is not None:
        html = _strip_markup(html)
    return html, index

def create_url(partial_url: str, i: int):
    """Build a page URL: ``partial_url`` + ``i`` zero-padded to four digits + ``.html``."""
    return f"{partial_url}{i:04d}.html"

def scrape_volume(volume_start_number: int, base_url: str):
    """Scrape consecutive pages of one volume until a page yields no content.

    Args:
        volume_start_number: Number of the first page to fetch.
        base_url: URL prefix that ``create_url`` completes with the page number.

    Returns:
        One string holding, for every scraped page, a header line
        ``page_number=<n>, index=<index>`` followed by the page text.
    """
    i = volume_start_number
    pages = []  # collected per page and joined once at the end
    while True:
        url = create_url(base_url, i)
        result = scrape_page(url)
        # A failed fetch may come back as a bare None as well as a
        # (None, None) pair; check before unpacking to avoid a TypeError.
        if result is None:
            break
        text, index = result
        if text is None or index is None:
            break
        page_chunk = f"{PAGE_NUMBER_STRING}{i}, {INDEX_STRING}{index}\n{text}"
        pages.append(page_chunk)
        # progress trace: page number and the tail of what was just scraped
        print(f"i = {i}: {page_chunk[-10:]}")
        i += 1
    return "".join(pages)

## Getting the second edition (ugglan)

In [36]:
base_url = "https://runeberg.org/nf"
# base_url = "http://runeberg.org/download.pl?mode=ocrtext&work=nf"

# Letter ranges used in the volume URLs; they have a slightly unusual format
uggla_url_range = {
    'b': "abcdefghijklmnopqrst",
    'c': "abcdefghijklmn",
}

# The first two volumes (ba, bb) start on this page number
volume_start_number_ba_bb = 795 #13

# The remaining volumes start on this one
volume_start_number = 780 #17

# Folder to save the .txt files in
folder = ENCYCLOPEDIAS_FOLDER + "second/"

# main loop
# for first_letter in ('b', 'c'):
#     for second_letter in uggla_url_range[first_letter]:
#         volume_index = first_letter + second_letter
#         f = open(folder + volume_index + ".txt", "w")
#         idx_file = open(folder + volume_index + "_idx.txt")
#         print(f"volume index: {volume_index}")
#         volume_url = base_url + volume_index + "/"
#         print(volume_url)
#         if volume_index in ["ba", "bb"]:
#             f.write(scrape_volume(volume_start_number_ba_bb, volume_url)) # index handling should be added here as well
#         else:
#             f.write(scrape_volume(volume_start_number, volume_url))
#         f.close()
#         idx_file.close()

volume_index = 'ba'
print(f"volume index: {volume_index}")
volume_url = base_url + volume_index + "/"
print(volume_url)

is_early_volume = volume_index in ["ba", "bb"]
start_number = volume_start_number_ba_bb if is_early_volume else volume_start_number
text = scrape_volume(start_number, volume_url)
if is_early_volume:
    print(text)

# Open the output only after scraping succeeded, so that a failed run does not
# truncate an existing file, and close it so the text is flushed to disk
# before later cells read it back.
with open(folder + volume_index + ".txt", "w", encoding='utf-8') as f:
    f.write(text)




volume index: ba
https://runeberg.org/nfba/


i = 795: bolag) i 

i = 796: tologisk 

i = 797: ödde den 

i = 798: . J. H.* 

i = 799: på grund 

i = 800: domontes 

i = 801: i</i> (i 

i = 802:  mäktade 

i = 803: örskaffa 

i = 804: nodotos, 

i = 805: 
1 b. 47 

i = 806: lfälliga 

i = 807: erkligt, 

i = 808: elsätter 

i = 809: . J. LR. 

i = 810: skilliga 

i = 811:  (1900). 

i = 812: r namnet 

i = 813: umbanus' 

i = 814: erarkien 

i = 815: na eller 

i = 816: ryckare, 

i = 817: aktersta 

i = 818: pfann en 

i = 819: rrikiska 

i = 820: ver- och 

i = 821: 
1 b. 48 

i = 822: rån hela 

i = 823: ygellösa 

i = 824: con. 
-- 

page_number=795, index= - Argus, "Politisk, Litterär och Commerciell Tidning"  - Argus IV, en mot Argus III utgifven veckotidning i Stockholm  - Argus-fjärilen, äfven kallad Allmänna blåvingen, Lycæna argus, zool.  - Argus giganteus, zool.  - Argus-ögon  - Argylshire. Se Argyllshire  - Argyll l. Argyle  - Argyllshire l. Argyleshire, grefskap på Skottlands västra kust  - Argyri, äfven Argyros, afla

In [45]:
def prep_index(index: str):
    """Split a page index string into its individual entries.

    Entries are introduced by a dash that is surrounded by whitespace (or that
    opens the string), e.g. ``' - Argus-ögon  - Argyll l. Argyle '``. Hyphens
    inside an entry (``Argus-ögon``) are not separators and are kept.

    Args:
        index: The index text of one page.

    Returns:
        The stripped entries, in order. Whatever precedes the first separator
        is dropped.
    """
    # Splitting on every "-" would tear hyphenated headwords apart, so require
    # whitespace (or the start of the string) before the dash and whitespace after it.
    parts = re.split(r'(?:^|\s+)-\s+', index)
    return [query.strip() for query in parts][1:]


def edit_distance(word1: str, word2: str) -> int:
    """Return the Levenshtein distance between ``word1`` and ``word2``.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn one word into the other.
    """
    # Iterate over the longer word so that the rows we keep are the shorter ones.
    if len(word1) < len(word2):
        word1, word2 = word2, word1

    # previous[j] = distance between the processed prefix of word1 and word2[:j]
    previous = list(range(len(word2) + 1))
    for i, ch1 in enumerate(word1, start=1):
        current = [i]
        for j, ch2 in enumerate(word2, start=1):
            deletion = previous[j] + 1
            insertion = current[j - 1] + 1
            substitution = previous[j - 1] + (ch1 != ch2)
            current.append(min(deletion, insertion, substitution))
        previous = current
    return previous[-1]

In [57]:
# Try prep_index on a literal sample index (the result is not displayed, since
# it is not the last expression of the cell).
prep_index(' - Argus, "Politisk, Litterär och Commerciell Tidning"  - Argus IV, en mot Argus III utgifven veckotidning i Stockholm  - Argus-fjärilen, äfven kallad Allmänna blåvingen, Lycæna argus, zool.  - Argus giganteus, zool.  - Argus-ögon  - Argylshire. Se Argyllshire  - Argyll l. Argyle  - Argyllshire l. Argyleshire, grefskap på Skottlands västra kust  - Argyri, äfven Argyros, aflagring af silfver i organismen  - Argyrion. Se Argyro-Kastro  - Argyrit, miner. Se Silfverglans  - Argyrodit, miner., monsymmetriskt kristalliserande, stålgrått mineral  - Argyrofan. Se Argyroid  - Argyroid l. Argyrofan, namn på vissa slag af nysilfver (se d. o.)  - Argyro-Kastro l. Ergeri Kastri, stad i europeisk-turkiska vilajetet Janina  - Argyroneta, zool., vattenspindel  - Argyropulos, Johannes  - Argyropulos, Perikles  - Argyros. Se Argyri  - Arhippa, en af de berömdaste bland de runosångare  - Arholma-inloppet. Se Stockholms skärgård  - Arhusiander, Magnus ')

# Read only the first line of the saved volume; the context manager closes the
# file again instead of leaving the handle open.
with open(folder + "ba.txt", "r", encoding='utf-8') as volume:
    first_line = volume.readline()

# Keep what follows the index marker; res is "" when the file is empty or the
# marker is missing.
_, _, res = first_line.partition(INDEX_STRING)

res

' - Argus, "Politisk, Litterär och Commerciell Tidning"  - Argus IV, en mot Argus III utgifven veckotidning i Stockholm  - Argus-fjärilen, äfven kallad Allmänna blåvingen, Lycæna argus, zool.  - Argus giganteus, zool.  - Argus-ögon  - Argylshire. Se Argyllshire  - Argyll l. Argyle  - Argyllshire l. Argyleshire, grefskap på Skottlands västra kust  - Argyri, äfven Argyros, aflagring af silfver i organismen  - Argyrion. Se Argyro-Kastro  - Argyrit, miner. Se Silfverglans  - Argyrodit, miner., monsymmetriskt kristalliserande, stålgrått mineral  - Argyrofan. Se Argyroid  - Argyroid l. Argyrofan, namn på vissa slag af nysilfver (se d. o.)  - Argyro-Kastro l. Ergeri Kastri, stad i europeisk-turkiska vilajetet Janina  - Argyroneta, zool., vattenspindel  - Argyropulos, Johannes  - Argyropulos, Perikles  - Argyros. Se Argyri  - Arhippa, en af de berömdaste bland de runosångare  - Arholma-inloppet. Se Stockholms skärgård  - Arhusiander, Magnus \n'

In [38]:
# Walk through the whole volume file. For every line that starts with a bold
# tag, keep the text up to the next newline or the first MAX_ENTRY_LENGTH
# characters, whichever comes first, and save the result to a JSON file.

# TEMPORARY
edition_nbr = 2
volume_nbr = 1

data = []
entry_nbr = 0
page_nbr = 0
with open(folder + "ba.txt", "r", encoding='utf-8') as volume:
    for line in volume:
        line = line.rstrip()[:MAX_ENTRY_LENGTH]

        # A page header resets the per-page entry counter.
        pagenbr_match = re.search(r'page_number=(\d+)', line)
        if pagenbr_match:
            page_nbr = int(pagenbr_match.group(1))
            entry_nbr = 0

        if line.startswith("<b>"):
            # The headword is the first bold span, minus a trailing comma or period.
            headword_match = re.search(r'<b>(.*?)<\/b>', line)
            headword = ""
            if headword_match:
                headword = re.sub(r'[,.]$', '', headword_match.group(1))
            entryid = f"e{edition_nbr}_v{volume_nbr}_{page_nbr}_{entry_nbr}"
            item = {
                "headword": headword,
                "entryid": entryid,
                "text": line,
                "type": 0,
                "qid": "0",
                "first_edition_key": "",
                "fourth_edition_key": ""
            }
            data.append(item)
            entry_nbr += 1
        # else: use index or neural network

# Write mode instead of append: appending a second top-level array on a
# re-run would leave nf.json as invalid JSON.
with open("nf.json", 'w', encoding='utf-8') as json_file:
    json.dump(data, json_file, ensure_ascii=False, indent=4)