# Scraper

This notebook scrapes the first and second editions of Nordisk Familjebok into raw text files, one per volume (e.g., `aa.txt` and `ba.txt`).

### Importing necessary packages and defining constants

In [None]:
import regex as re
from urllib.request import urlopen
from tqdm.notebook import tqdm
from scraping_and_segmenting_helpers import *

# Delimiters around the "On this page" index in a page's HTML; the text between
# them is extracted as the page index by scrape_page_text_and_index.
INDEX_BEGIN = "<b>On this page / på denna sida</b>\n"
INDEX_END = "<p>"

# Delimiters around the page's body text in the same HTML.
DELIM_BEGIN = "<!-- mode=normal -->"
DELIM_END = "<!-- NEWIMAGE2 -->"

# [markup, replacement] pairs handed to clean_html_markup together with the
# scraped HTML.
# NOTE(review): presumably applied in list order as plain substitutions —
# confirm in scraping_and_segmenting_helpers before reordering entries.
html_entities = [
                ["&quot;", "\""],
                ["&rsquo;", "\'"],
                ["&lsquo;", "\'"],
                ["&ndash;", "-"],
                ["<br>", ""],
                ['<span class="sp">', ""],
                ['<span class="overline">', ""],
                ['<span class="sc">', ""],
                ["</span>", ""],
                ["&lt;", "<"],
                ["&gt;", ">"],
                ["&nbsp;", " "],
                ["&amp;", "&"],
                ["<sub>", ""],
                ["</sub>", ""],
                ["<sup>", ""],
                ["</sup>", ""],
                ["<s>", ""],
                ["</s>", ""],
                ["<sp>", ""],
                ["</sp>", ""],
                ]


## Getting started: Scraping edition 1 and 2
#### Functions for scraping

In [None]:
def get_substring_between_delimiters(s: str, start: str, end: str) -> str:
    """Return the part of ``s`` between the first ``start`` and the next ``end``.

    The search for ``end`` begins right after the first occurrence of
    ``start``. Returns ``None`` when either delimiter cannot be found.
    """
    try:
        body_begin = s.index(start) + len(start)
        body_end = s.index(end, body_begin)
    except ValueError:
        # str.index raises exactly where str.find would report "not found".
        return None
    return s[body_begin:body_end]

def remove_single_newline(s: str) -> str:
    """Turn line breaks inside a paragraph into spaces.

    A newline with no newline next to it becomes a space. In a run of two or
    more newlines, all but the last one are replaced by a single space, so the
    run ends up as a space followed by one newline.
    """
    lone_or_leading_newlines = r'(?<!\n)\n(?!\n)|(\n+)(?=\n)'
    return re.sub(lone_or_leading_newlines, ' ', s)

def create_url(partial_url: str, i: int) -> str:
    """Build a page URL: ``partial_url`` + ``i`` zero-padded to 4 digits + ``.html``."""
    page_file = f"{i:04d}.html"
    return partial_url + page_file

def scrape_page_text_and_index(url: str) -> tuple[str, str]:
    """Download one page and return its cleaned body text and index.

    Args:
        url: Address of the page to fetch.

    Returns:
        ``(text, index)``. Both are ``None`` when the page cannot be opened;
        either one is ``None`` when its delimiters are missing from the HTML.
    """
    try:
        page = urlopen(url)
    except Exception:
        # Best effort: a page that cannot be opened is reported as missing.
        # Catching ``Exception`` instead of using a bare ``except`` lets
        # KeyboardInterrupt and SystemExit through, so a long scrape can
        # still be interrupted.
        return None, None
    # Release the response (and its connection) once the body has been read.
    with page:
        html = page.read().decode("utf-8")

    index = get_substring_between_delimiters(html, INDEX_BEGIN, INDEX_END)
    html = get_substring_between_delimiters(html, DELIM_BEGIN, DELIM_END)
    if index is not None:
        index = clean_html_markup(index, html_entities)
        index = remove_single_newline(index)
    if html is not None:
        html = clean_html_markup(html, html_entities)
        html = remove_single_newline(html)
    return html, index

def scrape_volume(base_url: str, volume_start_number: int, volume_end_number: int = 9999999) -> str:
    """Scrape a range of pages and return them concatenated as one string.

    Every page number from ``volume_start_number`` through
    ``volume_end_number`` (inclusive) is tried; with the default end number
    the loop runs up to page 9999999. Pages whose text or index could not be
    obtained are skipped. Each kept page contributes a header line (page
    number and index) followed by its text.
    """
    volume_str = ""
    for page_number in range(volume_start_number, volume_end_number + 1):
        text, index = scrape_page_text_and_index(create_url(base_url, page_number))
        if text is None or index is None:
            continue
        page_header = PAGE_NUMBER_STRING + str(page_number) + ", " + INDEX_STRING + index + "\n"
        volume_str += page_header + text
        # Progress trace: the tail of what has been collected so far.
        print(f"i = {page_number}: {volume_str[-10:]}")
    return volume_str

def scrape_edition(
    folder_edition: str,
    edition_url_range: dict[str, str],
    first_letter_list: list[str],
    edition_volume_start_end: dict[str, tuple[int, int]],
) -> None:
    """Scrape every volume of an edition and write one text file per volume.

    Args:
        folder_edition: Path prefix of the output files. The volume index and
            ".txt" are appended to it directly, so it should end with a path
            separator.
        edition_url_range: Maps each first letter to the second letters of
            its volumes.
        first_letter_list: First letters to process, in order.
        edition_volume_start_end: Maps a volume index (first letter + second
            letter) to its (first, last) page numbers.
    """
    for first_letter in first_letter_list:
        for second_letter in edition_url_range[first_letter]:
            volume_index = first_letter + second_letter
            volume_url = BASE_URL + volume_index + "/"
            # The context manager closes the file even if scraping raises
            # part-way through the volume.
            with open(folder_edition + volume_index + ".txt", "w", encoding='utf-8') as f:
                print(volume_url)
                f.write(scrape_volume(
                    volume_url,
                    volume_start_number=edition_volume_start_end[volume_index][0],
                    volume_end_number=edition_volume_start_end[volume_index][1],
                ))
            print(f"volume index: {volume_index}")

#### Getting the first edition (1800-tals utgåvan)

In [None]:
# Scrape all first-edition volumes; each volume is written to its own text
# file whose path starts with folder_edition1.
scrape_edition(
    folder_edition=folder_edition1,
    edition_url_range=edition1_url_range,
    first_letter_list=edition1_first_letter_list,
    edition_volume_start_end=edition1_volume_start_end,
)

#### Getting the second edition (ugglan)

In [None]:
# Scrape all second-edition volumes; each volume is written to its own text
# file whose path starts with folder_edition2.
scrape_edition(
    folder_edition=folder_edition2,
    edition_url_range=edition2_url_range,
    first_letter_list=edition2_first_letter_list,
    edition_volume_start_end=edition2_volume_start_end,
)

## Post-processing of txt files
Here we do some additional cleaning of the txt files:
* Runs of multiple spaces are replaced with a single space
* Any remaining HTML tags, such as image tags and references, are removed; only `<b>` and `<i>` tags are kept
* `[rättelse ...]` brackets are removed from the first edition

In [None]:
def remove_multiple_spaces(file: str):
    """Collapse every run of two or more spaces in ``file`` into one space."""
    space_run = re.compile(r" {2,}")
    return space_run.sub(r" ", file)

def remove_html_tags(file: str):
    """Strip every tag from ``file`` except ``b`` and ``i`` tags.

    A tag here is a ``<...>`` span that stays on one line. Opening and
    closing tags whose name is exactly ``b`` or ``i`` are kept; anything
    else, including names that merely start with those letters such as
    ``br`` or ``img``, is removed.
    """
    non_bold_italic_tag = re.compile(r"<(?!\/?(b|i)\b)[^>\n]*?>")
    return non_bold_italic_tag.sub(r"", file)

def remove_rattelse(file: str):
    """Remove every ``[rättelse ...]`` correction bracket from ``file``.

    Each bracket is matched from ``[rättelse`` up to the first ``]`` that
    follows it on the same line; the text around the bracket is kept.
    """
    # Lazy quantifier: a greedy ".*" would run to the LAST "]" on the line and
    # delete ordinary text (and other brackets) that follows the correction.
    return re.sub(r"\[rättelse.*?\]", r"", file)

def remove_line_with_only_spaces(file: str):
    """Delete every line of ``file`` that consists solely of spaces.

    The line is removed together with its terminating newline. Lines that
    contain any other character, and a final space-only line with no
    trailing newline, are left as they are.
    """
    # MULTILINE makes "^" match at the start of each line; without it only a
    # space-only line at the very beginning of the text would be removed.
    return re.sub(r"^ +\n", r"", file, flags=re.MULTILINE)

def remove_space_at_start_of_line(file: str):
    """Drop all whitespace that directly follows a newline.

    The whitespace class also covers newlines, so consecutive blank lines
    collapse into a single line break. Whitespace at the very start of
    ``file`` is untouched because no newline precedes it.
    """
    whitespace_after_newline = re.compile(r"\n\s+")
    return whitespace_after_newline.sub(r"\n", file)

def remove_until_first_bold_tag(file: str):
    """Cut everything before the first ``<b>`` on the first line of ``file``.

    The pattern is anchored at the start of the text and ``.`` does not cross
    newlines, so only a ``<b>`` on the first line is found. When there is
    none, ``file`` is returned unchanged.
    """
    first_bold = re.match(r'^.*?(<b>)', file)
    if first_bold is None:
        return file
    # Keep the tag itself and everything after it.
    return file[first_bold.start(1):]

def post_processing(folder: str, volumes: list[str]):
    """Clean each volume's text file in place.

    For every name in ``volumes`` the file ``folder`` + ``<name>.txt`` is
    read, passed through the cleaning steps in a fixed order, and then
    overwritten with the result. Progress is shown with ``tqdm``.
    """
    # Order matters: each step works on the output of the previous one.
    cleaning_steps = (
        remove_rattelse,
        remove_html_tags,
        remove_multiple_spaces,
        remove_line_with_only_spaces,
        remove_space_at_start_of_line,
        remove_until_first_bold_tag,
    )
    for volume in tqdm(volumes):
        volume_path = folder + f"{volume}.txt"
        with open(volume_path, "r", encoding='utf-8') as f:
            cleaned = f.read()

        for step in cleaning_steps:
            cleaned = step(cleaned)

        with open(volume_path, "w", encoding='utf-8') as f:
            f.write(cleaned)

In [None]:
# Clean the scraped text files of both editions in place.
post_processing(folder=folder_edition1, volumes=edition1_volumes)
post_processing(folder=folder_edition2, volumes=edition2_volumes)