# Scrape Website

In [2]:
from bs4 import BeautifulSoup
from colorama import Fore, Back, Style
import csv
import os
import pandas as pd
import re
import requests
import sys
import time
import urllib.parse
from IPython.display import clear_output
from przona import *

# Raise the csv module's maximum field size to sys.maxsize.
# csv.field_size_limit() returns the previous limit, which is captured in
# `dummy` (presumably so the notebook does not echo it -- TODO confirm).
dummy = csv.field_size_limit(sys.maxsize)

## Get web pages with Python's `requests` library (2021-04-19)

In [3]:
def split_url(url):
    """Split a URL into a scheme-and-host prefix and a path.

    Returns a ``(base, path)`` tuple. For a string starting with
    ``http://`` or ``https://`` (case-insensitive), ``base`` is everything
    before the third slash and ``path`` is the remainder with a leading
    "/". Any other string is returned unchanged as ``("", url)``.
    """
    if not re.search("^https?://", url, flags=re.IGNORECASE):
        return ("", url)
    pieces = url.split("/")
    base = "/".join(pieces[:3])
    path = "/" + "/".join(pieces[3:])
    return (base, path)

In [4]:
def get_web_page(url, debug=True, timeout=30):
    """Download a web page and return its raw body.

    Pauses one second before every request (a fixed delay between
    consecutive downloads).

    Args:
        url: Absolute URL; must start with "https://".
        debug: When true, print a line for each download that returned
            status 200.
        timeout: Seconds to wait for the server before requests gives up.
            Without a timeout a single stalled connection would block the
            whole crawl forever. Pass None to wait indefinitely.

    Returns:
        The response body as bytes. The body is returned for non-200
        responses as well; in that case a warning is printed in red.

    Raises:
        AssertionError: If url does not start with "https://".
        requests.exceptions.Timeout: If the server does not answer within
            `timeout` seconds.
    """
    assert re.search(r"^https://", url), f"get_web_page: url has unexpected format: {url}"
    time.sleep(1)
    web_page = requests.get(url, timeout=timeout)
    if web_page.status_code == 200:
        if debug:
            print(f"retrieved web page {url} (200/{len(web_page.content)})")
    else:
        print(Fore.RED, f"web page {url} returned status code {web_page.status_code}", Style.RESET_ALL)
    return web_page.content

In [5]:
def get_page_links(web_page, patterns=None):
    """Return the href values of all <a> tags that match a pattern.

    Args:
        web_page: HTML document (markup accepted by BeautifulSoup).
        patterns: Iterable of regular expressions. A link is kept when
            re.search() finds at least one of them in its href. With no
            patterns nothing is kept.

    Returns:
        List of matching href strings in document order. Each anchor
        contributes at most one entry, even when several patterns match
        it; the same href appearing on several anchors is listed once per
        anchor.
    """
    patterns = patterns or []
    page_links = []
    for anchor in BeautifulSoup(web_page, "html.parser").select("a"):
        href = anchor.get("href")
        # Anchors without an href yield None; skip them explicitly instead
        # of relying on re.search() raising TypeError.
        if not isinstance(href, str):
            continue
        if any(re.search(pattern, href) for pattern in patterns):
            page_links.append(href)
    return page_links

In [6]:
def store_content(out_file_name, content):
    """Write downloaded page content to a local file as UTF-8 text.

    Missing parent directories are created first. The text is followed by
    a single newline.

    Args:
        out_file_name: Path of the file to write (relative or absolute).
        content: Page body as UTF-8 encoded bytes.

    Raises:
        UnicodeDecodeError: If content is not valid UTF-8. Decoding happens
            before the file is opened, so an existing file is not
            truncated in that case.
    """
    text = content.decode("utf8")
    out_dir = os.path.dirname(out_file_name)
    if out_dir:
        # makedirs copes with absolute paths and already existing
        # directories, unlike a component-by-component mkdir loop.
        os.makedirs(out_dir, exist_ok=True)
    # Explicit encoding: the text was decoded as UTF-8, so write it back as
    # UTF-8 regardless of the platform's default locale encoding.
    with open(out_file_name, "w", encoding="utf8") as out_file:
        out_file.write(text + "\n")

In [7]:
def make_local_file_name(remote_file, url):
    """Map a remote path to the local file in which the page is stored.

    Args:
        remote_file: Path part of the page URL, e.g. "/richtlijn/acne".
        url: Scheme-and-host part of the page URL, e.g.
            "https://richtlijnendatabase.nl".

    Returns:
        A path below "../data/<site>/", where <site> consists of the last
        two dot-separated parts of `url` after removing a leading
        http(s) scheme. A remote path ending in ".html" is used as is;
        any other path is treated as a directory and gets "index.html"
        appended.
    """
    # The scheme is matched case-insensitively, like in split_url().
    host = re.sub(r"^https?://", "", url, flags=re.IGNORECASE)
    base_dir = "../data/" + ".".join(host.split(".")[-2:])
    if not re.search(r"\.html$", remote_file):
        remote_file = os.path.join(remote_file, "index.html")
    # Drop the leading slash so that os.path.join() keeps base_dir.
    relative_path = re.sub(r"^/", "", remote_file)
    return os.path.join(base_dir, relative_path)

In [8]:
def get_web_pages(url, patterns=None, processed_urls=None, debug=True):
    """Recursively download a page and every matching page it links to.

    Each page is stored locally under the name given by
    make_local_file_name(). Links are followed in sorted order, one
    recursive call per link.

    Args:
        url: Absolute URL of the page to start from.
        patterns: Regular expressions selecting which hrefs to follow
            (see get_page_links()). Defaults to no patterns.
        processed_urls: Dict mapping remote paths to downloaded content.
            It is updated in place, so the caller can inspect it
            afterwards and repeated links are fetched only once. A fresh
            dict is used when omitted.
        debug: Passed on to get_web_page().

    Skipped without downloading: paths already in processed_urls, paths
    ending in ".pdf" and paths containing "?". A page whose local file
    already exists (and whose name does not end in "index.html") is marked
    as processed with empty content and its links are not followed.
    """
    # The defaults are created per call: the former `processed_urls=[]`
    # default was a list, which fails on the dict-style assignments below.
    if patterns is None:
        patterns = []
    if processed_urls is None:
        processed_urls = {}
    base_url, remote_file = split_url(url)
    if (remote_file in processed_urls
            or re.search(r'\.pdf$', remote_file)
            or re.search(r'\?', remote_file)):
        return
    out_file_name = make_local_file_name(remote_file, base_url)
    if os.path.isfile(out_file_name) and not re.search("index.html$", out_file_name):
        processed_urls[remote_file] = ""
        return
    page = get_web_page(url, debug)
    processed_urls[remote_file] = page
    store_content(out_file_name, page)
    for page_link in sorted(get_page_links(page, patterns)):
        get_web_pages(base_url + page_link, patterns, processed_urls, debug)

In [9]:
# Crawl the site from its root, following only links whose href matches
# one of the path patterns, then show how many paths were processed.
debug = True
processed_urls = {}
patterns = ["^/richtlijn/", "^/en/richtlijn/"]
url = "https://richtlijnendatabase.nl/"
get_web_pages(url, patterns=patterns, processed_urls=processed_urls, debug=debug)
len(processed_urls)

retrieved web page https://richtlijnendatabase.nl/ (200/98085)
retrieved web page https://richtlijnendatabase.nl/richtlijn/22q13_deletiesyndroom_pms (200/64275)
retrieved web page https://richtlijnendatabase.nl/richtlijn/aandoeningen_van_de_pleura (200/67887)
retrieved web page https://richtlijnendatabase.nl/richtlijn/abdominoplastiek (200/141985)
retrieved web page https://richtlijnendatabase.nl/richtlijn/achilles_tendinopathie (200/87225)
retrieved web page https://richtlijnendatabase.nl/richtlijn/acne (200/53504)
retrieved web page https://richtlijnendatabase.nl/richtlijn/actinische_keratose (200/105139)
retrieved web page https://richtlijnendatabase.nl/richtlijn/acute_appendicitis (200/76516)
retrieved web page https://richtlijnendatabase.nl/richtlijn/acute_buikpijn_bij_volwassenen (200/58575)
retrieved web page https://richtlijnendatabase.nl/richtlijn/anesthesie_bij_kinderen (200/101318)
retrieved web page https://richtlijnendatabase.nl/richtlijn/beleid_rondom_spoedoperaties (200/

9705