# Scrape Website

In [None]:
from bs4 import BeautifulSoup
from colorama import Fore
import pandas as pd
import re
import requests
import time

In [None]:
def get_web_page(url, timeout=30):
    """Download a web page and return its raw body.

    Sleeps one second before every request to throttle the crawl, fetches
    *url* with ``requests.get`` and prints a one-line status report.

    Args:
        url: Absolute URL of the page to download.
        timeout: Seconds to wait for the server before ``requests`` gives up
            and raises ``requests.exceptions.Timeout``. Without a timeout a
            stalled connection would block the crawl indefinitely.

    Returns:
        The response body as bytes. The body is returned for every status
        code, so callers also receive the content of error pages.
    """
    time.sleep(1)  # pause before each request so the site is not hammered
    web_page = requests.get(url, timeout=timeout)
    if web_page.status_code == 200:
        print(f"retrieved web page {url} (200/{len(web_page.content)})")
    else:
        # Reset the colour afterwards; otherwise all later output stays red.
        print(
            Fore.RED,
            f"web page {url} returned status code {web_page.status_code}",
            Fore.RESET,
        )
    return web_page.content



def get_page_links(web_page, patterns=()):
    """Collect the link targets on a page that match any of the patterns.

    Args:
        web_page: HTML markup of the page, as accepted by ``BeautifulSoup``.
        patterns: Iterable of regular expressions; each is tried with
            ``re.search`` against the ``href`` of every ``<a>`` element.

    Returns:
        List of matching ``href`` values, in the order ``select`` yields the
        anchors. A link is listed once per ``<a>`` element, even when
        several patterns match it.
    """
    page_links = []
    for a in BeautifulSoup(web_page, "html.parser").select('a'):
        href = a.get("href")
        if not isinstance(href, str):
            # Anchors without an href attribute yield None: nothing to match.
            continue
        if any(re.search(pattern, href) for pattern in patterns):
            page_links.append(href)
    return page_links


def get_web_pages(url, patterns=()):
    """Crawl outward from a start page and collect every linked page.

    The start page is downloaded and scanned for links matching *patterns*.
    Each such link is processed exactly once, in first-seen order; links
    found on downloaded pages are queued in turn until none remain.

    Args:
        url: Absolute URL of the start page. Its first three "/"-separated
            parts (scheme and host) are prepended to every link, so links
            are presumably site-relative paths starting with "/".
        patterns: Iterable of regular expressions selecting the links to
            follow (passed on to ``get_page_links``).

    Returns:
        Dict keyed by link, in processing order. Each value is the
        downloaded page body, ``"SKIPPED"`` for links whose last path
        segment contains a dot but does not end in ``.html``, or
        ``"DUPLICATE"`` for ``/gerelateerde_documenten/`` links whose
        trailing path was already downloaded.
    """
    # NOTE(review): the start page is only mined for links; its own body is
    # not stored unless some page links back to it -- confirm this is intended.
    start_page = get_web_page(url)
    base_url = "/".join(url.split("/")[0:3])
    web_pages = {}
    retrieved_documents = set()

    # FIFO work list plus a membership set: every link is queued once and
    # the processing order no longer depends on set iteration order.
    pending = list(dict.fromkeys(get_page_links(start_page, patterns)))
    queued = set(pending)
    position = 0
    while position < len(pending):
        target_url = pending[position]
        position += 1

        ends_in_html = re.search(r"\.html$", target_url)
        last_segment_has_no_dot = re.search(r"/[^.]*$", target_url)
        is_related_document = "/gerelateerde_documenten/" in target_url
        # Everything from the seventh "/"-separated part onwards identifies a
        # related document regardless of the prefix it is linked under.
        document_key = "/".join(target_url.split("/")[6:])

        if not ends_in_html and not last_segment_has_no_dot:
            web_pages[target_url] = "SKIPPED"
            print(f"skipped {target_url}")
        elif is_related_document and document_key in retrieved_documents:
            web_pages[target_url] = "DUPLICATE"
            print(f"duplicate {target_url}")
        else:
            content = get_web_page(base_url + target_url)
            web_pages[target_url] = content
            for link in get_page_links(content, patterns):
                if link not in queued:
                    queued.add(link)
                    pending.append(link)
            if is_related_document:
                retrieved_documents.add(document_key)
    return web_pages


def get_recommendation_list(web_pages):
    """Return the distinct recommendation names linked from the given pages.

    Every ``<a>`` element of every page is inspected; for each ``href``
    starting with ``/richtlijn/`` the path part right after that prefix is
    taken as the recommendation name.

    Args:
        web_pages: Dict whose values are HTML markup accepted by
            ``BeautifulSoup``.

    Returns:
        List of recommendation names without repeats, in first-seen order.
    """
    # A dict serves as an insertion-ordered set of the names found so far.
    names_seen = {}
    for markup in web_pages.values():
        anchors = BeautifulSoup(markup, "html.parser").select('a')
        for anchor in anchors:
            link = anchor.get("href")
            try:
                is_recommendation_link = re.search("^/richtlijn/", link)
            except TypeError:
                # Anchor without an href: ``link`` is None and cannot be searched.
                continue
            if is_recommendation_link:
                names_seen.setdefault(link.split("/")[2], None)
    return list(names_seen)


def save_dict(dictionary, out_file_name, mode="w"):
    """Write a dictionary to a CSV file, one ``key,value`` row per entry.

    Args:
        dictionary: Mapping whose keys become the first CSV column and whose
            values become the second.
        out_file_name: Path of the CSV file to write.
        mode: File mode handed to ``to_csv``; ``"w"`` overwrites the file,
            ``"a"`` appends to it.
    """
    # Build a single-row frame (one column per key), then flip it so that
    # each key ends up on a row of its own.
    single_row = pd.DataFrame(dictionary, index=[0])
    row_per_key = single_row.transpose()
    row_per_key.to_csv(out_file_name, mode=mode, header=False)

In [None]:
BASE_URL = "https://richtlijnendatabase.nl"

# Crawl the paginated overview: start at the site root and follow every
# "/?page=<n>" link. The pattern is a raw string so that "\?" and "\d" reach
# the regex engine without triggering Python's invalid-escape warning.
main_web_pages = get_web_pages(BASE_URL, patterns=[r"^/\?page=\d+$"])
save_dict(main_web_pages, "main_web_pages.csv")
print(f"number of pages: {len(main_web_pages)}")

In [None]:
# Extract the distinct recommendation names linked from the overview pages.
recommendation_list = get_recommendation_list(main_web_pages)
recommendation_count = len(recommendation_list)
print(f"found {recommendation_count} recommendations")

In [None]:
# Crawl each recommendation, following links under /richtlijn/ and
# /gerelateerde_documenten, and append the pages of every crawl to one
# shared CSV file.
for recommendation in recommendation_list:
    start_url = f"{BASE_URL}/richtlijn/{recommendation}"
    recommendation_web_pages = get_web_pages(
        start_url,
        patterns=["^/richtlijn/", "^/gerelateerde_documenten"],
    )
    save_dict(recommendation_web_pages, "recommendation_web_pages.csv", mode="a")