# Scrape Website

In [None]:
from bs4 import BeautifulSoup
from colorama import Fore, Back, Style
import pandas as pd
import re
import requests
import time

In [None]:
def get_web_page(url, debug=True, timeout=30):
    """Download a web page and return its raw body.

    Sleeps for one second before every request to throttle the crawl.

    Args:
        url: Address of the page to download.
        debug: When true, print a confirmation line for each page that
            came back with status code 200.
        timeout: Seconds to wait for the server before requests raises a
            timeout exception. Without it a stalled connection would
            block the crawl indefinitely.

    Returns:
        The response body as bytes. The body is also returned for
        non-200 responses; those are only reported on standard output.
    """
    time.sleep(1)
    web_page = requests.get(url, timeout=timeout)
    if web_page.status_code == 200:
        if debug:
            print(f"retrieved web page {url} (200/{len(web_page.content)})")
    else:
        print(Fore.RED, f"web page {url} returned status code {web_page.status_code}", Style.RESET_ALL)
    return web_page.content



def get_page_links(web_page, patterns=()):
    """Return the link targets in a page that match any of the patterns.

    Args:
        web_page: HTML of the page, in any form BeautifulSoup accepts.
        patterns: Regular expressions tried with re.search against each
            href value.

    Returns:
        List of matching href values in document order. A link is listed
        once per occurrence in the page, however many patterns it matches.
    """
    page_links = []
    for a in BeautifulSoup(web_page, "html.parser").select("a"):
        href = a.get("href")
        # Anchors without an href yield None, which re.search cannot handle.
        # Checking explicitly (instead of catching TypeError) keeps errors
        # caused by a bad pattern argument visible.
        if not isinstance(href, str):
            continue
        if any(re.search(pattern, href) for pattern in patterns):
            page_links.append(href)
    return page_links


def get_web_pages(url, patterns=(), processed_urls=(), debug=True):
    """Crawl outward from a start page, following links that match patterns.

    The start page is downloaded and scanned for links, but it is only
    stored in the result if one of the crawled pages links to it.
    Links are visited in the order in which they are discovered, so the
    result is reproducible between runs.

    Args:
        url: Absolute URL of the start page. Its scheme and host are
            prepended to every link that is followed.
        patterns: Regular expressions selecting which hrefs to follow
            (passed on to get_page_links).
        processed_urls: Hrefs that must not be downloaded again.
        debug: When true, print a line for every link that is handled.

    Returns:
        Dict mapping each discovered href to the downloaded page body, or
        to one of the marker strings "PROCESSED", "SKIPPED" or
        "DUPLICATE" when the link was not downloaded.
    """
    already_processed = set(processed_urls)
    base_url = "/".join(url.split("/")[0:3])
    start_page = get_web_page(url, debug)

    # Worklist of unique links in discovery order; `queued` gives O(1)
    # duplicate detection so each link is handled exactly once.
    pending = []
    queued = set()

    def enqueue(links):
        for link in links:
            if link not in queued:
                queued.add(link)
                pending.append(link)

    enqueue(get_page_links(start_page, patterns))
    web_pages = {}
    retrieved_urls = set()
    position = 0
    while position < len(pending):
        target_url = pending[position]
        position += 1
        is_related_document = re.search(r"/gerelateerde_documenten/", target_url)
        # Related documents are compared on everything after the sixth
        # "/"-separated field of the href (presumably the document's own
        # path, independent of the guideline it hangs under -- verify).
        document_key = "/".join(target_url.split("/")[6:])
        if target_url in already_processed:
            web_pages[target_url] = "PROCESSED"
            if debug:
                print(f"already processed {target_url}")
        elif not re.search(r"\.html*$", target_url) and not re.search(r"/[^.]*$", target_url):
            # Neither an .htm/.html page nor a path whose last segment
            # has no extension: not downloaded.
            web_pages[target_url] = "SKIPPED"
            if debug:
                print(f"skipped {target_url}")
        elif is_related_document and document_key in retrieved_urls:
            web_pages[target_url] = "DUPLICATE"
            if debug:
                print(f"duplicate {target_url}")
        else:
            web_pages[target_url] = get_web_page(base_url + target_url, debug)
            enqueue(get_page_links(web_pages[target_url], patterns))
            if is_related_document:
                retrieved_urls.add(document_key)
    return web_pages


def get_recommendation_list(web_pages):
    """Collect the distinct guideline names linked from a set of pages.

    Every href starting with "/richtlijn/" contributes the path segment
    that follows that prefix. Names are returned in order of first
    appearance, without duplicates.

    Args:
        web_pages: Dict whose values are page bodies to scan.

    Returns:
        List of guideline names.
    """
    found = []
    for html in web_pages.values():
        anchors = BeautifulSoup(html, "html.parser").select("a")
        for anchor in anchors:
            target = anchor.get("href")
            try:
                is_guideline_link = re.search("^/richtlijn/", target)
            except TypeError:
                # Anchors without an href give None; ignore them.
                continue
            if not is_guideline_link:
                continue
            name = target.split("/")[2]
            if name not in found:
                found.append(name)
    return found


def save_dict(dictionary, out_file_name, mode="w"):
    """Write a dictionary to a CSV file, one "key,value" row per entry.

    Args:
        dictionary: Mapping of keys to scalar values.
        out_file_name: Path of the CSV file to write.
        mode: File mode handed to pandas ("w" overwrites, "a" appends).
    """
    # One row holding all values, with the keys as column labels ...
    single_row = pd.DataFrame(dictionary, index=[0])
    # ... flipped so that every key becomes its own row.
    key_value_rows = single_row.transpose()
    key_value_rows.to_csv(out_file_name, header=False, mode=mode)

## Download data

In [None]:
BASE_URL = "https://richtlijnendatabase.nl"

# Crawl the paginated overview, starting at the home page and following
# every "/?page=N" link. The pattern is a raw string so that "\?" and "\d"
# reach the regex engine without invalid-escape-sequence warnings.
main_web_pages = get_web_pages(BASE_URL, patterns=[r"^/\?page=\d+$"])
save_dict(main_web_pages, "main_web_pages.csv")
print(f"number of pages: {len(main_web_pages)}")

In [None]:
# Extract the distinct guideline names linked from the overview pages.
recommendation_list = get_recommendation_list(main_web_pages)
print(f"found {len(recommendation_list)} recommendations")

In [None]:
OUTFILE = "recommendation_web_pages.csv"

# Resume support: the first column of the output file holds the hrefs that
# were already handled in an earlier run. On the very first run the file
# does not exist yet (it is only created by appending later on), and an
# interrupted run may leave it empty; both cases mean nothing was processed.
try:
    processed_urls = list(pd.read_csv(OUTFILE, header=None)[0])
except (FileNotFoundError, pd.errors.EmptyDataError):
    processed_urls = []

In [None]:
counter = 0
last_skipped = ""
# Crawl every guideline whose start page is not yet in the output file,
# appending the pages of each guideline as soon as its crawl finishes.
for counter, recommendation in enumerate(recommendation_list, start=1):
    print(counter,recommendation)
    if "/richtlijn/"+recommendation in processed_urls:
        continue
    recommendation_web_pages = get_web_pages(
        BASE_URL+"/richtlijn/"+recommendation,
        patterns=["^/richtlijn/", "^/gerelateerde_documenten"],
        processed_urls=processed_urls,
        debug=False,
    )
    save_dict(recommendation_web_pages, OUTFILE, mode="a")
    # Remember the new hrefs so later guidelines do not fetch them again.
    processed_urls.extend(recommendation_web_pages.keys())

## Examine data

In [None]:
# Parse the first overview page. The parser is named explicitly, as in the
# rest of this file, so the result does not depend on which optional
# parsers happen to be installed.
soup = BeautifulSoup(main_web_pages['/?page=1'], "html.parser")

In [None]:
# Map the value attribute of every <option> element to its visible text.
categories = {
    option.get("value"): option.text
    for option in soup.select("option")
}
# Drop the entry with an empty value (presumably the "no selection"
# placeholder of the drop-down -- verify against the page).
del categories[""]
len(categories)

In [None]:
# List the categories in numeric order of their option value.
for key in sorted(categories, key=int):
    print(key, categories[key])

In [None]:
BASE_URL = "https://richtlijnendatabase.nl"
BASE_QUERY = "/?query=&page=1&specialism="

# For every category, crawl its paginated search results and collect the
# guideline names that appear on them.
recommendations_per_category = {}
for key in categories:
    first_page = BASE_QUERY+str(key)
    # get_web_pages compares processed_urls against the relative hrefs it
    # finds, so the first page must be given as a relative path to be
    # recognised; it is stored explicitly right below. The pattern is a raw
    # string so the regex escapes do not trigger invalid-escape warnings.
    web_pages = get_web_pages(BASE_URL+first_page,
                              patterns=[r"^/\?query=\&page=\d+"],
                              processed_urls=[first_page])
    web_pages[first_page] = get_web_page(BASE_URL+first_page)
    print(f"category {key}; number of pages: {len(web_pages)}")
    recommendation_list = get_recommendation_list(web_pages)
    print(f"found {len(recommendation_list)} recommendations for category {key} {categories[key]}\n")
    recommendations_per_category[key] = recommendation_list

In [None]:
table = []

# Build a recommendation -> {category: mark} matrix, where the mark is "+"
# when the recommendation is listed under the category and " " otherwise.
categories_per_recommendation = {}
for category in recommendations_per_category:
    for recommendation in recommendations_per_category[category]:
        if recommendation not in categories_per_recommendation:
            # dict.fromkeys creates the blank row without a nested loop; a
            # nested loop reusing the name `category` would overwrite the
            # outer loop variable and put the "+" in the wrong column.
            categories_per_recommendation[recommendation] = dict.fromkeys(recommendations_per_category, " ")
        categories_per_recommendation[recommendation][category] = "+"

# Recommendations that belong to the most categories come first.
categories_per_recommendation = dict(sorted(
    categories_per_recommendation.items(),
    key=lambda item: list(item[1].values()).count("+"),
    reverse=True,
))

# Transpose to category -> {recommendation: mark}. Iterating the source dict
# for the category names also works when no recommendation was found at all.
r_per_c = {
    c: {r: marks[c] for r, marks in categories_per_recommendation.items()}
    for c in recommendations_per_category
}
# Categories that contain the most recommendations come first.
r_per_c = dict(sorted(
    r_per_c.items(),
    key=lambda item: list(item[1].values()).count("+"),
    reverse=True,
))

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.DataFrame(r_per_c).to_csv("richtlijnen-categorie.csv", index_label="richtlijn")
pd.DataFrame(r_per_c)