# Scrape Website

In [1]:
from bs4 import BeautifulSoup
from colorama import Fore, Back, Style
import pandas as pd
import re
import requests
import time

## Download HTML pages

In [46]:
def get_web_page(url, debug=True, timeout=30):
    """Download *url* and return the raw response body.

    The function sleeps for one second before every request so that a crawl
    built on top of it does not fire requests back to back.

    Args:
        url: Absolute URL to fetch.
        debug: When true, print a confirmation line for every successful
            (status 200) download.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout exception. Without a timeout a stalled connection would
            block the whole crawl indefinitely.

    Returns:
        The response body as bytes. The body is returned even when the status
        code is not 200; in that case a warning is printed in red.
    """
    time.sleep(1)
    web_page = requests.get(url, timeout=timeout)
    if web_page.status_code != 200:
        # Failures are always reported, independent of the debug flag.
        print(Fore.RED, f"web page {url} returned status code {web_page.status_code}", Style.RESET_ALL)
    elif debug:
        print(f"retrieved web page {url} (200/{len(web_page.content)})")
    return web_page.content



def get_page_links(web_page, patterns=None):
    """Return the ``href`` values of all anchors in *web_page* that match a pattern.

    Args:
        web_page: HTML document (markup accepted by ``BeautifulSoup``).
        patterns: Iterable of regular expressions. A link is kept when
            ``re.search`` succeeds for at least one of them. ``None`` or an
            empty iterable selects nothing.

    Returns:
        List of matching ``href`` strings in document order. Each anchor
        contributes at most one entry, even when several patterns match it.
    """
    if not patterns:
        return []
    page_links = []
    for a in BeautifulSoup(web_page, "html.parser").select('a'):
        href = a.get("href")
        # Anchors without an href attribute yield None; skip them explicitly
        # instead of relying on a swallowed TypeError from re.search.
        if href is None:
            continue
        if any(re.search(pattern, href) for pattern in patterns):
            page_links.append(href)
    return page_links


def split_url(url):
    """Split *url* into ``(scheme-and-host, path)``.

    For an absolute http(s) URL the first element is everything up to (not
    including) the third slash and the second element is the remainder with a
    leading slash. Any other string is returned unchanged as the path, paired
    with an empty host part.
    """
    if not re.match("https?://", url, flags=re.IGNORECASE):
        return "", url
    pieces = url.split("/")
    host_part = "/".join(pieces[:3])
    path_part = "/" + "/".join(pieces[3:])
    return host_part, path_part

                            
def get_web_pages(url, patterns=None, processed_urls=None, debug=True):
    """Crawl from *url*, following every link that matches *patterns*.

    The start page is downloaded first; links found on it (and on every page
    downloaded afterwards) are visited breadth-first until no discovered link
    is left unvisited.

    Args:
        url: Absolute URL of the start page.
        patterns: Regular expressions passed to ``get_page_links`` to select
            the links to follow. ``None`` means no links are followed.
        processed_urls: Site-relative URLs that must not be downloaded again;
            they are recorded with the marker ``"PROCESSED"``.
        debug: When true, print a line for every visited or skipped link.

    Returns:
        Dict keyed by site-relative URL. The value is the downloaded page
        content, or one of the marker strings ``"PROCESSED"``, ``"SKIPPED"``
        or ``"DUPLICATE"`` for links that were not downloaded.
    """
    from collections import deque

    patterns = [] if patterns is None else patterns
    # A set gives constant-time membership tests for large resume lists.
    already_processed = set(processed_urls) if processed_urls else set()

    web_page_contents = get_web_page(url, debug)
    base_url, remote_file = split_url(url)
    web_pages = {remote_file: web_page_contents}
    retrieved_urls = {remote_file}

    # Work queue of discovered links. Looping until it is empty guarantees that
    # every link is visited, whether or not the start page links to itself.
    pending = deque(get_page_links(web_page_contents, patterns))
    while pending:
        target_url = pending.popleft()
        if target_url in web_pages:
            continue
        is_related_document = re.search("/gerelateerde_documenten/", target_url)
        # Path with the first five segments stripped; presumably the same
        # document can be reached under several parent paths -- TODO confirm.
        document_path = "/".join(target_url.split("/")[6:])
        if target_url in already_processed:
            web_pages[target_url] = "PROCESSED"
            if debug:
                print(f"already processed {target_url}")
        elif not re.search(r"\.html*$", target_url) and not re.search(r"/[^.]*$", target_url):
            # The last path segment contains a dot but does not end in
            # ".htm"/".html", so the link is not downloaded.
            web_pages[target_url] = "SKIPPED"
            if debug:
                print(f"skipped {target_url}")
        elif is_related_document and document_path in retrieved_urls:
            web_pages[target_url] = "DUPLICATE"
            if debug:
                print(f"duplicate {target_url}")
        else:
            web_pages[target_url] = get_web_page(base_url + target_url, debug)
            pending.extend(get_page_links(web_pages[target_url], patterns))
            if is_related_document:
                retrieved_urls.add(document_path)
    return web_pages


def get_recommendation_list(web_pages):
    """Collect the distinct recommendation names linked from *web_pages*.

    Every anchor whose ``href`` starts with ``/richtlijn/`` contributes the
    path segment that directly follows that prefix.

    Args:
        web_pages: Dict whose values are HTML documents; the keys are ignored.

    Returns:
        List of recommendation names without duplicates, in order of first
        appearance.
    """
    # A dict keeps insertion order and gives O(1) duplicate detection.
    recommendations = {}
    for page in web_pages.values():
        for a in BeautifulSoup(page, "html.parser").select('a'):
            href = a.get("href")
            # Anchors without an href attribute yield None.
            if href is not None and href.startswith("/richtlijn/"):
                recommendations[href.split("/")[2]] = None
    return list(recommendations)


def save_dict(dictionary, out_file_name, mode="w"):
    """Write *dictionary* to a header-less CSV file, one ``key,value`` row per item.

    Args:
        dictionary: Mapping of keys to scalar values.
        out_file_name: Path of the CSV file to write.
        mode: File mode handed to ``DataFrame.to_csv`` ("w" overwrites,
            "a" appends).
    """
    # Build a single-row frame (one column per key) and flip it so that every
    # key becomes its own row.
    single_row = pd.DataFrame(dictionary, index=[0])
    one_row_per_key = single_row.T
    one_row_per_key.to_csv(out_file_name, header=False, mode=mode)

In [None]:
# Crawl all paginated overview pages, starting at page 1, and store them on disk.
BASE_URL = "https://richtlijnendatabase.nl"
PAGE1 = "/?page=1"

# Raw string: "\?" and "\d" are regex escapes, not Python string escapes.
main_web_pages = get_web_pages(BASE_URL+PAGE1, patterns=[r"^/\?page=\d+$"])
save_dict(main_web_pages, "main_web_pages.csv")
print(f"number of pages: {len(main_web_pages)}")

In [None]:
# Extract the distinct recommendation names linked from the downloaded overview pages.
recommendation_list = get_recommendation_list(main_web_pages)
print(f"found {len(recommendation_list)} recommendations")

In [None]:
OUTFILE = "recommendation_web_pages.csv"

# The first column of OUTFILE holds the URLs stored by earlier runs. On the
# very first run the file does not exist (or is still empty); start with an
# empty list in that case instead of failing.
try:
    processed_urls = list(pd.read_csv(OUTFILE, header=None)[0])
except (FileNotFoundError, pd.errors.EmptyDataError):
    processed_urls = []

In [None]:
# Download every recommendation whose start page is not yet listed in
# processed_urls; results are appended to OUTFILE after each recommendation.
counter = 0
last_skipped = ""
for counter, recommendation in enumerate(recommendation_list, start=1):
    print(counter,recommendation)
    if "/richtlijn/"+recommendation in processed_urls:
        continue
    recommendation_web_pages = get_web_pages(
        BASE_URL+"/richtlijn/"+recommendation,
        patterns=["^/richtlijn/", "^/gerelateerde_documenten"],
        processed_urls=processed_urls,
        debug=False,
    )
    save_dict(recommendation_web_pages, OUTFILE, mode="a")
    # Remember the new pages so later recommendations do not fetch them again.
    processed_urls.extend(recommendation_web_pages.keys())

## Derive categories of recommendations

In [21]:
def read_dict(file_name):
    """Load a header-less two-column CSV file as a dict.

    The first column supplies the keys and the second column the values.
    """
    frame = pd.read_csv(file_name, header=None, index_col=0)
    # With the first column used as index, the values live in the column labelled 1.
    return dict(frame[1])

In [22]:
# Reload the overview pages from "main_web_pages.csv" (keys: site-relative URLs).
main_web_pages = read_dict("main_web_pages.csv")

In [24]:
# Parse the first overview page. The parser is named explicitly, like in the
# other BeautifulSoup calls in this file, so the result does not depend on
# which parsers happen to be installed.
soup = BeautifulSoup(main_web_pages['/?page=1'], "html.parser")

In [25]:
# Map the value attribute of every <option> element to its visible text.
categories = {option.get("value"): option.text for option in soup.select("option")}
# Drop the option whose value is the empty string.
del categories[""]
len(categories)

51

In [26]:
# List the categories in ascending numeric order of their key.
for key in sorted(categories, key=int):
    print(key, categories[key])

2 Fysiotherapie
3 Urologie
4 Neurologie
5 Radiologie
6 Radiotherapie
7 Plastische chirurgie
8 Pathologie
9 Interne geneeskunde
11 Klinische genetica
12 Orthopedie
13 Keel-neus-oorheelkunde
14 Psycholoog
15 Abortusartsen
16 Anesthesiologie
17 Cardiologie
18 Cardio thoracale chirurgie
19 Dermatologie en venerologie
21 Heelkunde
22 Huisartsgeneeskunde
25 Kindergeneeskunde
27 Klinische geriatrie
28 Longziekten en tuberculose
29 Maag-darm-leverziekten
30 Medische microbiologie
31 Neurochirurgie
33 Nucleaire geneeskunde
34 Obstetrie en gynaecologie
36 Oogheelkunde
37 Ouderengeneeskunde
40 Psychiatrie
43 Revalidatiegeneeskunde
44 Reumatologie
45 Sociale geneeskunde
46 Spoedeisende geneeskunde
47 Sportgeneeskunde
49 Artsen verstandelijk gehandicapten
50 Verpleegkunde
51 Ziekenhuisapothekers
52 Oncologie
53 Arbeid- en bedrijfsgeneeskunde
54 Klinische fysica
55 Klinische chemie
56 Orthodontisten
57 Jeugdartsen 
58 Klinische neurofysiologie
59 Laboratorium artsen
60 Mondziekten
61 Verpleegkundige

In [50]:
BASE_URL = "https://richtlijnendatabase.nl"
BASE_QUERY = "/?query=&page=1&specialism="

def get_recommendations_per_category(categories):
    """Download the result pages of every category and list its recommendations.

    Args:
        categories: Dict mapping a category key (appended to ``BASE_QUERY`` as
            the ``specialism`` value) to its display name.

    Returns:
        Dict mapping each category key to the list of recommendation names
        found on that category's result pages.
    """
    recommendations_per_category = {}
    for key in categories:
        query = BASE_QUERY+str(key)
        # Raw string: "\?", "\&" and "\d" are regex escapes, not string escapes.
        # NOTE(review): processed_urls holds an absolute URL here while the
        # crawler compares site-relative links -- confirm this is intended.
        web_pages = get_web_pages(BASE_URL+query,
                                  patterns=[r"^/\?query=\&page=\d+"],
                                  processed_urls=[BASE_URL+query])
        print(f"category {key}; number of pages: {len(web_pages)}")
        recommendation_list = get_recommendation_list(web_pages)
        print(f"found {len(recommendation_list)} recommendations for category {key} {categories[key]}\n")
        recommendations_per_category[key] = recommendation_list
    return recommendations_per_category

In [54]:
# Run the crawl for two categories only (keys "2" and "3").
recommendations_per_category = get_recommendations_per_category({"2": "Fysiotherapie", "3": "Urologie"})

retrieved web page https://richtlijnendatabase.nl/?query=&page=1&specialism=2 (200/20168)
category 2; number of pages: 1
found 3 recommendations for category 2 Fysiotherapie

retrieved web page https://richtlijnendatabase.nl/?query=&page=1&specialism=3 (200/31804)
retrieved web page https://richtlijnendatabase.nl/?query=&page=4&specialism=3 (200/27562)
retrieved web page https://richtlijnendatabase.nl/?query=&page=2&specialism=3 (200/31727)
retrieved web page https://richtlijnendatabase.nl/?query=&page=3&specialism=3 (200/32254)
category 3; number of pages: 4
found 38 recommendations for category 3 Urologie



In [56]:
# Display the mapping from category key to its list of recommendation names.
recommendations_per_category

{'2': ['amputatie_prothesiologie_onderste_extremiteit',
  'artrose_in_heup_of_knie',
  'informatie-uitwisseling_tussen_huisarts_en_specialist_hasp'],
 '3': ['preconceptioneel_advies_bij_ags',
  'perioperatief_traject',
  'chronische_buikpijn',
  'sepsis_fase_1',
  'urineweginfecties_uwi_bij_volwassenen',
  'prostaatcarcinoom',
  'obstructieve_azoospermie',
  'niet_kleincellig_longcarcinoom',
  'behandeling_voorhuidpathologie',
  'uwi_bij_kinderen',
  'urine-incontinentie_bij_vrouwen',
  'blaasinstillatie',
  'minimaal_invasieve_chirurgie_laparoscopie',
  'niercelcarcinoom',
  'subfertiliteit',
  'hematurie',
  'urine-incontinentie_bij_kinderen',
  'blaascarcinoom_-_brachytherapie',
  'melanoom',
  'seksueel_overdraagbare_aandoeningen_soa',
  'antenatale_hydronefrose',
  'plaveiselcelcarcinoom_pcc_van_de_huid',
  'anesthesie_bij_kinderen',
  'necrotiserende_wekedeleninfecties',
  'informatie-uitwisseling_tussen_huisarts_en_specialist_hasp',
  'mannelijke_niet-neurogene_luts',
  'veilig_

In [63]:
def get_categories_per_recommendation(recommendations_per_category):
    """Invert a category -> recommendations mapping into a membership table.

    Args:
        recommendations_per_category: Dict mapping a category to the list of
            recommendations it contains.

    Returns:
        Dict mapping each recommendation to a dict with one entry per
        category: "+" when the recommendation belongs to that category and
        " " otherwise. Recommendations are ordered by the number of categories
        they belong to, highest first; ties keep their order of first
        appearance.
    """
    all_categories = list(recommendations_per_category)
    membership = {}
    for category, recommendations in recommendations_per_category.items():
        for recommendation in recommendations:
            if recommendation not in membership:
                # Start with a blank mark for every known category.
                membership[recommendation] = dict.fromkeys(all_categories, " ")
            membership[recommendation][category] = "+"

    def category_count(recommendation):
        return sum(1 for mark in membership[recommendation].values() if mark == "+")

    ranked = sorted(membership, key=category_count, reverse=True)
    return {recommendation: membership[recommendation] for recommendation in ranked}

def pretty_print(recommendations_per_category, out_file_name="richtlijnen-categorie.csv"):
    """Build a recommendation-by-category table, save it as CSV and return it.

    Rows are recommendations and columns are categories; a cell holds "+"
    when the recommendation belongs to the category and " " otherwise. Rows
    follow the order produced by ``get_categories_per_recommendation``;
    columns are ordered by the number of recommendations they contain,
    highest first.

    Side effects: sets the pandas options ``display.max_rows`` and
    ``display.max_columns`` to ``None`` and writes the table to
    *out_file_name*.

    Args:
        recommendations_per_category: Dict mapping a category to its list of
            recommendations.
        out_file_name: Path of the CSV file to write.

    Returns:
        The table as a ``pandas.DataFrame``. It is empty when no
        recommendation is present.
    """
    categories_per_recommendation = get_categories_per_recommendation(recommendations_per_category)
    # Take the column names from the input mapping rather than from the first
    # table row, so that an input without any recommendation does not fail.
    r_per_c = {
        category: {r: marks[category] for r, marks in categories_per_recommendation.items()}
        for category in recommendations_per_category
    }

    def recommendation_count(category):
        return sum(1 for mark in r_per_c[category].values() if mark == "+")

    r_per_c = {c: r_per_c[c] for c in sorted(r_per_c, key=recommendation_count, reverse=True)}
    table = pd.DataFrame(r_per_c)
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    table.to_csv(out_file_name, index_label="richtlijn")
    return table
    
# Show the table for the crawled categories; this also writes "richtlijnen-categorie.csv".
pretty_print(recommendations_per_category)

Unnamed: 0,3,2
informatie-uitwisseling_tussen_huisarts_en_specialist_hasp,+,+
amputatie_prothesiologie_onderste_extremiteit,,+
artrose_in_heup_of_knie,,+
preconceptioneel_advies_bij_ags,+,
perioperatief_traject,+,
chronische_buikpijn,+,
sepsis_fase_1,+,
urineweginfecties_uwi_bij_volwassenen,+,
prostaatcarcinoom,+,
obstructieve_azoospermie,+,
