# Scrape Website

In [1]:
from bs4 import BeautifulSoup
from colorama import Fore, Back, Style
import csv
import os
import pandas as pd
import re
import requests
import sys
import time
import urllib.parse
from IPython.display import clear_output
from przona import *

# Raise the csv field size limit as far as the platform allows, so very large
# cells (complete web pages) can be read back. sys.maxsize does not fit in a
# C long on every platform (e.g. Windows), in which case csv raises
# OverflowError; shrink the requested limit until it is accepted.
# `dummy` receives the previous limit, exactly as before.
_field_size_limit = sys.maxsize
while True:
    try:
        dummy = csv.field_size_limit(_field_size_limit)
        break
    except OverflowError:
        _field_size_limit //= 10

## Download html pages

In [2]:
# Root of the site being scraped; site-relative paths are appended to it.
BASE_URL = "https://richtlijnendatabase.nl"
# Directory (relative to the notebook) holding the CSV caches.
CSV_DIR = "csv/"
# Cache file for the paginated overview pages.
MAIN_WEB_PAGES_FILE = "main_web_pages.csv"
# Site-relative path of the first overview page; the crawl starts here.
PAGE1 = "/?page=1"
# Cache file for the recommendation pages and their linked documents.
RECOMMENDATIONS_FILE = "recommendation_web_pages.csv"

In [3]:
# Fetch the overview pages starting at page 1 and cache them on disk.
# The pattern is a raw string: "\?" and "\d" are invalid escape sequences in a
# normal string literal and trigger a warning on current Python versions.
# NOTE(review): get_web_pages presumably follows only links matching
# `patterns` -- confirm against its definition.
main_web_pages = get_web_pages(BASE_URL+PAGE1, patterns=[r"^/\?page=\d+$"])
save_dict(main_web_pages, CSV_DIR+MAIN_WEB_PAGES_FILE)
print(f"number of pages: {len(main_web_pages)}")

retrieved web page https://richtlijnendatabase.nl/?page=1 (200/30600)
retrieved web page https://richtlijnendatabase.nl/?page=4 (200/31175)
retrieved web page https://richtlijnendatabase.nl/?page=7 (200/30521)
retrieved web page https://richtlijnendatabase.nl/?page=8 (200/31983)
retrieved web page https://richtlijnendatabase.nl/?page=6 (200/33475)
retrieved web page https://richtlijnendatabase.nl/?page=10 (200/31063)
retrieved web page https://richtlijnendatabase.nl/?page=12 (200/31005)
retrieved web page https://richtlijnendatabase.nl/?page=14 (200/32686)
retrieved web page https://richtlijnendatabase.nl/?page=15 (200/30723)
retrieved web page https://richtlijnendatabase.nl/?page=17 (200/33130)
retrieved web page https://richtlijnendatabase.nl/?page=20 (200/31275)
retrieved web page https://richtlijnendatabase.nl/?page=21 (200/33569)
retrieved web page https://richtlijnendatabase.nl/?page=24 (200/32192)
retrieved web page https://richtlijnendatabase.nl/?page=27 (200/31271)
retrieved w

In [4]:
# Reuse the overview pages already held by this kernel; on a fresh kernel
# fall back to the CSV cache written by the crawl cell.
if "main_web_pages" not in globals():
    print("Reading main_web_pages from disk...")
    main_web_pages = read_dict(CSV_DIR + MAIN_WEB_PAGES_FILE)

# Collect the recommendations referenced by the overview pages.
recommendation_list = get_recommendation_list(main_web_pages)
print(f"found {len(recommendation_list)} recommendations")

found 440 recommendations


In [5]:
# Load the pages cached by earlier runs; their URLs (the keys) are handed to
# the update step as the already-processed set.
# NOTE(review): spy=True appears to make read_dict print a count -- confirm.
web_pages = read_dict(CSV_DIR + RECOMMENDATIONS_FILE, spy=True)
processed_urls = list(web_pages)

157104


In [6]:
# Update the recommendations cache file, passing the already-processed URLs.
# NOTE(review): presumably only URLs not in processed_urls are downloaded and
# appended to the file -- confirm against update_recommendations.
update_recommendations(recommendation_list, processed_urls, CSV_DIR+RECOMMENDATIONS_FILE, BASE_URL=BASE_URL)

1 het_vergrote_ovarium
2 beleid_zwangerschap_41_weken
3 anafylaxie_bij_kinderen
4 follow-up_na_cholesteatoomchirurgie
5 visuele_beperkingen_-_verwijzing_en_revalidatie
6 acuut_leverfalen
7 ddh_dysplastische_heupontwikkeling_bij_kinderen_onder_n_jaar
8 delier_bij_volwassenen_en_ouderen
9 polyfarmacie_bij_ouderen
10 postnatale_zorg_in_de_algemene_kindergeneeskunde
11 preconceptioneel_advies_bij_ags
12 pijnbehandeling_tijdens_de_bevalling
13 ziektemodulerende_behandeling_van_multiple_sclerose_bij_volwassenen
14 perioperatief_traject
15 diagnostiek_en_behandeling_van_ernstig_astma
16 high_flow_bij_kinderen
17 varicella
18 epilepsie
19 zorg_bij_eindstadium_nierfalen
20 bijtverwondingen
21 constitutioneel_eczeem
22 uwi_bij_kinderen
23 pijn_bij_patienten_met_gevorderde_stadia_van_copd_of_hartfalen
24 behandeling_van_pijn_bij_patienten_met_kanker
25 gliomen
26 gebruik_mri_bij_patienten_met_implantaten
27 bloedtransfusiebeleid
28 hidradenitis_suppurativa_hs
29 kwaliteitsstandaard_transgenderzor

In [7]:
# Re-read the cache file after the update step so web_pages reflects what is
# on disk now.
web_pages = read_dict(CSV_DIR+RECOMMENDATIONS_FILE, spy=True)

157239


## Derive categories of recommendations

In [6]:
# Constants for the category section; several repeat the values defined in
# the download section.
# Query prefix for a per-specialism listing; not referenced by any visible cell.
BASE_QUERY = "/?query=&page=1&specialism="
BASE_URL = "https://richtlijnendatabase.nl"
# Cache file for the categories derived from the first overview page.
CATEGORIES_FILE = "categories.csv"
CSV_DIR = "csv/"
MAIN_WEB_PAGES_FILE = "main_web_pages.csv"
# Cache file for the recommendations found per category.
RECOMMENDATIONS_PER_CATEGORY_FILE = "recommendations_per_category.csv"
PAGE1 = "/?page=1"

In [7]:
# Categories: derive them from the first overview page and cache them, unless
# a cache file already exists.
if not os.path.isfile(CSV_DIR + CATEGORIES_FILE):
    main_web_pages = read_dict(CSV_DIR + MAIN_WEB_PAGES_FILE)
    categories = get_categories(main_web_pages[PAGE1])
    save_dict(categories, CSV_DIR + CATEGORIES_FILE)
else:
    categories = read_dict(CSV_DIR + CATEGORIES_FILE)

# Recommendations per category: same compute-once-then-cache scheme.
if not os.path.isfile(CSV_DIR + RECOMMENDATIONS_PER_CATEGORY_FILE):
    recommendations_per_category = get_recommendations_per_category(categories)
    save_dict(recommendations_per_category, CSV_DIR + RECOMMENDATIONS_PER_CATEGORY_FILE)
else:
    recommendations_per_category = read_dict(CSV_DIR + RECOMMENDATIONS_PER_CATEGORY_FILE)

In [9]:
# Lift pandas' display limits so the full category table is rendered.
pd.set_option('display.max_rows', None)
pd.set_option('display.min_rows', None)
pd.set_option('display.max_columns', None)
pretty_df = pretty_print(recommendations_per_category, CSV_DIR+"richtlijnen-categorie.csv")
# NOTE(review): web_pages is not loaded in this section; it must already be in
# the kernel from the download section -- confirm this holds on Restart & Run All.
recommendations = get_recommendations(web_pages)
# Give every recommendation that is absent from the table one row of blanks.
# DataFrame.append was removed in pandas 2.0 and grew the frame row by row;
# build all blank rows at once and concatenate a single time instead.
# dict.fromkeys drops repeated names while keeping their first-seen order.
known_recommendations = set(pretty_df.index)
uncategorised = [
    recommendation
    for recommendation in dict.fromkeys(recommendations)
    if recommendation not in known_recommendations
]
if uncategorised:
    blank_rows = pd.DataFrame(' ', index=uncategorised, columns=pretty_df.columns)
    pretty_df = pd.concat([pretty_df, blank_rows])
# Shorten the row labels to 30 characters to keep the table readable.
pretty_df.index = [x[:30] for x in pretty_df.index]
pretty_df

Unnamed: 0,9,25,21,34,5,4,16,40,43,52,6,19,12,7,30,3,13,8,28,51,17,27,29,31,55,11,36,33,59,44,18,62,22,54,46,49,2,37,50,53,60,14,47,61,15,63,57,58,64,56,45
informatie-uitwisseling_tussen,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+,,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+
acute_neurologie,+,+,+,+,+,+,+,+,+,+,+,,+,,+,+,+,+,,+,+,+,+,+,+,,,,,,,,,,,,,,,,,,,,,,,,,,
antitrombotisch_beleid,+,+,+,+,+,+,+,,,,,,+,,,,,,+,,+,+,,,+,,,,+,,,,,,,,,,,,,,,,,,,,,,
primaire_tumor_onbekend,+,,+,+,+,+,,,,+,+,,,,,,+,+,+,,,,+,,+,,,+,,,,,,,,,,,,,,,,,,,,,,,
lymeziekte,+,+,,,,+,,+,+,,,+,,,+,,,,,+,+,,,,+,,,,+,+,,,,,,,,,,,,,,,,,,,,,
craniosynostose,,+,,+,+,+,+,,,,,,,+,,,+,,,,,,,+,,+,+,,+,,,,,,,,,,,,,,,,,,,,,,
plaveiselcelcarcinoom_pcc_van_,,,+,+,+,,,,,+,+,+,,+,,+,+,+,,,,,,,,,+,,,,,,,,,,,,,,,,,,,,,,,,
kleine_vaten_vasculitis,+,+,,,+,+,,,,,,+,,,,,+,+,+,,,,+,,,,+,,,+,,,,,,,,,,,,,,,,,,,,,
kwaliteitsstandaard_intramural,+,+,+,+,+,+,+,+,,,,,,,,,,,,,+,+,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
anesthesie_bij_kinderen,,+,+,,,,+,,,,,,+,+,,+,+,,,,,,,+,,,+,,,,+,,,,,,,,,,,,,,,,,,,,


## Summary

In [10]:
# Constants for the summary section (same values as in the download section).
CSV_DIR = "csv/"
MAIN_WEB_PAGES_FILE = "main_web_pages.csv"
RECOMMENDATIONS_FILE = "recommendation_web_pages.csv"

In [11]:
# Load both caches from disk so the summary reflects the saved state.
main_web_pages = read_dict(CSV_DIR+MAIN_WEB_PAGES_FILE)
web_pages = read_dict(CSV_DIR+RECOMMENDATIONS_FILE, spy=True)

157104


In [12]:
# Headline numbers of the scrape.
# NOTE(review): recommendations_per_category is not loaded in this section; it
# is only assigned in the category section, so that section must run first.
print(f"number of main web pages: {len(main_web_pages)}")
print(f"number of categories: {len(recommendations_per_category)}")
print(f"number of recommendations: {len(get_recommendations(web_pages))}")
print(f"number of web pages: {len(web_pages)}")
print(f"file suffixes: {count_suffixes(web_pages)}")

number of main web pages: 44
number of categories: 51
number of recommendations: 440
number of web pages: 49725
file suffixes: {'html': 43717, '': 442, 'htm': 1, 'pdf': 4965, 'pptx': 129, 'php': 226, 'ppt': 14, 'png': 23, 'docx': 135, 'jpg': 17, 'xlsx': 11, 'doc': 35, 'xls': 3, 'PNG': 1, 'tif': 6}


In [13]:
# Example: list the links of one recommendation that match a URL fragment,
# together with the length of the first element stored for each page.
# NOTE(review): web_pages[href][0] is presumably the page's HTML text -- confirm.
recommendation = "/richtlijn/abdominoplastiek"
url_part = "Algemene"

for href in get_links(web_pages, recommendation, url_part):
    print(f"url: {href}; web page size: {len(web_pages[href][0])}")



  soup = BeautifulSoup(web_pages[url][0])


url: /gerelateerde_documenten/bijlage/015124/1/60/Algemene%20inleiding.html; web page size: 2673
url: /gerelateerde_documenten/bijlage/018015/1/60/Algemene%20inleiding.html; web page size: 2180
url: /gerelateerde_documenten/bijlage/015130/1/60/Algemene%20inleiding.html; web page size: 2673
url: /gerelateerde_documenten/bijlage/018017/1/60/Algemene%20inleiding.html; web page size: 2180
url: /gerelateerde_documenten/bijlage/018010/1/60/Algemene%20inleiding.html; web page size: 2180
url: /gerelateerde_documenten/bijlage/015128/1/60/Algemene%20inleiding.html; web page size: 2673
url: /gerelateerde_documenten/bijlage/015131/1/60/Algemene%20inleiding.html; web page size: 2673
url: /gerelateerde_documenten/bijlage/015129/1/60/Algemene%20inleiding.html; web page size: 2673
url: /gerelateerde_documenten/bijlage/015125/1/60/Algemene%20inleiding.html; web page size: 2673
url: /gerelateerde_documenten/bijlage/015126/1/60/Algemene%20inleiding.html; web page size: 2673
url: /gerelateerde_documenten/

## Summary in Dutch

We hebben de webpagina's opgehaald van de website richtlijnendatabase.nl met als startpagina:

https://richtlijnendatabase.nl/?page=1

| Soort             | Aantal | Voorbeeld | Opmerking |
| :---------------- | -----: | :-------- | :-------- |
| hoofdpagina       |     42 | /?page=1 | 1-42 |
| alle webpagina's  | 47.569 | /?page=1 | |
| zonder extensie:  |    417 | /richtlijn/acne | |
| extensie: .html   | 42.190 | /gerelateerde_documenten/bijlage/001094/1/90/Afkortingen.html | |
| extensie: .pdf    |  4.435 | /gerelateerde_documenten/f/21504/Kennisdocument%20-%20Statines.pdf | |
| extensie: .php    |    153 | /richtlijn/item/pagina.php?id=24679&richtlijn_id=480&tab=1 | |
| extensie: .docx   |    135 | /gerelateerde_documenten/f/11337/Vragenlijst.docx | |
| extensie: .pptx   |    129 | /gerelateerde_documenten/f/19293/presentatie%20richtlijn%20DCD.pptx | |
| overige extensies |    110 | /gerelateerde_documenten/f/3691/Interventiegrenzen.doc | .doc:35 .htm:1 .jpg:16 .PNG: 1 .png:23 .ppt:14 .tif:6 .xls:3 .xlsx:11 |
| richtlijnen       |    418 | /richtlijn/acne | |
| categorieën       |     51 | /?query=&page=1&specialism=61 | 2-64 (missen: 10 20 23 24 26 32 35 38 39 41 42 48) |

Onder de richtlijnen bleken identieke bijlagen gelinkt te zijn onder verschillende namen. Bijvoorbeeld, op de veertien pagina's voor "/richtlijn/abdominoplastiek" werd de bijlage Algemene inleiding.html veertien keer gelinkt vanuit dertien verschillende folders:

* /gerelateerde_documenten/bijlage/015124/1/60/Algemene%20inleiding.html
* /gerelateerde_documenten/bijlage/018015/1/60/Algemene%20inleiding.html
* /gerelateerde_documenten/bijlage/015130/1/60/Algemene%20inleiding.html
* /gerelateerde_documenten/bijlage/018017/1/60/Algemene%20inleiding.html
* /gerelateerde_documenten/bijlage/018010/1/60/Algemene%20inleiding.html
* /gerelateerde_documenten/bijlage/015128/1/60/Algemene%20inleiding.html
* /gerelateerde_documenten/bijlage/015131/1/60/Algemene%20inleiding.html
* /gerelateerde_documenten/bijlage/015129/1/60/Algemene%20inleiding.html
* /gerelateerde_documenten/bijlage/015125/1/60/Algemene%20inleiding.html
* /gerelateerde_documenten/bijlage/015126/1/60/Algemene%20inleiding.html
* /gerelateerde_documenten/bijlage/015127/1/60/Algemene%20inleiding.html
* /gerelateerde_documenten/bijlage/018016/1/60/Algemene%20inleiding.html
* /gerelateerde_documenten/bijlage/015132/1/60/Algemene%20inleiding.html

Uit een analyse kwam naar voren dat het hier om twee verschillende documenten ging die konden worden onderscheiden op basis van de eerste drie cijfers in het eerste getal in het webadres (015 vs 018). Omdat we niet aan het webadres konden zien welke documenten uniek waren en welke duplicaten, hebben we alle webdocumenten opgehaald.  

## Wget download check

In [9]:
# Constants for checking the wget download against the scraped URL list.
BASE_URL = "https://richtlijnendatabase.nl"
CSV_DIR = "csv/"
RECOMMENDATIONS_FILE = "recommendation_web_pages.csv"
# Shell script that is run with a single URL argument to re-fetch that URL.
WGET = "../data/wget.sh"

In [10]:
# Load the scraped pages; their keys are the URLs checked against the download.
web_pages = read_dict(CSV_DIR+RECOMMENDATIONS_FILE, spy=True)

157239


In [11]:
# Root directory of the local wget download of the site.
BASE_DIR = "../data/richtlijnendatabase.nl"

def find_missing_files(pages=None, base_dir=None, clip_length=244):
    """Return the scraped URLs that have no counterpart in the download tree.

    A URL counts as present when its percent-decoded path (with tab characters
    written as ``%09``) exists below ``base_dir`` as a file or directory, or
    when that path cut off at ``clip_length`` characters exists as a file.

    Args:
        pages: iterable of site-relative URLs, e.g. a dict keyed by URL.
            Defaults to the notebook-level ``web_pages``.
        base_dir: root of the download tree. Defaults to ``BASE_DIR``.
        clip_length: number of path characters kept when testing for a
            truncated file. NOTE(review): 244 presumably mirrors the length
            at which the download tool clipped long names -- confirm.

    Returns:
        A tuple ``(urls, missing_recommendations)``: ``urls`` holds every
        missing URL with spaces and "ë" percent-encoded again;
        ``missing_recommendations`` holds the decoded paths of the missing
        URLs that match ``/richtlijn/<name>`` without a dot.
    """
    # Fall back to the notebook globals so the zero-argument call keeps working.
    if pages is None:
        pages = web_pages
    if base_dir is None:
        base_dir = BASE_DIR

    urls = []
    missing_recommendations = []
    for original_url in pages:
        url = urllib.parse.unquote(original_url).replace("\t", "%09")
        local_path = base_dir + url
        if (os.path.isfile(local_path)
                or os.path.isdir(local_path)
                or os.path.isfile(local_path[:clip_length])):
            continue
        if re.search("^/richtlijn/[^.]*$", url):
            missing_recommendations.append(url)
        # Re-encode the characters that would otherwise break the URL when it
        # is compared with the download log or fetched again.
        urls.append(original_url.replace(" ", "%20").replace("ë", "%C3%AB"))
    return urls, missing_recommendations

# Compare the scraped URLs with the download tree and report what is missing.
urls, missing_recommendations = find_missing_files()
print(f"missing urls: {len(urls)}, of which missing recommendations: {len(missing_recommendations)}: {missing_recommendations}")

missing urls: 553, of which missing recommendations: 5: ['/richtlijn/farmacogenetica_en_therapeutic_drug_monitoring_-_in_ontwikkeling', '/richtlijn/item', '/richtlijn/beleid_zwangerschap_41_weken', '/richtlijn/postnatale_zorg_in_de_algemene_kindergeneeskunde', '/richtlijn/neonatale_gehoorzorg_0-4_jarigen']


In [12]:
# Log file written during the wget download.
LOGFILE = "../data/logfile"

def find_cause_of_missing_files(urls, logfile=None, base_url=None):
    """Look up the download-log lines that belong to the given missing URLs.

    The log is scanned line by line. A line containing ``base_url`` starts a
    new request; the text after the base URL is taken as the site-relative
    URL. If that URL is one of ``urls`` the line is stored, and every
    following line that starts with ``HTTP`` is prepended to the stored text
    until the next request line.

    Args:
        urls: iterable of site-relative URLs to look for.
        logfile: path of the log file. Defaults to ``LOGFILE``.
        base_url: site root as it appears in the log. Defaults to the
            notebook-level ``BASE_URL``.

    Returns:
        dict mapping each URL found in the log to its collected log text.
    """
    if logfile is None:
        logfile = LOGFILE
    if base_url is None:
        base_url = BASE_URL

    wanted = set(urls)  # set membership instead of a list scan per log line
    # The base URL is literal text; escape it so "." does not match any character.
    base_pattern = re.escape(base_url)
    msgs = {}
    url = ""
    tracking = False
    # The context manager closes the file even when a line fails to process.
    with open(logfile, "r") as log:
        for line in log:
            if re.search(base_pattern, line):
                url = re.sub(".*" + base_pattern, "", line.strip())
                if url in wanted:
                    msgs[url] = line.strip()
                    tracking = True
                else:
                    tracking = False
                    url = ""
            elif tracking and line.startswith("HTTP"):
                msgs[url] = line.strip() + " " + msgs[url]
    return msgs

# Collect the log text for every missing URL that appears in the download log.
msgs = find_cause_of_missing_files(urls)
print("missing urls in logfile:",len(msgs.keys()))

missing urls in logfile: 418


In [13]:
def count_cause_of_missing_files(msgs, fetch_web_pages = False):
    """Tally the missing URLs by the HTTP status found in their log text.

    Args:
        msgs: dict mapping a site-relative URL to its collected log text.
        fetch_web_pages: when True, every URL whose log text shows neither a
            404 nor a 500 status is fetched again with the WGET script, after
            a two-second pause.

    Returns:
        A tuple ``(counter_404_error, counter_500_error, counter_other)``.

    The log text of every "other" URL is printed.
    """
    import shlex  # local import: only needed for quoting the shell argument

    counter_404_error = 0
    counter_500_error = 0

    for url, msg in msgs.items():
        if "404 Not Found" in msg:
            counter_404_error += 1
        elif "500 Internal Server Error" in msg:
            counter_500_error += 1
        else:
            print(msg)
            if fetch_web_pages:
                print("fetching", BASE_URL+url)
                time.sleep(2)
                # URLs come from scraped pages; quote them so characters such
                # as $ ` " are passed to the script literally instead of being
                # interpreted by the shell.
                os.system(WGET+" "+shlex.quote(BASE_URL+url))

    counter_other = len(msgs) - counter_404_error - counter_500_error
    return counter_404_error, counter_500_error, counter_other

# Tally the causes without re-fetching anything (fetch_web_pages defaults to False).
# The 500 label says "internal server error": that is the status text the
# counter matches, whereas "access denied" would describe a 403.
counter_404_error, counter_500_error, counter_other = count_cause_of_missing_files(msgs)
print(f"cause of missing files: 404 error (file not found): {counter_404_error}; 500 error (internal server error): {counter_500_error}; other: {counter_other}")

cause of missing files: 404 error (file not found): 387; 500 error (access denied): 31; other: 0


In [14]:
# When True, the call below this definition re-downloads every missing URL
# that has no entry in the log. Set to False for a report-only run.
fetch_web_pages = True

def find_missing_files_without_cause(urls, msgs, fetch_web_pages = False):
    """Return the missing URLs for which the log gave no explanation.

    Args:
        urls: iterable of missing site-relative URLs.
        msgs: container of the URLs that were found in the download log
            (a dict keyed by URL in this notebook).
        fetch_web_pages: when True, each unexplained URL is fetched again with
            the WGET script, after a two-second pause.

    Returns:
        list of the URLs from ``urls`` that are not in ``msgs``, in input order.
    """
    import shlex  # local import: only needed for quoting the shell argument

    list_without_cause = []
    for url in urls:
        if url in msgs:
            continue
        list_without_cause.append(url)
        if fetch_web_pages:
            print("fetching", BASE_URL+url)
            time.sleep(2)
            # URLs come from scraped pages; quote them so the shell passes
            # them to the script literally.
            os.system(WGET+" "+shlex.quote(BASE_URL+url))
    return list_without_cause

# Handle the missing URLs that never appeared in the log; with the
# fetch_web_pages flag set to True this starts one download per URL.
print(f"Processed: {len(find_missing_files_without_cause(urls, msgs, fetch_web_pages=fetch_web_pages))} urls")

fetching https://richtlijnendatabase.nl/richtlijn/beleid_zwangerschap_41_weken
fetching https://richtlijnendatabase.nl/gerelateerde_documenten/bijlage/022596/1/100/Counseling%20volgens%20BRAINS-methode.html
fetching https://richtlijnendatabase.nl/richtlijn/beleid_zwangerschap_41_weken/antepartum_foetale_bewaking_vanaf_41_weken_bij_expectatief_beleid.html
fetching https://richtlijnendatabase.nl/gerelateerde_documenten/bijlage/022597/1/100/Counseling%20volgens%20BRAINS-methode.html
fetching https://richtlijnendatabase.nl/richtlijn/beleid_zwangerschap_41_weken/beleid_zwangerschap_vanaf_41_weken.html
fetching https://richtlijnendatabase.nl/richtlijn/postnatale_zorg_in_de_algemene_kindergeneeskunde
fetching https://richtlijnendatabase.nl/gerelateerde_documenten/bijlage/022286/1/80/Implementatieplan.html
fetching https://richtlijnendatabase.nl/richtlijn/postnatale_zorg_in_de_algemene_kindergeneeskunde/screening_preventie_en_behandeling_van_neonatale_hypoglycemie/interventie_of_behandeling_bi

In [16]:
def rename_web_pages(web_pages, rename_web_pages = False, base_dir=None, clip_length=244):
    """Count, and optionally rename, downloaded files stored under a clipped path.

    A URL is relevant when its percent-decoded path (tabs written as ``%09``)
    does not exist below ``base_dir`` as a file or directory, but the same
    path cut off at ``clip_length`` characters does exist as a file.

    Args:
        web_pages: iterable of site-relative URLs, e.g. a dict keyed by URL.
        rename_web_pages: when True, each clipped file is renamed, within its
            directory, to the last path segment of the decoded URL. When False
            (the default) the files are only counted.
        base_dir: root of the download tree. Defaults to the notebook-level
            ``BASE_DIR``.
        clip_length: number of path characters at which the clipped variant is
            looked up (244, as before, by default).

    Prints the number of relevant pages and returns None.
    """
    if base_dir is None:
        base_dir = BASE_DIR

    counter = 0
    for original_url in web_pages:
        url = urllib.parse.unquote(original_url).replace("\t", "%09")
        full_path = base_dir + url
        clipped_path = full_path[:clip_length]
        if (os.path.isfile(full_path)
                or os.path.isdir(full_path)
                or not os.path.isfile(clipped_path)):
            continue
        counter += 1
        if rename_web_pages:
            directory = clipped_path.rsplit("/", 1)[0]
            file_name_complete = url.split("/")[-1]
            target_path = f"{directory}/{file_name_complete}"
            # Rename through the os module rather than an unquoted shell
            # "mv": decoded names often contain spaces or shell characters.
            try:
                os.replace(clipped_path, target_path)
            except OSError as error:
                # Keep going, as the shell command did, but report the failure.
                print(f"could not rename {clipped_path}: {error}")
    print(f"found {counter} relevant webpages")
                
# Dry run: the rename flag keeps its default of False, so this only reports
# how many clipped files were found.
rename_web_pages(web_pages)

found 0 relevant webpages
