# Scrape Website

In [1]:
from bs4 import BeautifulSoup
from colorama import Fore, Back, Style
import csv
import os
import pandas as pd
import re
import requests
import sys
import time
from IPython.display import clear_output
from przona import *

# Raise the csv field size limit as far as the platform allows.
# csv.field_size_limit() raises OverflowError when the value does not fit in a
# C long (e.g. sys.maxsize on Windows), so halve the value until it is accepted.
_field_size_limit = sys.maxsize
while True:
    try:
        # The call returns the previous limit; it is stored in `dummy`
        # (presumably only so the notebook cell does not echo it).
        dummy = csv.field_size_limit(_field_size_limit)
        break
    except OverflowError:
        _field_size_limit //= 2

## Download html pages

In [4]:
# Site root and the relative address of the first overview page
# (the two are concatenated when the download starts).
BASE_URL = "https://richtlijnendatabase.nl"
PAGE1 = "/?page=1"

# Directory prefix and file names of the CSV files used by this section.
CSV_DIR = "csv/"
MAIN_WEB_PAGES_FILE = "main_web_pages.csv"
RECOMMENDATIONS_FILE = "recommendation_web_pages.csv"

In [None]:
# Fetch the overview pages starting at BASE_URL+PAGE1, passing a pattern that
# matches relative links of the form "/?page=<number>", then store the result
# as CSV and report how many pages were collected.
# The pattern is a raw string: "\?" and "\d" are regex escapes, not Python
# string escapes, and a non-raw literal triggers an invalid-escape warning.
main_web_pages = get_web_pages(BASE_URL+PAGE1, patterns=[r"^/\?page=\d+$"])
save_dict(main_web_pages, CSV_DIR+MAIN_WEB_PAGES_FILE)
print(f"number of pages: {len(main_web_pages)}")

In [None]:
# Build the recommendation list from the main overview pages and report its length.
# get_recommendation_list is not defined in this notebook (presumably it comes
# from the star import of przona).
recommendation_list = get_recommendation_list(main_web_pages)
print(f"found {len(recommendation_list)} recommendations")

In [None]:
# Keys of the dictionary read from the recommendations CSV, as a list.
# NOTE(review): presumably these are the URLs that were already downloaded — confirm.
processed_urls = list(read_dict(CSV_DIR+RECOMMENDATIONS_FILE).keys())

In [None]:
# Call update_recommendations with the recommendation list and the already-processed URLs.
# NOTE(review): out_file_name is not assigned in any visible cell; unless the
# przona star import provides it this raises NameError. Presumably
# CSV_DIR+RECOMMENDATIONS_FILE was meant — confirm before running.
update_recommendations(recommendation_list, processed_urls, out_file_name)

## Derive categories of recommendations

In [28]:
# Site root, the first overview page, and the query prefix that ends in
# "specialism=" (a category number goes after it).
BASE_URL = "https://richtlijnendatabase.nl"
PAGE1 = "/?page=1"
BASE_QUERY = "/?query=&page=1&specialism="

# Directory prefix and file names of the CSV files used by this section.
CSV_DIR = "csv/"
MAIN_WEB_PAGES_FILE = "main_web_pages.csv"
CATEGORIES_FILE = "categories.csv"
RECOMMENDATIONS_PER_CATEGORY_FILE = "recommendations_per_category.csv"

In [62]:
# Categories: when no cached CSV exists, derive them from the stored PAGE1
# entry of the main pages and cache the result; otherwise read the cache.
if not os.path.isfile(CSV_DIR + CATEGORIES_FILE):
    main_web_pages = read_dict(CSV_DIR + MAIN_WEB_PAGES_FILE)
    categories = get_categories(main_web_pages[PAGE1])
    save_dict(categories, CSV_DIR + CATEGORIES_FILE)
else:
    categories = read_dict(CSV_DIR + CATEGORIES_FILE)

# Recommendations per category: same cache-or-compute scheme, computed from
# the categories obtained above.
if not os.path.isfile(CSV_DIR + RECOMMENDATIONS_PER_CATEGORY_FILE):
    recommendations_per_category = get_recommendations_per_category(categories)
    save_dict(
        recommendations_per_category,
        CSV_DIR + RECOMMENDATIONS_PER_CATEGORY_FILE,
    )
else:
    recommendations_per_category = read_dict(
        CSV_DIR + RECOMMENDATIONS_PER_CATEGORY_FILE
    )

In [237]:
# Display the complete table: no row or column truncation.
pd.set_option('display.max_rows', None)
pd.set_option('display.min_rows', None)
pd.set_option('display.max_columns', None)
pretty_df = pretty_print(recommendations_per_category)
# NOTE: web_pages is loaded in a later cell of this notebook; run that cell first.
recommendations = get_recommendations(web_pages)

# Recommendations that are missing from the table get a row of single blanks
# so that they are displayed as well. dict.fromkeys() drops repeated names
# while keeping their order, so each missing recommendation is added once.
# DataFrame.append was removed in pandas 2.0, hence one pd.concat for all rows;
# using pretty_df.columns also works when the table has no rows yet.
missing_recommendations = [
    name for name in dict.fromkeys(recommendations) if name not in pretty_df.index
]
if missing_recommendations:
    blank_rows = pd.DataFrame(' ', index=missing_recommendations, columns=pretty_df.columns)
    pretty_df = pd.concat([pretty_df, blank_rows])

# Shorten the row labels to at most 30 characters for display.
pretty_df.index = [x[:30] for x in pretty_df.index]
pretty_df

Unnamed: 0,9,25,21,34,5,4,16,40,43,52,6,19,12,7,30,3,13,8,28,51,17,27,29,31,55,11,36,33,59,44,18,62,22,54,46,49,2,37,50,53,60,14,47,61,15,63,57,58,64,56,45
informatie-uitwisseling_tussen,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+,,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+,+
acute_neurologie,+,+,+,+,+,+,+,+,+,+,+,,+,,+,+,+,+,,+,+,+,+,+,+,,,,,,,,,,,,,,,,,,,,,,,,,,
antitrombotisch_beleid,+,+,+,+,+,+,+,,,,,,+,,,,,,+,,+,+,,,+,,,,+,,,,,,,,,,,,,,,,,,,,,,
primaire_tumor_onbekend,+,,+,+,+,+,,,,+,+,,,,,,+,+,+,,,,+,,+,,,+,,,,,,,,,,,,,,,,,,,,,,,
lymeziekte,+,+,,,,+,,+,+,,,+,,,+,,,,,+,+,,,,+,,,,+,+,,,,,,,,,,,,,,,,,,,,,
craniosynostose,,+,,+,+,+,+,,,,,,,+,,,+,,,,,,,+,,+,+,,+,,,,,,,,,,,,,,,,,,,,,,
plaveiselcelcarcinoom_pcc_van_,,,+,+,+,,,,,+,+,+,,+,,+,+,+,,,,,,,,,+,,,,,,,,,,,,,,,,,,,,,,,,
kleine_vaten_vasculitis,+,+,,,+,+,,,,,,+,,,,,+,+,+,,,,+,,,,+,,,+,,,,,,,,,,,,,,,,,,,,,
kwaliteitsstandaard_intramural,+,+,+,+,+,+,+,+,,,,,,,,,,,,,+,+,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
anesthesie_bij_kinderen,,+,+,,,,+,,,,,,+,+,,+,+,,,,,,,+,,,+,,,,+,,,,,,,,,,,,,,,,,,,,


## Summary

In [101]:
# Directory prefix of the CSV files, followed by the two files read for the summary.
CSV_DIR = "csv/"

MAIN_WEB_PAGES_FILE = "main_web_pages.csv"
RECOMMENDATIONS_FILE = "recommendation_web_pages.csv"

In [104]:
# Read both page dictionaries back from their CSV files.
# NOTE(review): the effect of spy=True is defined by read_dict, which is not
# visible in this notebook — confirm against its definition.
main_web_pages = read_dict(CSV_DIR+MAIN_WEB_PAGES_FILE)
web_pages = read_dict(CSV_DIR+RECOMMENDATIONS_FILE, spy=True)

In [176]:
# Print the sizes of the collections loaded or computed in earlier cells.
# recommendations_per_category comes from the categories section of this notebook.
print(f"number of main web pages: {len(main_web_pages)}")
print(f"number of categories: {len(recommendations_per_category)}")
print(f"number of recommendations: {len(get_recommendations(web_pages))}")
print(f"number of web pages: {len(web_pages)}")
# count_suffixes is called without arguments, so it does not receive web_pages
# from this cell; where it takes its data from is not visible here.
print(f"file suffixes: {count_suffixes()}")

number of main web pages: 42
number of categories: 51
number of recommendations: 418
number of web pages: 47689
file suffixes: {'html': 42307, '': 420, 'htm': 1, 'pdf': 4435, 'pptx': 129, 'php': 153, 'ppt': 14, 'png': 23, 'docx': 135, 'jpg': 16, 'xlsx': 11, 'doc': 35, 'xls': 3, 'PNG': 1, 'tif': 6}


In [117]:
# Example query: the links that get_links returns for one recommendation and
# one URL fragment.
# NOTE(review): how get_links uses url_part (presumably a substring filter on
# the link address) is not visible here — confirm.
recommendation = "/richtlijn/abdominoplastiek"
url_part = "Algemene"

for href in get_links(web_pages, recommendation, url_part):
    # The reported size is the length of the first element stored for the URL
    # (presumably the page text).
    print(f"url: {href}; web page size: {len(web_pages[href][0])}")

url: /gerelateerde_documenten/bijlage/015124/1/60/Algemene%20inleiding.html; web page size: 2673
url: /gerelateerde_documenten/bijlage/018015/1/60/Algemene%20inleiding.html; web page size: 2180
url: /gerelateerde_documenten/bijlage/015130/1/60/Algemene%20inleiding.html; web page size: 2673
url: /gerelateerde_documenten/bijlage/018017/1/60/Algemene%20inleiding.html; web page size: 2180
url: /gerelateerde_documenten/bijlage/018010/1/60/Algemene%20inleiding.html; web page size: 2180
url: /gerelateerde_documenten/bijlage/015128/1/60/Algemene%20inleiding.html; web page size: 2673
url: /gerelateerde_documenten/bijlage/015131/1/60/Algemene%20inleiding.html; web page size: 2673
url: /gerelateerde_documenten/bijlage/015129/1/60/Algemene%20inleiding.html; web page size: 2673
url: /gerelateerde_documenten/bijlage/015125/1/60/Algemene%20inleiding.html; web page size: 2673
url: /gerelateerde_documenten/bijlage/015126/1/60/Algemene%20inleiding.html; web page size: 2673
url: /gerelateerde_documenten/

## Summary in Dutch

We hebben de webpagina's opgehaald van de website richtlijnendatabase.nl met als startpagina:

https://richtlijnendatabase.nl/?page=1

| Soort             | Aantal | Voorbeeld | Opmerking |
| :---------------- | -----: | :-------- | :-------- |
| hoofdpagina       |     42 | /?page=1 | 1-42 |
| alle webpagina's  | 47.569 | /?page=1 | |
| zonder extensie:  |    417 | /richtlijn/acne | |
| extensie: .html   | 42.190 | /gerelateerde_documenten/bijlage/001094/1/90/Afkortingen.html | |
| extensie: .pdf    |  4.435 | /gerelateerde_documenten/f/21504/Kennisdocument%20-%20Statines.pdf | |
| extensie: .php    |    153 | /richtlijn/item/pagina.php?id=24679&richtlijn_id=480&tab=1 | |
| extensie: .docx   |    135 | /gerelateerde_documenten/f/11337/Vragenlijst.docx | |
| extensie: .pptx   |    129 | /gerelateerde_documenten/f/19293/presentatie%20richtlijn%20DCD.pptx | |
| overige extensies |    110 | /gerelateerde_documenten/f/3691/Interventiegrenzen.doc | .doc:35 .htm:1 .jpg:16 .PNG: 1 .png:23 .ppt:14 .tif:6 .xls:3 .xlsx:11 |
| richtlijnen       |    418 | /richtlijn/acne | |
| categorieën       |     51 | /?query=&page=1&specialism=61 | 2-64 (missen: 10 20 23 24 26 32 35 38 39 41 42 48) |

Onder de richtlijnen bleken identieke bijlagen gelinkt te zijn onder verschillende namen. Bijvoorbeeld, op de veertien pagina's voor "/richtlijn/abdominoplastiek" werd de bijlage Algemene inleiding.html veertien keer gelinkt vanuit dertien verschillende folders:

* /gerelateerde_documenten/bijlage/015124/1/60/Algemene%20inleiding.html
* /gerelateerde_documenten/bijlage/018015/1/60/Algemene%20inleiding.html
* /gerelateerde_documenten/bijlage/015130/1/60/Algemene%20inleiding.html
* /gerelateerde_documenten/bijlage/018017/1/60/Algemene%20inleiding.html
* /gerelateerde_documenten/bijlage/018010/1/60/Algemene%20inleiding.html
* /gerelateerde_documenten/bijlage/015128/1/60/Algemene%20inleiding.html
* /gerelateerde_documenten/bijlage/015131/1/60/Algemene%20inleiding.html
* /gerelateerde_documenten/bijlage/015129/1/60/Algemene%20inleiding.html
* /gerelateerde_documenten/bijlage/015125/1/60/Algemene%20inleiding.html
* /gerelateerde_documenten/bijlage/015126/1/60/Algemene%20inleiding.html
* /gerelateerde_documenten/bijlage/015127/1/60/Algemene%20inleiding.html
* /gerelateerde_documenten/bijlage/018016/1/60/Algemene%20inleiding.html
* /gerelateerde_documenten/bijlage/015132/1/60/Algemene%20inleiding.html

Uit een analyse kwam naar voren dat het hier om twee verschillende documenten ging die konden worden onderscheiden op basis van de eerste drie cijfers in het eerste getal in het webadres (015 vs 018). Omdat we niet aan het webadres konden zien welke documenten uniek waren en welke duplicaten, hebben we alle webdocumenten opgehaald.  

## Wget download

In [12]:
# Site root and the path of the wget wrapper script.
BASE_URL = "https://richtlijnendatabase.nl"
WGET = "../data/wget.sh"

# Directory prefix and name of the CSV file that holds the recommendation pages.
CSV_DIR = "csv/"
RECOMMENDATIONS_FILE = "recommendation_web_pages.csv"

In [3]:
# Read the recommendation pages from their CSV file; the keys are iterated as URLs below.
# NOTE(review): the effect of spy=True is defined by read_dict, which is not
# visible in this notebook — confirm against its definition.
web_pages = read_dict(CSV_DIR+RECOMMENDATIONS_FILE, spy=True)

155051


In [30]:
BASE_DIR = "../data/richtlijnendatabase.nl"
LOG_FILE = "../data/logfile"

# Percent-encoded sequences that occur in the scraped URLs and the characters
# they are replaced by before the URL is looked up below BASE_DIR. This is an
# explicit table (not urllib.parse.unquote) so that no other escape is decoded.
URL_ESCAPES = (
    ("%20", " "),
    ("%C3%AB", "ë"),
    ("%C3%A9", "é"),
    ("%C3%B6", "ö"),
    ("%C3%AF", "ï"),
    ("%C3%AA", "ê"),
    ("%D1%97", "ї"),
    ("%E2%80%99", "’"),
    ("%E2%80%98", "‘"),
    ("%E2%89%A4", "≤"),
    ("%E2%80%93", "–"),
    ("%E2%80%9C", "“"),
    ("%E2%80%9D", "”"),
    ("%3C", "<"),
    ("%3E", ">"),
    ("%22", '"'),
    ("%5B", '['),
    ("%5D", ']'),
)


def _decode_url(url):
    """Return url with every sequence listed in URL_ESCAPES replaced."""
    for escaped, character in URL_ESCAPES:
        url = url.replace(escaped, character)
    return url


def _is_on_disk(url):
    """Return True if BASE_DIR+url exists as a file or as a directory.

    A file at the first 244 characters of the path is accepted as well.
    NOTE(review): presumably over-long file names were shortened when they
    were saved — confirm the 244 limit against the download script.
    """
    path = BASE_DIR+url
    return os.path.isfile(path) or os.path.isdir(path) or os.path.isfile(path[:244])


def _log_status(original_url, log_lines):
    """Return the log information for original_url.

    The result is "<HTTP line> <request line>", where the request line is the
    first log line that contains original_url and the HTTP line is the first
    line starting with "HTTP" after it. If no HTTP line follows, only the
    request line is returned; if the URL does not occur at all, "" is returned.

    Only the first HTTP line is used. The previous implementation kept
    prepending every later HTTP line of the whole log, so a 404 of an
    unrelated request could be attributed to this URL. The URL is matched as
    a plain substring, so regex metacharacters in it need no escaping.
    """
    lines = iter(log_lines)
    for line in lines:
        if original_url in line:
            request_line = line.strip()
            # Continue on the same iterator: only lines after the request line.
            for later_line in lines:
                if later_line.startswith("HTTP"):
                    return later_line.strip()+" "+request_line
            return request_line
    return ""


file_counter = 0
rec_counter = 0
# Despite its name this counts missing files whose log entry is NOT a 404
# (it is reported as "non 404" in the summary line).
counter_404 = 0
log_lines = None
for original_url in web_pages:
    url = _decode_url(original_url)
    if _is_on_disk(url):
        continue
    file_counter += 1
    # Recommendation pages: paths below /richtlijn/ that contain no dot.
    if re.search("^/richtlijn/[^.]*$", url):
        rec_counter += 1
    # Read the log once, and only when at least one file is missing.
    if log_lines is None:
        with open(LOG_FILE, "r") as logfile:
            log_lines = logfile.readlines()
    msg = _log_status(original_url, log_lines)
    # print(file_counter, msg, url)
    if "404 Not Found" not in msg:
        counter_404 += 1
        print(file_counter, msg, url)
        #print("fetching", BASE_URL+url)
        #time.sleep(2)
        #os.system(WGET+" "+BASE_URL+original_url)
print(f"missing files: {file_counter} ({rec_counter} recommendations; {counter_404} non 404)")

205  /gerelateerde_documenten/bijlage/011471/1/1/Stroomdiagram: Advies voor perioperatief beleid bij patiënten di.html
213  /gerelateerde_documenten/bijlage/012145/1/3/Indeling studies en niveau van conclusie.html
214  /gerelateerde_documenten/bijlage/001315/1/1/Evidence tabellen.html
215  /gerelateerde_documenten/bijlage/001314/1/1/Evidence tabellen.html
266  /gerelateerde_documenten/bijlage/016724/1/20/Patiënteninformatie mannelijke niet/neurogene LUTS.html
269  /gerelateerde_documenten/bijlage/001250/1/1/Samenhang tussen wachttijd en prognose bij patiënten met longkan.html
270  /richtlijn/notitie.php?notitie_id=27&check=t
272  /richtlijn/referentie.php?ref_id=7&check=t
273  /richtlijn/referentie.php?ref_id=37&check=t
287  /gerelateerde_documenten/bijlage/009338/1/10/Stroomdiagram behandelalgoritme hemorroïden.html
328  /richtlijn/notitie.php?notitie_id=7&check=t
329  /richtlijn/referentie.php?ref_id=2&check=t
330  /richtlijn/referentie.php?ref_id=5&check=t
331  /richtlijn/referentie