# Scrape Website

In [1]:
from bs4 import BeautifulSoup
from colorama import Fore, Back, Style
import csv
import os
import pandas as pd
import re
import requests
import sys
import time
import urllib.parse
from IPython.display import clear_output
from przona import *

# Raise the csv module's maximum field size to sys.maxsize. The call returns
# the previous limit; it is bound to `dummy`, presumably only to keep the
# notebook cell from echoing that value.
dummy = csv.field_size_limit(sys.maxsize)

## Get web pages with python's request (2021-04-19)

In [2]:
def split_url(url):
    """Split *url* into a (base, path) pair.

    For a URL starting with http:// or https:// (case-insensitive) the base
    is scheme plus host and the path is everything after it, always starting
    with "/". Any other string is returned unchanged as the path, with an
    empty base.
    """
    if not re.search("^https?://", url, flags=re.IGNORECASE):
        return ("", url)
    pieces = url.split("/")
    # pieces[:3] is ["https:", "", host]; the rest is the path.
    base = "/".join(pieces[:3])
    path = "/" + "/".join(pieces[3:])
    return (base, path)

In [3]:
def get_web_page(url, debug=True, timeout=30):
    """Download *url* and return the raw response body.

    Args:
        url: absolute URL; must start with "https://".
        debug: when true, print a line for every page retrieved with status 200.
        timeout: seconds passed to requests.get, so that an unresponsive
            server raises an exception instead of blocking the crawl forever.

    Returns:
        The response body as bytes. The body is returned for every status
        code; a non-200 status is only reported on stdout.

    Raises:
        AssertionError: if *url* does not start with "https://".
        requests.exceptions.RequestException: on connection errors or timeouts.
    """
    assert re.search(r"^https://", url), f"get_web_page: url has unexpected format: {url}"
    # Fixed one-second pause before every request.
    time.sleep(1)
    web_page = requests.get(url, timeout=timeout)
    if web_page.status_code == 200:
        if debug:
            print(f"retrieved web page {url} (200/{len(web_page.content)})")
    else:
        print(Fore.RED, f"web page {url} returned status code {web_page.status_code}", Style.RESET_ALL)
    return web_page.content

In [4]:
def get_page_links(web_page, patterns=()):
    """Return the hrefs of the <a> elements in *web_page* that match a pattern.

    Args:
        web_page: HTML document (text or bytes) to parse.
        patterns: regular expressions; an href is kept when re.search finds
            at least one of them in it. With no patterns nothing is returned.

    Returns:
        List of matching href strings in document order. Each anchor
        contributes its href at most once, even when several patterns match.
    """
    page_links = []
    for a in BeautifulSoup(web_page, "html.parser").select('a'):
        href = a.get("href")
        # Anchors without an href yield None; skip them explicitly instead of
        # relying on re.search raising TypeError.
        if not isinstance(href, str):
            continue
        if any(re.search(pattern, href) for pattern in patterns):
            page_links.append(href)
    return page_links

In [5]:
def store_content(remote_file, url, content):
    """Write a downloaded page to its local file.

    Args:
        remote_file: path of the page on the web site.
        url: URL the page was fetched from.
        content: page body as UTF-8 encoded bytes.

    Raises:
        UnicodeDecodeError: if *content* is not valid UTF-8. Decoding happens
            before the file is created, so a failure leaves no empty file
            behind.
    """
    out_file_name = make_local_file_name(remote_file, url)
    text = content.decode("utf8")
    out_dir = os.path.dirname(out_file_name)
    if out_dir:
        # Create all missing parent folders in one call.
        os.makedirs(out_dir, exist_ok=True)
    # Write with the same encoding the content was decoded with, independent
    # of the locale; the context manager closes the file on errors too.
    with open(out_file_name, "w", encoding="utf8") as out_file:
        print(text, file=out_file)

In [14]:
def make_local_file_name(remote_file, url):
    """Return the local path under ../data/ where a remote page is stored.

    The site folder is named after the last two labels of the URL's host
    name (e.g. "www.example.nl" -> "../data/example.nl") and is created when
    missing. A remote path ending in ".html" is used as-is; any other path is
    treated as a folder and gets "index.html" appended.

    Args:
        remote_file: path of the page on the web site, e.g. "/a/b.html".
        url: URL of the page; only its host name is used.

    Returns:
        The relative local file name as a string.
    """
    # Take the host from the parsed URL. Splitting the whole URL on "." also
    # picks up dots in the path ("page.html") and, for hosts without "www.",
    # the "https://" prefix. URLs without a scheme fall back to the text
    # before the first "/".
    host = urllib.parse.urlsplit(url).hostname or url.split("/", 1)[0]
    base_dir = f"../data/{'.'.join(host.split('.')[-2:])}"
    os.makedirs(base_dir, exist_ok=True)
    if not remote_file.endswith(".html"):
        remote_file = os.path.join(remote_file, "index.html")
    # Strip every leading "/": os.path.join would otherwise discard base_dir.
    return os.path.join(base_dir, remote_file.lstrip("/"))

In [7]:
def get_web_pages(url, patterns=(), processed_urls=None, debug=True):
    """Recursively download *url* and the pages it links to.

    Every page is fetched with get_web_page, written to disk with
    store_content and scanned with get_page_links; matching links are
    followed in sorted order. PDF files, paths containing "?" and paths
    already in *processed_urls* are skipped. A page whose local file already
    exists is not downloaded again unless that file is an index.html.

    Args:
        url: absolute URL of the page to start from.
        patterns: regular expressions selecting the links to follow.
        processed_urls: dict updated in place, mapping each visited remote
            path to its content ("" for pages found on disk). A fresh dict is
            used when omitted.
        debug: passed on to get_web_page.

    Returns:
        None; results are collected in *processed_urls* and on disk.
    """
    # The original default was a shared list, which cannot be indexed by
    # path; create a new dict per top-level call instead.
    if processed_urls is None:
        processed_urls = {}
    base_url, remote_file = split_url(url)
    if remote_file in processed_urls or remote_file.endswith(".pdf") or "?" in remote_file:
        return
    out_file_name = make_local_file_name(remote_file, url)
    if os.path.isfile(out_file_name) and not re.search("index.html$", out_file_name):
        processed_urls[remote_file] = ""
        return
    content = get_web_page(url, debug)
    processed_urls[remote_file] = content
    store_content(remote_file, url, content)
    for page_link in sorted(get_page_links(content, patterns)):
        get_web_pages(base_url + page_link, patterns, processed_urls, debug)
    return

In [15]:
# Alternative crawl target, kept for reference:
#url = "https://richtlijnendatabase.nl/"
#patterns = ["^/richtlijn/", "^/en/richtlijn/"]
# Start page and the link prefixes (regular expressions matched against each
# href) that the crawler follows.
url = "https://www.ggzstandaarden.nl/"
patterns = ["^/generieke-modules/", "^/richtlijnen/", "^/zorgstandaarden/",]

# get_web_pages fills this dict in place: remote path -> page content
# ("" for pages that were already on disk).
processed_urls = {}
debug = True
get_web_pages(url, patterns, processed_urls, debug)
# Number of distinct remote paths visited; echoed as the cell result.
len(processed_urls)

retrieved web page https://www.ggzstandaarden.nl/ (200/2743)


1

## Read html pages from disk (after wget download) 2021-03-31

**WARNING**: on Linux, wget truncates long file names, which results in information loss. There seems to be no solution for this.

In [None]:
# Folder holding the downloaded richtlijnendatabase.nl pages. The trailing
# slash matters: later cells build paths by plain string concatenation.
DATA_DIR = "../data/richtlijnendatabase.nl/"

### Check index.html

In [None]:
def read_index_file(data_dir, index_file_name="index.html"):
    """Return the complete text of an index file.

    Args:
        data_dir: folder containing the file. It is concatenated with the
            file name as-is, so it must end with a path separator.
        index_file_name: name of the file inside *data_dir*.

    Returns:
        The file contents as a single string.
    """
    # The context manager closes the file even when reading fails.
    with open(data_dir + index_file_name, "r") as infile:
        return infile.read()

In [None]:
def get_recommendations(index_file_data):
    """Collect the recommendation paths linked from an index page.

    A link starting with "/richtlijn/" is reduced to its first two path
    segments ("/richtlijn/<name>"); a link starting with "/en/richtlijn/" to
    its first three ("/en/richtlijn/<name>"). All other links are ignored.

    Args:
        index_file_data: HTML text of the index page.

    Returns:
        Dict with the recommendation paths as keys and True as every value.
    """
    recommendations = {}
    # Name the parser explicitly, as get_page_links does, so the result does
    # not depend on which parsers happen to be installed.
    soup = BeautifulSoup(index_file_data, "html.parser")
    for a in soup.select("a"):
        # Anchors without an href (a["href"] would raise KeyError) are skipped.
        href = a.get("href")
        if not isinstance(href, str):
            continue
        if re.search("^/richtlijn/", href):
            depth = 3
        elif re.search("^/en/richtlijn/", href):
            depth = 4
        else:
            continue
        recommendations["/".join(href.split("/")[:depth])] = True
    return recommendations

In [None]:
# Recommendations linked from the index.html of the current download.
index_file_data = read_index_file(DATA_DIR)
recommendations = get_recommendations(index_file_data)
print(f"found: {len(recommendations)} recommendations ")

### Check /richtlijnen folder

**WARNING**: the existence of a folder does not mean that the recommendation exists on the website. A folder may contain only a single file index.html, which reports that the recommendation is not present on the website (message *Oeps, pagina niet gevonden*).

In [None]:
# Recommendation folders of the current download: the entries of richtlijn/
# and en/richtlijn/, each prefixed with its site-relative path.
recommendation_files = sorted(os.listdir(DATA_DIR + "richtlijn"))
recommendation_files = ["/richtlijn/" + file_name for file_name in recommendation_files]
recommendation_files_en = sorted(os.listdir(DATA_DIR + "en/richtlijn"))
recommendation_files_en = ["/en/richtlijn/" + file_name for file_name in recommendation_files_en]
# The English entries are appended after the Dutch ones.
recommendation_files.extend(recommendation_files_en)
print(f"found: {len(recommendation_files)} recommendations ")

In [None]:
def process(root, dirs, files):
    """Read the regular files of one os.walk() directory.

    Args:
        root: directory path, as yielded by os.walk().
        dirs: sub-directory names from os.walk(); not used.
        files: names of the files inside *root*.

    Returns:
        Dict mapping each file's path, with its first three "/"-separated
        components replaced by a single leading "/", to the file's text.
    """
    file_texts = {}
    for file_name in files:
        file_name_with_directory = os.path.join(root, file_name)
        if not os.path.isfile(file_name_with_directory):
            continue
        # The context manager closes the file even when reading fails.
        with open(file_name_with_directory, "r") as infile:
            text = infile.read()
        # Drop the first three path components; for the data folders used in
        # this notebook these are "..", "data" and the site folder.
        site_path = "/" + "/".join(file_name_with_directory.split("/")[3:])
        file_texts[site_path] = text
    return file_texts

def make_file_texts(data_dir):
    """Read every file below richtlijn/ and en/richtlijn/ in *data_dir*.

    Args:
        data_dir: download folder; concatenated with the sub-folder names
            as-is, so it must end with a path separator.

    Returns:
        Dict mapping site-relative file paths to file texts, as built by
        process().
    """
    file_texts = {}
    for sub_dir in ("richtlijn", "en/richtlijn"):
        for root, dirs, files in os.walk(data_dir + sub_dir):
            # process() already handles all files of the directory, so it is
            # called once per directory rather than once per file.
            file_texts.update(process(root, dirs, files))
    return file_texts

In [None]:
# Load the text of every downloaded recommendation page into memory.
# `if True:` presumably serves as a manual on/off switch for this cell.
# NOTE(review): len(file_texts) sits inside the if-block, so the notebook is
# not expected to echo it.
if True:
    file_texts = make_file_texts(DATA_DIR)
    len(file_texts)

In [None]:
# Persist the page texts as csv. save_dict is not defined in this notebook;
# presumably it is provided by the przona star import.
if True:
    save_dict(file_texts, "csv/recommendation_web_pages.csv")

### Make excel files

In [None]:
# Build one row per recommendation folder: "old" when it also exists in the
# previous download, "new" when it does not, and "removed" for folders that
# only exist in the previous download. The path segments are added under the
# integer keys 0, 1, 2, ...
# Requires recommendation_files_old, which is defined in the "Make diff"
# section further down.
file_names = []
for file_name in recommendation_files:
    label = "old" if file_name in recommendation_files_old else "new"
    status = {"file_name": file_name, "status": label}
    file_name_parts = file_name.split("/")[1:]
    status.update(dict(enumerate(file_name_parts)))
    file_names.append(status)

for file_name in recommendation_files_old:
    if file_name in recommendation_files:
        continue
    status = {"file_name": file_name, "status": "removed"}
    file_name_parts = file_name.split("/")[1:]
    status.update(dict(enumerate(file_name_parts)))
    file_names.append(status)

In [None]:
# Sort the rows by full path, then drop that column so the csv only holds
# the status and the separate path segments.
df = pd.DataFrame(file_names).sort_values(by=['file_name'])
del df['file_name']
df.to_csv("csv/recommendations.csv", index=False)

In [None]:
# Same report as for the recommendation folders, but per downloaded file.
# Requires file_texts_old, which is defined in the "Make diff" section
# further down.
file_names = []
for file_name in file_texts:
    label = "old" if file_name in file_texts_old else "new"
    status = {"file_name": file_name, "status": label}
    file_name_parts = file_name.split("/")[1:]
    status.update(dict(enumerate(file_name_parts)))
    # Newly appeared index.html files are left out of the report.
    if label == "new" and re.search(r'index.html$', file_name):
        continue
    file_names.append(status)

for file_name in file_texts_old:
    if file_name in file_texts:
        continue
    status = {"file_name": file_name, "status": "removed"}
    file_name_parts = file_name.split("/")[1:]
    status.update(dict(enumerate(file_name_parts)))
    file_names.append(status)

In [None]:
# Sort the per-file rows by full path, drop that column and write the rest.
df = pd.DataFrame(file_names).sort_values(by=['file_name'])
del df['file_name']
df.to_csv("csv/files.csv", index=False)

### Make diff

In [None]:
# Folder with the earlier download that the current one is compared against
# (the suffix presumably is the download date, 2021-03-15).
DATA_DIR_OLD = "../data/richtlijnendatabase.nl-20210315/"

In [None]:
# Merge the recommendations linked from every index page of the old download.
file_names = os.listdir(DATA_DIR_OLD)
recommendations_old = {}
for file_name in file_names:
    # Matches any file whose name contains "index.html", not only the exact
    # name (the "." in the pattern also matches any character).
    if re.search("index.html", file_name):
        index_file_data = read_index_file(DATA_DIR_OLD, index_file_name=file_name)
        # Despite its name, recommendations_new holds the links of one OLD index page.
        recommendations_new = get_recommendations(index_file_data)
        recommendations_old.update(recommendations_new)
print(len(recommendations_old.keys()))

In [None]:
def compare_dicts(dict1, dict2):
    """Return, sorted, the items of *dict1* that are not in *dict2*.

    Both arguments only need to support iteration and the `in` operator; the
    notebook passes dicts (compared by key) as well as lists.
    """
    return sorted(key for key in dict1 if key not in dict2)

In [None]:
# Recommendations linked from the old index pages but not from the current one.
disappeared = compare_dicts(recommendations_old, recommendations)
(len(disappeared), disappeared)

In [None]:
# Recommendations linked from the current index page but not from the old ones.
new = compare_dicts(recommendations, recommendations_old)
(len(new), new)

In [None]:
# Recommendation folders of the old download: the entries of richtlijn/ and
# en/richtlijn/, each prefixed with its site-relative path.
recommendation_files_old = sorted(os.listdir(DATA_DIR_OLD + "richtlijn"))
recommendation_files_old = ["/richtlijn/" + file_name for file_name in recommendation_files_old]
recommendation_files_old_en = sorted(os.listdir(DATA_DIR_OLD + "en/richtlijn"))
recommendation_files_old_en = ["/en/richtlijn/" + file_name for file_name in recommendation_files_old_en]
recommendation_files_old.extend(recommendation_files_old_en)
len(recommendation_files_old)

In [None]:
# Same listing for the current download; repeats the computation of the
# "Check /richtlijnen folder" section so this section can be run on its own.
recommendation_files = sorted(os.listdir(DATA_DIR + "richtlijn"))
recommendation_files = ["/richtlijn/" + file_name for file_name in recommendation_files]
recommendation_files_en = sorted(os.listdir(DATA_DIR + "en/richtlijn"))
recommendation_files_en = ["/en/richtlijn/" + file_name for file_name in recommendation_files_en]
recommendation_files.extend(recommendation_files_en)
len(recommendation_files)

In [None]:
# Recommendation folders present in the old download but missing from the
# current one.
disappeared = compare_dicts(recommendation_files_old, recommendation_files)
(len(disappeared), disappeared)
# Disabled: crawl each disappeared recommendation again from the live site.
#processed_urls = {}
#for file_name in disappeared:
#    get_web_pages(os.path.join(url, re.sub(r'^/', '', file_name)), [r'^' + file_name], processed_urls, debug)

In [None]:
# Recommendation folders present in the current download but not in the old one.
new = compare_dicts(recommendation_files, recommendation_files_old)
(len(new), new)

In [None]:
# Text of every recommendation page in the old download, keyed by site path.
file_texts_old = make_file_texts(DATA_DIR_OLD)
len(file_texts_old)

In [None]:
# Print every file path that is absent from the old download and count how
# often each path length occurs; the cell result shows the ten longest
# lengths with their counts.
lengths = {}
for file_name in file_texts:
    if file_name in file_texts_old:
        continue
    length = len(file_name)
    lengths[length] = lengths.get(length, 0) + 1
    print(file_name)
[(length, lengths[length]) for length in sorted(lengths.keys(), reverse=True)][:10]

## Download html pages (old code)

In [None]:
# Settings for the old download code: site root, csv folder and file names,
# and the path of the first overview page.
BASE_URL = "https://richtlijnendatabase.nl"
CSV_DIR = "csv/"
MAIN_WEB_PAGES_FILE = "main_web_pages.csv"
PAGE1 = "/?page=1"
RECOMMENDATIONS_FILE = "recommendation_web_pages.csv"

In [None]:
# Download the overview pages (/?page=1, /?page=2, ...) and save them as csv.
# The pattern is a raw string so that "\?" and "\d" reach the regex engine
# unchanged instead of being read as (invalid) string escape sequences.
# NOTE(review): the get_web_pages defined earlier in this notebook returns
# None and skips every path containing "?"; this cell presumably relied on an
# older implementation that returned a dict of pages.
main_web_pages = get_web_pages(BASE_URL+PAGE1, patterns=[r"^/\?page=\d+$"])
save_dict(main_web_pages, CSV_DIR+MAIN_WEB_PAGES_FILE)
print(f"number of pages: {len(main_web_pages)}")

In [None]:
# Reuse main_web_pages when an earlier cell defined it; otherwise load the
# saved copy from disk.
try:
    main_web_pages
except NameError:
    print("Reading main_web_pages from disk...")
    main_web_pages = read_dict(CSV_DIR+MAIN_WEB_PAGES_FILE)
# get_recommendation_list is not defined in this notebook; presumably it is
# provided by the przona star import.
recommendation_list = get_recommendation_list(main_web_pages)
print(f"found {len(recommendation_list)} recommendations")

In [None]:
# Load the pages stored so far; their keys count as already processed.
# read_dict and the meaning of spy=True are not visible in this notebook
# (presumably from przona).
web_pages = read_dict(CSV_DIR+RECOMMENDATIONS_FILE, spy=True)
processed_urls = list(web_pages.keys())

In [None]:
# update_recommendations is not defined in this notebook (presumably from
# przona); judging by its arguments it fetches the listed recommendations,
# skipping processed_urls, and updates the recommendations csv. TODO confirm.
update_recommendations(recommendation_list, processed_urls, CSV_DIR+RECOMMENDATIONS_FILE, BASE_URL=BASE_URL)

In [None]:
# Read the recommendations csv again after the update cell above.
web_pages = read_dict(CSV_DIR+RECOMMENDATIONS_FILE, spy=True)

## Derive categories of recommendations

In [None]:
# Settings for the category analysis. BASE_QUERY is not used by any cell of
# this notebook; presumably the przona helpers read it or an equivalent value.
BASE_QUERY = "/?query=&page=1&specialism="
BASE_URL = "https://richtlijnendatabase.nl"
CATEGORIES_FILE = "categories.csv"
CSV_DIR = "csv/"
MAIN_WEB_PAGES_FILE = "main_web_pages.csv"
RECOMMENDATIONS_PER_CATEGORY_FILE = "recommendations_per_category.csv"
PAGE1 = "/?page=1"

In [None]:
# Both results are cached as csv: load the file when it exists, otherwise
# compute the value and save it. get_categories and
# get_recommendations_per_category are not defined in this notebook
# (presumably from przona).
if os.path.isfile(CSV_DIR+CATEGORIES_FILE):
    categories = read_dict(CSV_DIR+CATEGORIES_FILE)
else:
    # The categories are derived from the first overview page.
    main_web_pages = read_dict(CSV_DIR+MAIN_WEB_PAGES_FILE)
    categories = get_categories(main_web_pages[PAGE1])
    save_dict(categories, CSV_DIR+CATEGORIES_FILE)
if os.path.isfile(CSV_DIR+RECOMMENDATIONS_PER_CATEGORY_FILE):
    recommendations_per_category = read_dict(CSV_DIR+RECOMMENDATIONS_PER_CATEGORY_FILE)
else:
    recommendations_per_category = get_recommendations_per_category(categories)
    save_dict(recommendations_per_category, CSV_DIR+RECOMMENDATIONS_PER_CATEGORY_FILE)

In [None]:
# Lift the pandas display limits so the complete table is shown.
pd.set_option('display.max_rows', None)
pd.set_option('display.min_rows', None)
pd.set_option('display.max_columns', None)
# pretty_print is not defined in this notebook (presumably from przona); the
# code below treats its result as a DataFrame indexed by recommendation.
pretty_df = pretty_print(recommendations_per_category, CSV_DIR+"richtlijnen-categorie.csv")
# NOTE(review): web_pages is a dict here, while the get_recommendations
# defined in this notebook parses HTML text; this cell presumably relied on a
# different implementation. Verify before running.
recommendations = get_recommendations(web_pages)
# Add a row of blanks for every recommendation that has no category.
for recommendation in recommendations:
    if recommendation not in pretty_df.index:
        # NOTE(review): DataFrame.append was removed in pandas 2.0; confirm
        # the installed version or switch to pd.concat.
        pretty_df = pretty_df.append(pd.Series({i:' ' for i in pretty_df.iloc[0].index},name=recommendation))
# Shorten the row labels to 30 characters for display.
pretty_df.index = [x[:30] for x in pretty_df.index]
pretty_df

## Summary

In [None]:
# Locations of the csv files read by the summary cells.
CSV_DIR = "csv/"
MAIN_WEB_PAGES_FILE = "main_web_pages.csv"
RECOMMENDATIONS_FILE = "recommendation_web_pages.csv"

In [None]:
# Load the saved overview pages and recommendation pages (read_dict is
# presumably provided by przona).
main_web_pages = read_dict(CSV_DIR+MAIN_WEB_PAGES_FILE)
web_pages = read_dict(CSV_DIR+RECOMMENDATIONS_FILE, spy=True)

In [None]:
# Print the counts used in the summary table below.
# recommendations_per_category must have been set by the "Derive categories
# of recommendations" section; count_suffixes is not defined in this notebook
# (presumably from przona).
print(f"number of main web pages: {len(main_web_pages)}")
print(f"number of categories: {len(recommendations_per_category)}")
print(f"number of recommendations: {len(get_recommendations(web_pages))}")
print(f"number of web pages: {len(web_pages)}")
print(f"file suffixes: {count_suffixes(web_pages)}")

In [None]:
# Show the links of one recommendation whose URL contains url_part, with the
# size of each linked page. get_links is not defined in this notebook
# (presumably from przona); web_pages[href][0] is assumed to hold the page
# content. TODO confirm.
recommendation = "/richtlijn/abdominoplastiek"
url_part = "Algemene"

for href in get_links(web_pages, recommendation, url_part):
    print(f"url: {href}; web page size: {len(web_pages[href][0])}")

## Summary in Dutch

We hebben de webpagina's opgehaald van de website richtlijnendatabase.nl met als startpagina:

https://richtlijnendatabase.nl/?page=1

| Soort             | Aantal | Voorbeeld | Opmerking |
| :---------------- | -----: | :-------- | :-------- |
| hoofdpagina       |     42 | /?page=1 | 1-42 |
| alle webpagina's  | 47.569 | /?page=1 | |
| zonder extensie:  |    417 | /richtlijn/acne | |
| extensie: .html   | 42.190 | /gerelateerde_documenten/bijlage/001094/1/90/Afkortingen.html | |
| extensie: .pdf    |  4.435 | /gerelateerde_documenten/f/21504/Kennisdocument%20-%20Statines.pdf | |
| extensie: .php    |    153 | /richtlijn/item/pagina.php?id=24679&richtlijn_id=480&tab=1 | |
| extensie: .docx   |    135 | /gerelateerde_documenten/f/11337/Vragenlijst.docx | |
| extensie: .pptx   |    129 | /gerelateerde_documenten/f/19293/presentatie%20richtlijn%20DCD.pptx | |
| overige extensies |    110 | /gerelateerde_documenten/f/3691/Interventiegrenzen.doc | .doc:35 .htm:1 .jpg:16 .PNG: 1 .png:23 .ppt:14 .tif:6 .xls:3 .xlsx:11 |
| richtlijnen       |    418 | /richtlijn/acne | |
| categorieën       |     51 | /?query=&page=1&specialism=61 | 2-64 (missen: 10 20 23 24 26 32 35 38 39 41 42 48) |

Onder de richtlijnen bleken identieke bijlagen gelinkt te zijn onder verschillende namen. Bijvoorbeeld, op de veertien pagina's voor "/richtlijn/abdominoplastiek" werd de bijlage Algemene inleiding.html veertien keer gelinkt vanuit dertien verschillende folders:

* /gerelateerde_documenten/bijlage/015124/1/60/Algemene%20inleiding.html
* /gerelateerde_documenten/bijlage/018015/1/60/Algemene%20inleiding.html
* /gerelateerde_documenten/bijlage/015130/1/60/Algemene%20inleiding.html
* /gerelateerde_documenten/bijlage/018017/1/60/Algemene%20inleiding.html
* /gerelateerde_documenten/bijlage/018010/1/60/Algemene%20inleiding.html
* /gerelateerde_documenten/bijlage/015128/1/60/Algemene%20inleiding.html
* /gerelateerde_documenten/bijlage/015131/1/60/Algemene%20inleiding.html
* /gerelateerde_documenten/bijlage/015129/1/60/Algemene%20inleiding.html
* /gerelateerde_documenten/bijlage/015125/1/60/Algemene%20inleiding.html
* /gerelateerde_documenten/bijlage/015126/1/60/Algemene%20inleiding.html
* /gerelateerde_documenten/bijlage/015127/1/60/Algemene%20inleiding.html
* /gerelateerde_documenten/bijlage/018016/1/60/Algemene%20inleiding.html
* /gerelateerde_documenten/bijlage/015132/1/60/Algemene%20inleiding.html

Uit een analyse kwam naar voren dat het hier om twee verschillende documenten ging die konden worden onderscheiden op basis van de eerste drie cijfers in het eerste getal in het webadres (015 vs 018). Omdat we niet aan het webadres konden zien welke documenten uniek waren en welke duplicaten, hebben we alle webdocumenten opgehaald.  

## Wget download check

In [None]:
# Settings for checking the wget download: site root, csv location, and the
# script that is run to fetch a single URL.
BASE_URL = "https://richtlijnendatabase.nl"
CSV_DIR = "csv/"
RECOMMENDATIONS_FILE = "recommendation_web_pages.csv"
WGET = "../data/wget.sh"

In [None]:
# Pages collected by the old download code; the wget checks below iterate
# over web_pages and use each item as a site path.
web_pages = read_dict(CSV_DIR+RECOMMENDATIONS_FILE, spy=True)

In [None]:
BASE_DIR = "../data/richtlijnendatabase.nl"

def find_missing_files():
    """List the pages in the global web_pages that wget did not store.

    A page counts as present when BASE_DIR plus its unquoted path is a file
    or a folder, or when that path cut off after 244 characters is a file
    (presumably a name shortened by wget).

    Returns:
        A pair (urls, missing_recommendations): the original URLs of the
        missing pages with spaces and "ë" percent-encoded, and the unquoted
        paths of those that match "^/richtlijn/[^.]*$".
    """
    urls = []
    missing_recommendations = []
    for original_url in web_pages:
        # Undo the percent-encoding, but keep tabs encoded as %09.
        url = re.sub(r'\t', '%09', urllib.parse.unquote(original_url))
        local_path = BASE_DIR+url
        if os.path.isfile(local_path) or os.path.isdir(local_path) or os.path.isfile(local_path[:244]):
            continue
        if re.search("^/richtlijn/[^.]*$", url):
            missing_recommendations.append(url)
        escaped_url = re.sub(" ", "%20", original_url)
        escaped_url = re.sub("ë", "%C3%AB", escaped_url)
        urls.append(escaped_url)
    return(urls, missing_recommendations)

urls, missing_recommendations = find_missing_files()
print(f"missing urls: {len(urls)}, of which missing recommendations: {len(missing_recommendations)}: {missing_recommendations}")

In [None]:
LOGFILE = "../data/logfile"

def find_cause_of_missing_files(urls):
    """Look up the log lines that belong to the missing URLs.

    The log file is scanned line by line. A line containing BASE_URL starts a
    new entry; when the text after BASE_URL is one of *urls*, that line is
    stored. Every following line starting with "HTTP" is put in front of the
    stored text, until the next line containing BASE_URL.

    Args:
        urls: site paths of the missing pages.

    Returns:
        Dict mapping each URL found in the log to its collected log text.
    """
    # Set lookup instead of scanning the list once per log line.
    wanted = set(urls)
    msgs = {}
    url = ""
    tracking = False
    # The context manager closes the log file even when reading fails.
    with open(LOGFILE, "r") as logfile:
        for line in logfile:
            if re.search(BASE_URL, line):
                url = re.sub(".*"+BASE_URL, "", line.strip())
                tracking = url in wanted
                if tracking:
                    msgs[url] = line.strip()
                else:
                    url = ""
            elif tracking and re.search("^HTTP", line):
                msgs[url] = line.strip()+" "+msgs[url]
    return(msgs)

msgs = find_cause_of_missing_files(urls)
print("missing urls in logfile:",len(msgs.keys()))

In [None]:
def count_cause_of_missing_files(msgs, fetch_web_pages = False):
    """Count why pages are missing, based on their log messages.

    Messages that mention neither "404 Not Found" nor
    "500 Internal Server Error" are printed and counted as "other".

    Args:
        msgs: dict mapping site paths to log text.
        fetch_web_pages: when true, each "other" page is fetched again by
            running the WGET script on BASE_URL plus the path, after a
            two-second pause.

    Returns:
        Tuple (number of 404 errors, number of 500 errors, number of others).
    """
    # Local import: shlex is not among the notebook's imports.
    import shlex

    counter_404_error = 0
    counter_500_error = 0

    for url, msg in msgs.items():
        if re.search("404 Not Found", msg):
            counter_404_error += 1
        elif re.search("500 Internal Server Error", msg):
            counter_500_error += 1
        else:
            print(msg)
            if fetch_web_pages:
                print("fetching", BASE_URL+url)
                time.sleep(2)
                # The URL comes from scraped data: quote it for the shell so
                # characters such as ", $ or ` cannot alter the command.
                os.system(WGET+" "+shlex.quote(BASE_URL+url))

    counter_other = len(msgs)-counter_404_error-counter_500_error
    return(counter_404_error, counter_500_error, counter_other)

# Report the counts. The 500 label matches what is actually counted, namely
# "500 Internal Server Error" responses (the earlier text said "access denied").
counter_404_error, counter_500_error, counter_other = count_cause_of_missing_files(msgs)
print(f"cause of missing files: 404 error (file not found): {counter_404_error}; 500 error (internal server error): {counter_500_error}; other: {counter_other}")

In [None]:
# When true, the call at the end of this cell downloads every missing page
# that has no entry in the log file.
fetch_web_pages = True

def find_missing_files_without_cause(urls, msgs, fetch_web_pages = False):
    """Return the missing URLs that have no entry in *msgs*.

    Args:
        urls: site paths of the missing pages.
        msgs: dict of log messages keyed by site path.
        fetch_web_pages: when true, each returned page is also fetched by
            running the WGET script on BASE_URL plus the path, after a
            two-second pause.

    Returns:
        List of the URLs from *urls* that are not keys of *msgs*, in their
        original order.
    """
    # Local import: shlex is not among the notebook's imports.
    import shlex

    list_without_cause = []
    for url in urls:
        if url in msgs:
            continue
        list_without_cause.append(url)
        if fetch_web_pages:
            print("fetching", BASE_URL+url)
            time.sleep(2)
            # The URL comes from scraped data: quote it for the shell so
            # characters such as ", $ or ` cannot alter the command.
            os.system(WGET+" "+shlex.quote(BASE_URL+url))
    return(list_without_cause)

# Runs the check (and, with fetch_web_pages true, the downloads) and reports
# how many URLs had no log entry.
print(f"Processed: {len(find_missing_files_without_cause(urls, msgs, fetch_web_pages=fetch_web_pages))} urls")

In [None]:
def rename_web_pages(web_pages, rename_web_pages = False):
    """Find, and optionally rename, files stored under a cut-off name.

    A page qualifies when BASE_DIR plus its unquoted path is neither a file
    nor a folder, while that same path cut off after 244 characters is a
    file. The number of such pages is printed.

    Args:
        web_pages: iterable of site paths (percent-encoded).
        rename_web_pages: when true, each cut-off file is renamed, within its
            folder, to the last path segment of the unquoted URL. A failing
            rename is reported on stdout and does not stop the loop.
    """
    counter = 0
    for original_url in web_pages:
        # Undo the percent-encoding, but keep tabs encoded as %09.
        url = urllib.parse.unquote(original_url)
        url = re.sub(r'\t', '%09', url)
        full_path = BASE_DIR+url
        clipped_path = full_path[:244]
        if os.path.isfile(full_path) or os.path.isdir(full_path) or not os.path.isfile(clipped_path):
            continue
        counter += 1
        if rename_web_pages:
            directory = clipped_path.rsplit("/", 1)[0]
            file_name_complete = url.split("/")[-1]
            # os.rename instead of a shell "mv": file names with spaces or
            # shell metacharacters are handled literally.
            try:
                os.rename(clipped_path, f"{directory}/{file_name_complete}")
            except OSError as error:
                print(f"could not rename {clipped_path}: {error}")
    print(f"found {counter} relevant webpages")
                
# Dry run: only counts the cut-off files, because the rename flag keeps its
# default of False.
rename_web_pages(web_pages)