# NHG

In [155]:
from bs4 import BeautifulSoup
import csv
import json
from nltk import word_tokenize
from nltk.corpus import stopwords
import os
import przona
import re

In [7]:
BASE_DIR = "../data/richtlijnen.nhg.org/"

In [124]:
def get_standards():
    """Return the sorted names of all standards that have a ``print`` file.

    Scans the ``standaarden`` folder below ``BASE_DIR`` and keeps every entry
    that is a directory containing a regular file named ``print``.
    """
    root = BASE_DIR + "standaarden"
    return [
        entry
        for entry in sorted(os.listdir(root))
        if os.path.isdir(os.path.join(root, entry))
        and os.path.isfile(os.path.join(root, entry, "print"))
    ]

def get_omissions(dir_name):
    """Return the sorted names of the regular files in ``BASE_DIR + dir_name``.

    Sub-directories and other non-file entries are left out.
    """
    folder = BASE_DIR + dir_name
    return [
        entry
        for entry in sorted(os.listdir(folder))
        if os.path.isfile(os.path.join(folder, entry))
    ]

In [119]:
def get_paragraphs(file_names, prefix):
    """Extract the innermost paragraph texts from downloaded HTML pages.

    Args:
        file_names: names of the entries to read below ``BASE_DIR/<prefix>/``.
        prefix: sub-folder of ``BASE_DIR``; for ``"standaarden"`` every entry
            is a directory and its ``print`` file is the one that is parsed.

    Returns:
        A dict mapping ``"/<prefix>/<file_name>"`` to the list of texts of all
        selected ``div`` elements that do not contain another selected ``div``.
    """
    base_dir = BASE_DIR + prefix + "/"
    # NOTE(review): this is a set, not a dict. BeautifulSoup appears to treat a
    # non-dict ``attrs`` as a filter on the CSS class -- confirm in the bs4 docs.
    class_filter = {"paragraph", "text-formatted"}
    paragraphs = {}
    for file_name in file_names:
        path = base_dir + file_name
        if prefix == "standaarden":
            path += "/print"
        # The context manager closes the file even when reading raises.
        with open(path) as infile:
            text = infile.read()
        # NOTE(review): no parser is named here, so the parser in use depends
        # on what is installed -- consider pinning one after checking results.
        soup = BeautifulSoup(text)
        paragraphs["/" + prefix + "/" + file_name] = [
            tag.text
            for tag in soup.findAll("div", attrs=class_filter)
            # Keep leaf blocks only, so nested text is not reported twice.
            if len(tag.findAll("div", attrs=class_filter)) == 0
        ]
    return paragraphs

In [132]:
# List all standards that have a "print" file and extract their paragraphs.
standards = get_standards()
paragraphs_standards = get_paragraphs(standards, prefix="standaarden")

In [130]:
# Same extraction for the files in the "lacunes" and "onderzoeken" folders.
omissions = get_omissions("lacunes")
paragraphs_omissions = get_paragraphs(omissions, prefix="lacunes")
studies = get_omissions("onderzoeken")
paragraphs_studies = get_paragraphs(studies, prefix="onderzoeken")

In [133]:
# Sanity check: per source, the number of names and the number of parsed entries.
len(standards), len(paragraphs_standards), len(omissions), len(paragraphs_omissions), len(studies), len(paragraphs_studies)

(91, 91, 538, 538, 302, 302)

In [138]:
# Merge the three sources into one dict; the keys have the form
# "/<prefix>/<file_name>", so entries from different folders do not collide.
paragraphs = dict(paragraphs_standards)
paragraphs.update(dict(paragraphs_omissions))
paragraphs.update(dict(paragraphs_studies))
# Displayed as the cell result: total number of documents.
len(paragraphs)

931

In [139]:
# Store the merged paragraphs on disk via the przona helper.
# NOTE(review): presumably written as CSV, judging by the file name -- confirm in przona.
przona.save_dict(paragraphs, "csv/paragraphs_nhg.csv")

In [140]:
# Reload the paragraphs from the file written above, replacing the in-memory dict.
# NOTE(review): assumes read_dict restores the same dict-of-lists structure -- confirm in przona.
paragraphs = przona.read_dict("csv/paragraphs_nhg.csv")

In [56]:
KEYWORDS_FILE = "../data/210119 Digitale zorg - sleutelwoorden en combinaties.csv"


def get_keywords(keywords_file):
    """Read the search keywords from a CSV file.

    The second column of each row decides what is taken from the row:
    ``"ja"`` takes the first column as keyword, ``"nee"`` takes every
    non-empty cell from the third column onwards instead. Rows with any other
    value, and rows with fewer than two columns (for example blank lines),
    contribute nothing.

    Args:
        keywords_file: path of the CSV file to read.

    Returns:
        The keywords in file order, stripped of surrounding whitespace.
    """
    keywords = []
    # newline="" is what the csv module asks for; the context manager closes
    # the file even when parsing raises.
    with open(keywords_file, "r", newline="") as infile:
        for row in csv.reader(infile):
            if len(row) < 2:
                # Blank or truncated row: there is no flag column to inspect.
                continue
            flag = row[1].strip()
            if flag == "ja":
                keywords.append(row[0].strip())
            elif flag == "nee":
                keywords.extend(cell.strip() for cell in row[2:] if cell.strip() != "")
    return keywords

def make_new_keywords(prefix, suffix_list):
    """Build every spelling variant of a multi-word keyword.

    The words ``prefix`` followed by ``suffix_list`` are glued together with
    each of the three joiners: nothing, a space and a hyphen. With more than
    one suffix, every joiner combination is produced.

    Returns:
        The list of variants; empty when ``suffix_list`` is empty.
    """
    words = list(suffix_list)
    if not words:
        return []
    # Work from the last word towards the prefix, widening the variants
    # by one word (and three joiners) per step.
    variants = [words[-1]]
    for head in reversed([prefix] + words[:-1]):
        variants = [head + joiner + tail for tail in variants for joiner in ("", " ", "-")]
    return variants

def expand_keywords(keywords):
    """Return the keywords plus all spelling variants of multi-word keywords.

    A fixed list of extra two- and three-word terms is added first. Every
    keyword is stripped and lower-cased, and each keyword of more than one
    word is expanded with ``make_new_keywords``.

    Args:
        keywords: iterable of keyword strings; it is not modified.

    Returns:
        A sorted list of the unique keywords and variants.
    """
    extra_keywords = ["e health", "e health toepassing", "e learning", "m health",
                      "tele begeleiding", "tele consultatie", "tele health", "tele medicine", "tele monitoring"]
    # Work on a copy so the caller's list is left untouched.
    normalized = [keyword.strip().lower() for keyword in list(keywords) + extra_keywords]
    # A set removes duplicates directly, without scanning lists for every variant.
    expanded = set(normalized)
    for keyword in normalized:
        keyword_parts = keyword.split()
        if len(keyword_parts) > 1:
            expanded.update(make_new_keywords(keyword_parts[0], keyword_parts[1:]))
    return sorted(expanded)

# Displayed as the cell result: number of keywords read from the file, before expansion.
len(get_keywords(KEYWORDS_FILE))

111

In [58]:
# Read the keywords and add the spelling variants of the multi-word ones.
keywords = expand_keywords(get_keywords(KEYWORDS_FILE))
# Displayed as the cell result: number of keywords after expansion.
len(keywords)

333

In [163]:
# Dutch labels used as dictionary keys in summary_data; because that structure
# is dumped to JSON and HTML, they also appear verbatim in the reports.
RECOMMENDATION = "richtlijn"  # "guideline"
RECOMMENDATIONS = "richtlijnen"  # "guidelines"
KEYWORDS = "sleuteltermen"  # "key terms"
TERM_COUNT = "aantal gevonden sleuteltermen"  # "number of key terms found"
TOKEN_COUNT = "aantal tokens"  # "number of tokens"
TYPE_COUNT = "aantal types"  # "number of types"
DOCUMENTS = "documenten"  # "documents"
DOCUMENT = "document"  # "document"
PARAGRAPH = "paragraaf"  # "paragraph"
PARAGRAPHS = "paragrafen"  # "paragraphs"
BESTPARAGRAPH = "exemplarische paragraaf"  # "exemplary paragraph"

In [141]:
# Paragraphs with fewer words than this are not searched for keywords.
MIN_NBR_OF_WORDS = 10


def count_words(text):
    """Return the number of whitespace-separated words in *text*."""
    words = text.split()
    return len(words)


def remove_duplicates(matches, queries):
    """Remove matches that are a proper prefix of a longer match, in place.

    Each longer match removes at most one shorter match that it starts with,
    and each shorter match is removed at most once. The entries at the same
    positions are removed from ``queries`` as well.

    Returns:
        The tuple ``(matches, queries)``: the same, now shortened, list objects.
    """
    absorbed = set()  # positions of the prefix matches that will be deleted
    absorbers = set()  # positions of the longer matches that already took one
    for whole_pos, whole in enumerate(matches):
        for part_pos, part in enumerate(matches):
            if whole_pos in absorbers or part_pos in absorbed:
                continue
            if len(whole) > len(part) and whole.startswith(part):
                absorbed.add(part_pos)
                absorbers.add(whole_pos)
    # Delete from the back so the remaining positions stay valid.
    for pos in sorted(absorbed, reverse=True):
        del matches[pos]
        del queries[pos]
    return matches, queries


def _keyword_to_pattern(keyword):
    """Return the regular expression used to search for one keyword."""
    if keyword in ("app", "apps"):
        # Whole-word match only, so the term is not found inside longer words.
        return r'\b' + keyword + r'\b'
    if re.search("^[em] ", keyword):
        # Terms starting with a separate "e" or "m" must start at a word boundary.
        return r'\b' + keyword
    return keyword


def find_matching_paragraphs(keywords, paragraphs):
    """Find all sufficiently long paragraphs that contain a keyword.

    Each paragraph of at least ``MIN_NBR_OF_WORDS`` words is searched,
    together with its url, for every keyword (case-insensitively). Prefix
    matches covered by a longer match are removed with ``remove_duplicates``.

    Args:
        keywords: keyword strings. They are used as regular expressions
            without escaping, so regex metacharacters keep their meaning.
        paragraphs: dict mapping a url to its list of paragraph texts.

    Returns:
        A list with one dict per matching paragraph, with the keys ``"url"``,
        ``"matches"`` (matched texts), ``"queries"`` (the pattern behind each
        match) and ``"paragraph"``.
    """
    # The patterns do not depend on the paragraph, so build them only once.
    patterns = [_keyword_to_pattern(keyword) for keyword in keywords]
    matching_paragraphs = []
    for counter, url in enumerate(paragraphs, start=1):
        for paragraph in paragraphs[url]:
            if count_words(paragraph) < MIN_NBR_OF_WORDS:
                continue
            searched_text = url + " " + paragraph
            matches = []
            queries = []
            for pattern in patterns:
                found = re.findall(pattern, searched_text, flags=re.IGNORECASE)
                matches.extend(found)
                queries.extend([pattern] * len(found))
            if matches:
                matches, queries = remove_duplicates(matches, queries)
                matching_paragraphs.append({"url": url, "matches": matches, "queries": queries, "paragraph": paragraph})
        przona.squeal(f"documents: {counter}; matches: {len(matching_paragraphs)}")
    return matching_paragraphs

# Search every document for the expanded keywords.
matching_paragraphs = find_matching_paragraphs(keywords, paragraphs)

documents: 931; matches: 77


In [142]:
def make_bold(text_in, queries, matches, keyword=None):
    """Wrap every occurrence of the matched patterns in ``<strong>`` tags.

    Args:
        text_in: the text to mark up.
        queries: regular expressions, parallel to ``matches``.
        matches: the texts that the queries matched.
        keyword: when given, only the matches whose normalized form equals
            the normalized keyword are marked.

    Returns:
        The marked-up text. Patterns are applied once each, ordered by the
        length of their match, longest first. A warning is printed when one
        pattern is paired with two different matches (ignoring case).
    """
    if keyword is None:
        selected_matches = matches
        selected_queries = queries
    else:
        target = normalize_keyword(keyword)
        kept = [pos for pos in range(0, len(matches)) if normalize_keyword(matches[pos]) == target]
        selected_matches = [matches[pos] for pos in kept]
        selected_queries = [queries[pos] for pos in kept]
    order = sorted(range(0, len(selected_matches)), key=lambda pos: len(selected_matches[pos]), reverse=True)
    ranked = [(selected_queries[pos], selected_matches[pos]) for pos in order]
    text_out = text_in
    first_match_of = {}
    for query, match in ranked:
        lowered = match.lower()
        if query not in first_match_of:
            text_out = re.sub("(" + query + ")", "<strong>" + r'\1' + "</strong>", text_out, flags=re.IGNORECASE)
            first_match_of[query] = lowered
        elif first_match_of[query] != lowered:
            print(f"warning: make_bold: replacement problem: {first_match_of[query]} vs {match}")
    return text_out


def update_best_paragraph(best_paragraph, new_paragraph, recommendation, file_name, keyword=None):
    """Replace the stored best paragraph when the new one has more matches.

    Args:
        best_paragraph: dict with the best paragraph so far; may be empty.
        new_paragraph: dict with the keys ``"matches"``, ``"queries"`` and
            ``"paragraph"``.
        recommendation: name of the recommendation the paragraph belongs to.
        file_name: name of the document the paragraph belongs to.
        keyword: when given, only matches exactly equal to it are counted.

    Returns:
        ``best_paragraph``, updated in place when it was empty or when the
        new paragraph has a strictly higher count.
    """
    if keyword is None:
        matches = new_paragraph["matches"]
    else:
        matches = [match for match in new_paragraph["matches"] if match == keyword]
    count = len(matches)
    if TERM_COUNT in best_paragraph and count <= best_paragraph[TERM_COUNT]:
        # The stored paragraph is at least as good; keep it.
        return best_paragraph

    def highlight(text):
        return make_bold(text, new_paragraph["queries"], new_paragraph["matches"], keyword=keyword)

    best_paragraph[TERM_COUNT] = count
    best_paragraph[RECOMMENDATION] = highlight(recommendation)
    best_paragraph[DOCUMENT] = highlight(file_name)
    best_paragraph[PARAGRAPH] = highlight(new_paragraph["paragraph"])
    best_paragraph[KEYWORDS] = " | ".join(matches)
    return best_paragraph


def normalize_keyword(keyword):
    """Return *keyword* in lower case with all spaces and hyphens removed."""
    lowered = keyword.lower()
    return lowered.replace(" ", "").replace("-", "")


def sort_and_label(dictionary):
    """Return a copy ordered by descending term count, with labelled keys.

    Every value must be a dict holding a ``TERM_COUNT`` entry. Each key is
    replaced by ``"(<count>) <key>"``; the values themselves are reused.
    """
    ranked_keys = sorted(dictionary, key=lambda k: dictionary[k][TERM_COUNT], reverse=True)
    labelled = {}
    for key in ranked_keys:
        entry = dictionary[key]
        labelled[f"({entry[TERM_COUNT]}) {key}"] = entry
    return labelled


def make_paragraph_number(number=None):
    """Return 0 when called without a number, otherwise ``number + 1``."""
    return 0 if number is None else number + 1

In [143]:
# Displayed as the cell result: number of paragraphs with at least one keyword match.
len(matching_paragraphs)

77

In [172]:
# Build the nested summary of all keyword matches. Top level: overall counts,
# the paragraph with the most matches, and two parallel indexes of the same
# matches -- one by recommendation and one by normalized keyword.
summary_data = { TERM_COUNT: 0, TOKEN_COUNT:0, TYPE_COUNT:0, BESTPARAGRAPH: {}, RECOMMENDATIONS: {}, KEYWORDS: {} }
# Running paragraph number; it is increased once per matching paragraph, so the
# "PAR<n>" keys used below are unique over the whole summary.
last_par = make_paragraph_number()
for paragraph_data in matching_paragraphs:
    url = paragraph_data["url"]
    # Urls with more than three "/"-separated parts are split into a
    # recommendation (third part) and a file name (the rest); otherwise the
    # whole url is the recommendation and the file name is empty.
    if len(url.split("/")) > 3:
        recommendation = url.split("/")[2]
        filename = "/".join(url.split("/")[3:])
    else:
        recommendation = url
        filename = ""
    # Url plus paragraph text with all matches of this paragraph highlighted.
    paragraph = make_bold(url+" "+paragraph_data["paragraph"], paragraph_data["queries"], paragraph_data["matches"])
    count_all_matches = len(paragraph_data["matches"])
    summary_data[TERM_COUNT] += count_all_matches
    summary_data[BESTPARAGRAPH] = update_best_paragraph(summary_data[BESTPARAGRAPH], paragraph_data, recommendation, filename)

    # Index by recommendation -> document -> paragraph.
    if recommendation not in summary_data[RECOMMENDATIONS]:
        summary_data[RECOMMENDATIONS][recommendation] = { TERM_COUNT: 0,
                                                          TOKEN_COUNT: 0,
                                                          TYPE_COUNT: 0,
                                                          BESTPARAGRAPH: {},
                                                          KEYWORDS: {},
                                                          DOCUMENTS: {}
                                                        }
    summary_data[RECOMMENDATIONS][recommendation][TERM_COUNT] += count_all_matches
    summary_data[RECOMMENDATIONS][recommendation][BESTPARAGRAPH] = \
        update_best_paragraph(summary_data[RECOMMENDATIONS][recommendation][BESTPARAGRAPH], paragraph_data, recommendation, filename)
    if filename not in summary_data[RECOMMENDATIONS][recommendation][DOCUMENTS]:
        summary_data[RECOMMENDATIONS][recommendation][DOCUMENTS][filename] = { TERM_COUNT: 0, TOKEN_COUNT: 0, TYPE_COUNT: 0, PARAGRAPHS: {} }
    summary_data[RECOMMENDATIONS][recommendation][DOCUMENTS][filename][TERM_COUNT] += count_all_matches
    last_par = make_paragraph_number(last_par)
    summary_data[RECOMMENDATIONS][recommendation][DOCUMENTS][filename][PARAGRAPHS]["PAR"+str(last_par)] = \
        { TERM_COUNT: count_all_matches, PARAGRAPH: paragraph }

    # Per-keyword bookkeeping: each distinct query of this paragraph is handled
    # once, using the matched text found at the query's first position.
    seen = {}
    for i in range(0, len(paragraph_data["queries"])):
        if paragraph_data["queries"][i] in seen:
            continue
        seen[paragraph_data["queries"][i]] = True
        keyword = paragraph_data["matches"][i]
        # Count only the matches that are exactly equal (case-sensitive) to this text.
        matches = [k for k in paragraph_data["matches"] if k == keyword]
        count = len(matches)
        # Spelling variants (case, spaces, hyphens) share one normalized key.
        normalized_keyword = normalize_keyword(keyword)
        # Paragraph text with only this keyword highlighted.
        paragraph = make_bold(paragraph_data["paragraph"], 
                              [ paragraph_data["queries"][i] 
                                               for i in range(0, len(paragraph_data["matches"])) 
                                               if paragraph_data["matches"][i] == keyword ], 
                              [ paragraph_data["matches"][i] 
                                               for i in range(0, len(paragraph_data["matches"])) 
                                               if paragraph_data["matches"][i] == keyword ])

        # Index by keyword -> recommendation -> document -> paragraph.
        if normalized_keyword not in summary_data[KEYWORDS]:
            summary_data[KEYWORDS][normalized_keyword] = { TERM_COUNT: 0,
                                                           BESTPARAGRAPH: {},
                                                           RECOMMENDATIONS: {}
                                                         }
        summary_data[KEYWORDS][normalized_keyword][TERM_COUNT] += count
        summary_data[KEYWORDS][normalized_keyword][BESTPARAGRAPH] = \
            update_best_paragraph(summary_data[KEYWORDS][normalized_keyword][BESTPARAGRAPH], paragraph_data, recommendation, filename, keyword=keyword)
        if recommendation not in summary_data[KEYWORDS][normalized_keyword][RECOMMENDATIONS]:
            summary_data[KEYWORDS][normalized_keyword][RECOMMENDATIONS][recommendation] = { TERM_COUNT: 0, TOKEN_COUNT: 0, TYPE_COUNT: 0, DOCUMENTS: {} }
        summary_data[KEYWORDS][normalized_keyword][RECOMMENDATIONS][recommendation][TERM_COUNT] += count
        if filename not in summary_data[KEYWORDS][normalized_keyword][RECOMMENDATIONS][recommendation][DOCUMENTS]:
            summary_data[KEYWORDS][normalized_keyword][RECOMMENDATIONS][recommendation][DOCUMENTS][filename] = { TERM_COUNT: 0, TOKEN_COUNT: 0, TYPE_COUNT: 0, PARAGRAPHS: {} }
        summary_data[KEYWORDS][normalized_keyword][RECOMMENDATIONS][recommendation][DOCUMENTS][filename][TERM_COUNT] += count
        summary_data[KEYWORDS][normalized_keyword][RECOMMENDATIONS][recommendation][DOCUMENTS][filename][PARAGRAPHS]["PAR"+str(last_par)] = \
            { TERM_COUNT: count, DOCUMENT: make_bold(filename, paragraph_data["queries"], paragraph_data["matches"], keyword), PARAGRAPH: paragraph }

        # The same data again under recommendation -> keyword -> document -> paragraph.
        if normalized_keyword not in summary_data[RECOMMENDATIONS][recommendation][KEYWORDS]:
            summary_data[RECOMMENDATIONS][recommendation][KEYWORDS][normalized_keyword] = { TERM_COUNT: 0, DOCUMENTS: {} }
        summary_data[RECOMMENDATIONS][recommendation][KEYWORDS][normalized_keyword][TERM_COUNT] += count
        if filename not in summary_data[RECOMMENDATIONS][recommendation][KEYWORDS][normalized_keyword][DOCUMENTS]:
            summary_data[RECOMMENDATIONS][recommendation][KEYWORDS][normalized_keyword][DOCUMENTS][filename] = { TERM_COUNT: 0, TOKEN_COUNT: 0, TYPE_COUNT: 0, PARAGRAPHS: {} }
        summary_data[RECOMMENDATIONS][recommendation][KEYWORDS][normalized_keyword][DOCUMENTS][filename][TERM_COUNT] += count
        summary_data[RECOMMENDATIONS][recommendation][KEYWORDS][normalized_keyword][DOCUMENTS][filename][PARAGRAPHS]["PAR"+str(last_par)] = \
            { TERM_COUNT: count, DOCUMENT: make_bold(filename, paragraph_data["queries"], paragraph_data["matches"], keyword), PARAGRAPH: paragraph }

In [173]:
def cleanup_paragraph(paragraph_in):
    """Return the content words of a paragraph, lower-cased and space-joined.

    A whitespace-separated token is kept when it consists only of hyphens,
    apostrophes and the letters accepted by the pattern below, and when it is
    not in NLTK's Dutch stop word list.
    """
    # A set makes the per-token stop word test constant-time instead of a list scan.
    stop_words = set(stopwords.words('dutch'))
    return " ".join(
        token
        for token in paragraph_in.lower().split()
        if re.search(r"^[-a-zÞ-ÿ']+$", token) and token not in stop_words
    )

In [174]:
# Count tokens and types (distinct tokens) of the cleaned-up text of every
# document and copy the numbers into the matching entries of summary_data.
counter = 0
token_set_all = set()
recommendation_data = {}
for url in paragraphs:
    # NOTE(review): always-true condition; looks like a leftover from a removed filter.
    if True:
        # Progress is reported before the increment, so the last value shown is
        # the number of documents minus one.
        przona.squeal(counter)
        counter += 1
        token_count = 0
        token_set = set()
        for paragraph in paragraphs[url]:
            paragraph = cleanup_paragraph(paragraph)
            token_count += len(paragraph.split())
            token_set = token_set.union(set(paragraph.lower().split()))
        summary_data[TOKEN_COUNT] += token_count
        token_set_all = token_set_all.union(token_set)
        # Same url split as used when the summary was built.
        if len(url.split("/")) > 3:
            recommendation = url.split("/")[2]
            filename = "/".join(url.split("/")[3:])
        else:
            recommendation = url
            filename = ""
        if recommendation not in recommendation_data:
            recommendation_data[recommendation] = { TOKEN_COUNT: 0, TYPE_COUNT: 0 }
        recommendation_data[recommendation][TOKEN_COUNT] += token_count
        # NOTE(review): type counts of documents are added up here, so a type
        # shared by two documents of one recommendation would be counted twice.
        recommendation_data[recommendation][TYPE_COUNT] += len(token_set)
        # Document-level counts are only stored for documents that had matches.
        if recommendation in summary_data[RECOMMENDATIONS]:
            if filename in summary_data[RECOMMENDATIONS][recommendation][DOCUMENTS]:
                summary_data[RECOMMENDATIONS][recommendation][DOCUMENTS][filename][TOKEN_COUNT] = token_count
                summary_data[RECOMMENDATIONS][recommendation][DOCUMENTS][filename][TYPE_COUNT] = len(token_set)
                for keyword in summary_data[KEYWORDS].keys():
                    if recommendation in summary_data[KEYWORDS][keyword][RECOMMENDATIONS]:
                        if filename in summary_data[KEYWORDS][keyword][RECOMMENDATIONS][recommendation][DOCUMENTS]:
                            summary_data[KEYWORDS][keyword][RECOMMENDATIONS][recommendation][DOCUMENTS][filename][TOKEN_COUNT] = token_count
                            summary_data[KEYWORDS][keyword][RECOMMENDATIONS][recommendation][DOCUMENTS][filename][TYPE_COUNT] = len(token_set)
# The overall type count is the size of the union over all documents.
summary_data[TYPE_COUNT] += len(token_set_all)
# Copy the per-recommendation totals into both indexes of the summary.
for recommendation in recommendation_data:
    if recommendation in summary_data[RECOMMENDATIONS]:
        summary_data[RECOMMENDATIONS][recommendation][TOKEN_COUNT] = recommendation_data[recommendation][TOKEN_COUNT]
        summary_data[RECOMMENDATIONS][recommendation][TYPE_COUNT] = recommendation_data[recommendation][TYPE_COUNT]
    for keyword in summary_data[KEYWORDS].keys():
        if recommendation in summary_data[KEYWORDS][keyword][RECOMMENDATIONS]:
            summary_data[KEYWORDS][keyword][RECOMMENDATIONS][recommendation][TOKEN_COUNT] = recommendation_data[recommendation][TOKEN_COUNT]
            summary_data[KEYWORDS][keyword][RECOMMENDATIONS][recommendation][TYPE_COUNT] = recommendation_data[recommendation][TYPE_COUNT]

930


In [175]:
# Order every level of the summary by descending term count and put the count
# in front of each key. Each level is relabelled before it is descended into,
# so the loops below iterate over the new, labelled keys.
summary_data[RECOMMENDATIONS] = sort_and_label(summary_data[RECOMMENDATIONS])
summary_data[KEYWORDS] = sort_and_label(summary_data[KEYWORDS])

# Index by recommendation: keyword -> document -> paragraph, and document -> paragraph.
for recommendation in summary_data[RECOMMENDATIONS]:
    recommendation_entry = summary_data[RECOMMENDATIONS][recommendation]
    recommendation_entry[KEYWORDS] = sort_and_label(recommendation_entry[KEYWORDS])
    for keyword in recommendation_entry[KEYWORDS]:
        keyword_entry = recommendation_entry[KEYWORDS][keyword]
        keyword_entry[DOCUMENTS] = sort_and_label(keyword_entry[DOCUMENTS])
        for url in keyword_entry[DOCUMENTS]:
            document_entry = keyword_entry[DOCUMENTS][url]
            document_entry[PARAGRAPHS] = sort_and_label(document_entry[PARAGRAPHS])
    recommendation_entry[DOCUMENTS] = sort_and_label(recommendation_entry[DOCUMENTS])
    for filename in recommendation_entry[DOCUMENTS]:
        document_entry = recommendation_entry[DOCUMENTS][filename]
        document_entry[PARAGRAPHS] = sort_and_label(document_entry[PARAGRAPHS])

# Index by keyword: recommendation -> document -> paragraph.
for keyword in summary_data[KEYWORDS]:
    keyword_entry = summary_data[KEYWORDS][keyword]
    keyword_entry[RECOMMENDATIONS] = sort_and_label(keyword_entry[RECOMMENDATIONS])
    for recommendation in keyword_entry[RECOMMENDATIONS]:
        recommendation_entry = keyword_entry[RECOMMENDATIONS][recommendation]
        recommendation_entry[DOCUMENTS] = sort_and_label(recommendation_entry[DOCUMENTS])
        for url in recommendation_entry[DOCUMENTS]:
            document_entry = recommendation_entry[DOCUMENTS][url]
            document_entry[PARAGRAPHS] = sort_and_label(document_entry[PARAGRAPHS])

In [176]:
def make_large_number_readable(number_in):
    """Format a number with ``.`` as thousands separator.

    The value is converted with ``str`` and its characters are grouped by
    three from the right, e.g. ``1234567`` becomes ``"1.234.567"``. A leading
    minus sign is kept outside the grouping, so ``-123456`` becomes
    ``"-123.456"`` and not ``"-.123.456"``.

    Args:
        number_in: the value to format, normally a non-float number.

    Returns:
        The formatted string.
    """
    digits = str(number_in)
    sign = ""
    if digits.startswith("-"):
        sign, digits = "-", digits[1:]
    # The first group holds the 1-2 leftover characters, if there are any.
    head_length = len(digits) % 3
    groups = [digits[:head_length]] if head_length else []
    groups.extend(digits[start:start + 3] for start in range(head_length, len(digits), 3))
    return sign + ".".join(groups)


def make_large_numbers_readable(dict_):
    """Replace every integer in a nested dict by its formatted string, in place.

    Integers are formatted with ``make_large_number_readable`` and nested
    dicts are processed recursively. Booleans are left alone although ``bool``
    is a subclass of ``int``; all other values are left alone as well.

    Args:
        dict_: the dict to convert; it is modified.

    Returns:
        The same dict object.
    """
    for key, value in dict_.items():
        if isinstance(value, bool):
            # Without this guard True would be formatted as the text "T.rue".
            continue
        if isinstance(value, int):
            dict_[key] = make_large_number_readable(value)
        elif isinstance(value, dict):
            dict_[key] = make_large_numbers_readable(value)
    return dict_

In [177]:
# Turn all integer counts in the summary into strings with thousands separators.
# After this step the counts are text and can no longer be used for arithmetic.
summary_data = make_large_numbers_readable(summary_data)

In [178]:
# Store the summary as one line of JSON. The context manager closes the file
# even when serialising or writing fails.
with open("paragraphs.json", "w") as out_file:
    print(json.dumps(summary_data), file=out_file)

In [180]:
def json2html(data, outfile, top=False, counter=0):
    """Write nested dict data as a collapsible HTML list.

    A value that is not a dict is written as grey text. A dict is written as
    a ``<div>`` holding a ``<ul>`` with one ``<li>`` per key. Only the top
    level starts out visible; every other level is hidden and preceded by an
    "open" link that calls the page's ``toggle`` function. Keys and values
    are written without HTML escaping.

    Args:
        data: the value to write.
        outfile: writable text file object.
        top: True for the outermost call only.
        counter: number used for the id of this level's ``<div>``.

    Returns:
        The highest counter value used so far.
    """
    if type(data) is not dict:
        print(f"<font style=\"color:grey;\">{data}</font>", file=outfile)
        return counter
    if not top:
        print(f"<a href=\"javascript:toggle('div{counter}')\" id=\"div{counter}link\">open</a>", file=outfile)
        print(f"<div id=\"div{counter}\" style=\"display:none\">\n<ul>", file=outfile)
    else:
        print(f"<div id=\"div{counter}\" style=\"display:block\">\n<ul>", file=outfile)
    for key, value in data.items():
        print("<li>", key, ":", file=outfile)
        # Every entry gets a fresh id; nested levels continue the numbering.
        counter = json2html(value, outfile, counter=counter + 1)
        print("</li>", file=outfile)
    print("</ul>\n</div>", file=outfile)
    return counter

# Write the summary as a self-contained HTML page with collapsible sections.
# NOTE(review): BASE_DIR points at richtlijnen.nhg.org while this title says
# richtlijnen.nhg.nl -- confirm which of the two is intended.
title = "analyse richtlijnen.nhg.nl"
# The page declares charset utf-8 in its <meta> tag, so the file has to be
# written as UTF-8 and not in the platform default encoding. The context
# manager closes the file even when writing fails.
with open("index.html", "w", encoding="utf-8") as outfile:
    print(f"<html>\n<head>\n<meta charset=\"utf-8\"/>\n<title>{title}</title>\n<script type=\"text/javascript\">", file=outfile)
    # toggle() shows or hides one section and flips its link text between "open" and "sluit".
    print("function toggle(divid) {\nvar item=document.getElementById(divid); if (item) { item.style.display=(item.style.display=='none')?'block':'none'; }\n"
          "var itemlink=document.getElementById(divid+'link'); if (itemlink) { itemlink.text=(itemlink.text=='open')?'sluit':'open'; }}", file=outfile)
    print(f"</script>\n</head>\n<body><h2>{title}</h2>", file=outfile)
    json2html(summary_data, outfile, top=True)
    print("</body>\n</html>", file=outfile)