# JSON diff

Find the differences between two JSON analysis files and record them in a new version of the most recent file.

## 1. Compare two json files with paragraphs

In [1]:
import datetime
import json
import re

In [2]:
def read_json_file(file_name):
    """Read a JSON file and return the decoded data.

    The file is opened as UTF-8 (the encoding JSON text is exchanged in)
    and is closed even when reading or decoding fails.

    Args:
        file_name: path of the JSON file to read.

    Returns:
        The decoded JSON value (for the analysis files: a nested dict).

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    with open(file_name, "r", encoding="utf-8") as infile:
        return json.load(infile)

In [3]:
def remove_counts(paragraphs):
    """Add count-free aliases for every key that starts with "(<digits>) ".

    For each key of the form "(12) name" an extra key "name" is added that
    refers to the very same value object, so both versions of a file can be
    looked up by name regardless of the count in the key. Nested dicts are
    processed recursively.

    Note that `paragraphs` itself is modified in place; the return value is
    a shallow copy of the modified dict.

    Args:
        paragraphs: (nested) dict with string keys.

    Returns:
        A shallow copy of `paragraphs` including the alias keys.
    """
    aliases = {}
    for key in paragraphs:
        if isinstance(paragraphs[key], dict):
            paragraphs[key] = remove_counts(paragraphs[key])
        # raw string: "\(" and "\d" are not valid escapes in a normal string
        new_key = re.sub(r'^\(\d+\) ', "", key)
        if new_key != key:
            aliases[new_key] = key
    # aliases are added after the loop: a dict may not grow while iterated
    for new_key, key in aliases.items():
        paragraphs[new_key] = paragraphs[key]
    return dict(paragraphs)


assert remove_counts({ "1": 1, "(2) 2": 2 }) == { "1": 1, "(2) 2": 2, "2": 2 } , "test 1 failed!"
assert remove_counts({ "0": { "1": 1, "(2) 2": 2 }}) == { "0": { "1": 1, "(2) 2": 2, "2": 2 }} , "test 2 failed!"

In [4]:
def undo_remove_counts(paragraphs):
    """Remove the count-free alias keys again.

    For every key of the form "(<digits>) name" the plain key "name" is
    deleted when present. Nested dicts are processed recursively.

    Note that `paragraphs` itself is modified in place; the return value is
    a shallow copy of the modified dict.

    Args:
        paragraphs: (nested) dict with string keys.

    Returns:
        A shallow copy of `paragraphs` without the alias keys.
    """
    # a set: two counted keys can share one alias, which must be deleted once
    aliases = set()
    for key in paragraphs:
        if isinstance(paragraphs[key], dict):
            paragraphs[key] = undo_remove_counts(paragraphs[key])
        # raw string: "\(" and "\d" are not valid escapes in a normal string
        new_key = re.sub(r'^\(\d+\) ', "", key)
        if new_key != key and new_key in paragraphs:
            aliases.add(new_key)
    # deletion happens after the loop: a dict may not shrink while iterated
    for new_key in aliases:
        del paragraphs[new_key]
    return dict(paragraphs)


assert undo_remove_counts({ "1": 1, "(2) 2": 2, "2": 2 }) == { "1": 1, "(2) 2": 2 }, "test 1 failed!"
assert undo_remove_counts({ "0": {"1": 1, "(2) 2": 2, "2": 2 }}) == { "0": { "1": 1, "(2) 2": 2 }}, "test 2 failed!"

In [5]:
def compare_items(old, new):
    """Tell whether an item differs between the old and the new version.

    Only summary figures are compared: the value stored under
    "aantal gevonden sleuteltermen" and the number of entries under
    "documenten", "paragrafen", "richtlijnen" and "sleuteltermen". A field
    is only taken into account when it is present in both items.

    Args:
        old: item from the old version.
        new: the corresponding item from the new version.

    Returns:
        True when at least one compared figure differs, otherwise False.
    """
    def in_both(field):
        return field in old and field in new

    count_field = "aantal gevonden sleuteltermen"
    if in_both(count_field) and old[count_field] != new[count_field]:
        return True
    for field in ("documenten", "paragrafen", "richtlijnen", "sleuteltermen"):
        if in_both(field) and len(old[field]) != len(new[field]):
            return True
    return False


def compare_sub_groups(paragraphs_new, paragraphs_old):
    """Mark new, changed and removed entries in the sub groups of an item.

    The sub groups "documenten", "richtlijnen" and "sleuteltermen" of
    `paragraphs_new` are compared with those of `paragraphs_old`:

    * an entry that only exists in the new version gets status "nieuw",
      which is propagated to everything below it;
    * an entry for which compare_items() reports a difference gets status
      "veranderd"; entries present in both versions are compared
      recursively;
    * an entry that only exists in the old version is copied into the new
      version under the key "(<count>) <name>" with status "verwijderd",
      which is propagated to everything below it.

    Keys starting with "(<digits>) " are skipped. The new file name
    "index.html" is looked up under the empty name "" in the old version
    and vice versa. A sub group that is missing from the old version is
    treated as empty, so all of its new entries are reported as "nieuw".

    Args:
        paragraphs_new: item of the new version; modified in place.
        paragraphs_old: the corresponding item of the old version.
    """
    for sub_group_name in ["documenten", "richtlijnen", "sleuteltermen"]:
        if sub_group_name not in paragraphs_new:
            continue
        entries_new = paragraphs_new[sub_group_name]
        # no such sub group in the old version: nothing to compare against
        entries_old = paragraphs_old.get(sub_group_name, {})

        for file_name_new in entries_new:
            if re.search(r'^\(\d+\) ', file_name_new):
                continue
            file_name_old = "" if file_name_new == "index.html" else file_name_new
            if file_name_old not in entries_old:
                entries_new[file_name_new]["status"] = "nieuw"
                propagate_status(entries_new[file_name_new], "nieuw")
            else:
                if compare_items(entries_old[file_name_old], entries_new[file_name_new]):
                    entries_new[file_name_new]["status"] = "veranderd"
                compare_sub_groups(entries_new[file_name_new], entries_old[file_name_old])

        for file_name_old in entries_old:
            if re.search(r'^\(\d+\) ', file_name_old):
                continue
            file_name_new = "index.html" if file_name_old == "" else file_name_old
            if file_name_new not in entries_new:
                # removed entries are stored under a counted key only
                key = f'({entries_old[file_name_old]["aantal gevonden sleuteltermen"]}) {file_name_old}'
                entries_new[key] = dict(entries_old[file_name_old])
                entries_new[key]["status"] = "verwijderd"
                propagate_status(entries_new[key], "verwijderd")


def propagate_status(paragraphs_new, status):
    """Set `status` on every entry below an item, recursively.

    All entries in the sub groups "documenten", "richtlijnen" and
    "sleuteltermen" of `paragraphs_new` whose key does not start with
    "(<digits>) " get their "status" field set, and so do the entries in
    their own sub groups.

    Args:
        paragraphs_new: item whose descendants are updated in place.
        status: the status value to store.
    """
    for sub_group_name in ["documenten", "richtlijnen", "sleuteltermen"]:
        if sub_group_name not in paragraphs_new:
            continue
        for file_name_new, entry in paragraphs_new[sub_group_name].items():
            # raw string: "\(" and "\d" are not valid escapes otherwise
            if not re.search(r'^\(\d+\) ', file_name_new):
                entry["status"] = status
                propagate_status(entry, status)
                    
                    
def compare_group(paragraphs_new, paragraphs_old, group_name):
    """Compare one top-level group of the new and the old version.

    Every entry of `paragraphs_new[group_name]` whose key does not start
    with "(<digits>) " is handled as follows:

    * missing from the old version: status "nieuw", propagated downwards;
    * compare_items() reports a difference: status "veranderd";
    * present in both versions: its sub groups are compared with
      compare_sub_groups().

    Entries that only exist in the old version are copied into the new
    version under the key "(<count>) <name>" with status "verwijderd",
    propagated downwards.

    Args:
        paragraphs_new: data of the new version; modified in place.
        paragraphs_old: data of the old version.
        group_name: key of the group to compare; both versions are indexed
            with it directly.
    """
    group_new = paragraphs_new[group_name]
    group_old = paragraphs_old[group_name]

    for measure in group_new:
        # raw string: "\(" and "\d" are not valid escapes otherwise
        if re.search(r'^\(\d+\) ', measure):
            continue
        if measure not in group_old:
            group_new[measure]["status"] = "nieuw"
            propagate_status(group_new[measure], "nieuw")
            continue
        if compare_items(group_old[measure], group_new[measure]):
            group_new[measure]["status"] = "veranderd"
        compare_sub_groups(group_new[measure], group_old[measure])

    for measure in group_old:
        if re.search(r'^\(\d+\) ', measure):
            continue
        if measure not in group_new:
            # removed entries are stored under a counted key only
            key = f'({group_old[measure]["aantal gevonden sleuteltermen"]}) {measure}'
            group_new[key] = dict(group_old[measure])
            group_new[key]["status"] = "verwijderd"
            propagate_status(group_new[key], "verwijderd")

                
def compare(paragraphs_new, paragraphs_old):
    """Compare the groups "richtlijnen" and "sleuteltermen" of two versions.

    The comparison results are stored in `paragraphs_new` by compare_group().
    """
    for group_name in ("richtlijnen", "sleuteltermen"):
        compare_group(paragraphs_new, paragraphs_old, group_name)

In [43]:
def json2html(data, outfile, top=False, counter=0):
    """Write `data` to `outfile` as a nested, collapsible HTML list.

    A value that is not a dict is written as grey text. A dict becomes a
    <div> holding a <ul> with one <li> per key; the top-level div is shown,
    every other div starts hidden behind an "open" link that calls the
    JavaScript function toggle(). List items whose value is a dict with a
    "status" field are coloured by that status.

    Args:
        data: the value to write.
        outfile: text stream to write to.
        top: True for the outermost call only.
        counter: number used for the id of the div written by this call.

    Returns:
        The highest counter value handed out so far.
    """
    if type(data) is not dict:
        print(f"<font style=\"color:grey;\">{data}</font>", file=outfile)
        return counter

    if not top:
        print(f"<a href=\"javascript:toggle('div{counter}')\" id=\"div{counter}link\">open</a>", file=outfile)
    display = "block" if top else "none"
    print(f"<div id=\"div{counter}\" style=\"display:{display}\">\n<ul>", file=outfile)

    status_tags = (
        ("nieuw", "<li style=\"background-color: lightgreen;\">"),
        ("veranderd", "<li style='background-color: yellow;'>"),
        ("verwijderd", "<li style=\"background-color: pink;\">"),
    )
    for key, value in data.items():
        if type(value) is not dict or "status" not in value:
            print("<li style=\"background-color: white;\">", key, ":", file=outfile)
        else:
            for status, opening_tag in status_tags:
                if value["status"] == status:
                    print(opening_tag, key, ":", file=outfile)
                    break
            else:
                print("<li>", key, ":", "ERROR: unexpected status:", value["status"], file=outfile)
        # every key gets its own number, used as div id when the value is a dict
        counter = json2html(value, outfile, counter=counter + 1)
        print("</li>", file=outfile)
    print("</ul>\n</div>", file=outfile)
    return counter

In [49]:
# Collapsible (Dutch) explanation that print_paragraphs() writes above the
# analysis. The labels quoted in the text must match the keys shown in the
# generated list ("richtlijnen", "sleuteltermen").
explanation = ("<p><strong>Uitleg</strong> "
"<a href=\"javascript:toggle('divexplanation')\" id=\"divexplanationlink\">open</a></p> "
"<div id=\"divexplanation\" style=\"display:none\"> "
"<p>Op deze pagina staat een analyse van de website <a href=\"https://richtlijnendatabase.nl\">richtlijnendatabase.nl</a>. De analyse betreft het voorkomen van termen over het "
"onderwerp <strong>ehealth</strong>, de zogenaamde sleuteltermen. Alle paragrafen die sleuteltermen bevatten zijn geselecteerd en de aantallen sleuteltermen zijn geteld, zowel "
"per paragraaf, per document als voor de gehele website.</p>"
"<p>De website richtlijnendatabase.nl bevat een lijst van medische richtlijnen voor het behandelen van aandoeningen. Deze analyse van de website is gedaan op twee niveau's: "
"richtlijnen en sleuteltermen. Voor elke richtlijn en voor elk sleutelterm is een analyse gemaakt, en is de paragraaf met de meeste sleuteltermen gekozen: de exemplarische "
"paragraaf. De analyse kan worden ingezien door eerst op \"richtlijnen\" of \"sleuteltermen\" te klikken en dan op de naam van de richtlijn of de sleutelterm.</p>"
"<p>Omdat de website richtlijnendatabase.nl regelmatig verandert, houden we met kleurcodes bij welke richtlijnen, documenten en sleuteltermen nieuw zijn, of zijn veranderd. "
"Nieuwe richtlijnen, documenten en sleuteltermen zijn <font style=\"background-color: lightgreen;\">groen</font> gekleurd en veranderde elementen zijn "
"<font style=\"background-color: yellow;\">geel</font>. Daarnaast zijn verwijderde richtlijnen, documenten en sleuteltermen in een <font style=\"background-color: pink;\">rode</font> "
"kleur opgenomen in de lijsten. De aantallen gevonden in verwijderde elementen worden niet meegeteld op de hogere niveau's. In deze analyse wordt de versie van de website van 20 april "
"2021 vergeleken met de versie van 15 maart 2021.</p>"
"<p>Bij de vergelijking op basis van verandering wordt alleen gekeken naar de aantallen gevonden sleuteltermen, de aantallen relevante paragrafen, de aantallen relevante documenten "
"en de aantallen relevante richtlijnen. De documenten worden niet woord-voor-woord vergeleken omdat kleine veranderingen, zoals voor webnavigatie, voor deze analyse niet interessant "
"zijn.</p>"
"</div> ")

In [88]:
def print_paragraphs(paragraphs, file_name="index.html"):
    """Write the analysis as a self-contained HTML page.

    The page holds a generation timestamp, links to the separate versions,
    the colour legend, the explanation text and the analysis itself as a
    collapsible list (see json2html()).

    The file is written as UTF-8, matching the charset declared in the
    page header, and is closed even when writing fails.

    Args:
        paragraphs: the (annotated) analysis data to show.
        file_name: path of the HTML file to write.
    """
    title = "analyse richtlijnendatabase.nl"
    date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    with open(file_name, "w", encoding="utf-8") as outfile:
        print(f"<html>\n<head>\n<meta charset=\"utf-8\"/>\n<title>{title}</title>\n<script type=\"text/javascript\">", file=outfile)
        # toggle() shows/hides a div and flips the text of its open/sluit link
        print("function toggle(divid) {\nvar item=document.getElementById(divid); if (item) { item.style.display=(item.style.display=='none')?'block':'none'; }\n"
              "var itemlink=document.getElementById(divid+'link'); if (itemlink) { itemlink.text=(itemlink.text=='open')?'sluit':'open'; }}", file=outfile)
        print(f"</script>\n</head>\n<body><p>Generated at {date}</p>", file=outfile)
        print("<p>Versies: <a href=\"index-20210315.html\">15 maart 2021</a> | <a href=\"index-20210420.html\">20 april 2021</a></p>", file=outfile)
        print("<p>Kleurcodes: <font style=\"background-color: lightgreen;\">Nieuw</font> | <font style=\"background-color: yellow;\">Veranderd</font> | <font style=\"background-color: pink;\">Verwijderd</font></p>", file=outfile)
        print(explanation, file=outfile)
        print(f"<h2>{title}</h2>", file=outfile)
        json2html(paragraphs, outfile, top=True)
        print("<p>Klaar met laden</p></body>\n</html>", file=outfile)

In [44]:
# Load both versions and add count-free aliases for the counted keys, so
# that entries can be matched by name.
paragraphs_old = remove_counts(read_json_file("paragraphs-20210315.json"))
paragraphs_new = remove_counts(read_json_file("paragraphs-20210420.json"))

# Record the differences in the new version, then drop the aliases again.
compare(paragraphs_new, paragraphs_old)
paragraphs_new = undo_remove_counts(paragraphs_new)

In [89]:
# Write the annotated comparison of the two versions as an HTML page.
print_paragraphs(paragraphs_new)

## 2. Divide documents into classes

Classes: behandeling, beleid, diagnostiek, overige

In [77]:
# Maps a token to the class of the names that contain it. The tokens are
# used as regular expressions by match_target_tokens(), which returns the
# class of the first matching token in the order given here.
target_tokens = {
    "beeldvorming": "diagnostiek",
    "behandel": "behandeling",
    "beleid": "beleid",
    "beoordeling": "diagnostiek",
    "beschrijving": "diagnostiek",
    "besluitvorming": "beleid",
    "care": "behandeling",
    "chirurgie": "behandeling",
    "communicatie": "beleid",
    "detectie": "diagnostiek",
    "diagnose": "diagnostiek",
    "diagnostiek": "diagnostiek",
    "diagnostische": "diagnostiek",
    "duiding": "diagnostiek",
    "herkennen": "diagnostiek",
    "identificati": "diagnostiek",
    "informeren": "beleid",
    "interventie": "behandeling",
    "interventies": "behandeling",
    "onderzoek": "diagnostiek",
    "preventie": "beleid",
    "procedure": "beleid",
    "reiniging": "behandeling",
    "revalidatie": "behandeling",
    "samenwerking": "beleid",
    "screening": "diagnostiek",
    "therapie": "behandeling",
    "voorkomen": "beleid",
    "voorlicht": "beleid",
    "zorg": "behandeling",
}

In [84]:
def match_target_tokens(string):
    """Return the class of the first target token found in `string`.

    The tokens of `target_tokens` are tried in dict order as regular
    expressions (re.search).

    Args:
        string: the name to classify.

    Returns:
        The class belonging to the first matching token, or False when no
        token matches.
    """
    matching_classes = (
        label
        for token, label in target_tokens.items()
        if re.search(token, string)
    )
    return next(matching_classes, False)

In [87]:
# Assign a class to every guideline: one class for the whole guideline when
# its name contains a target token, otherwise one class per document
# ("overige" for documents without a matching token).
classes = {}
for treatment in paragraphs_new["richtlijnen"]:
    match = match_target_tokens(treatment)
    if match:
        classes[treatment] = match
    else:
        classes[treatment] = {}
        for document in paragraphs_new["richtlijnen"][treatment]["documenten"]:
            match = match_target_tokens(document)
            classes[treatment][document] = match if match else "overige"

In [101]:
def _empty_step():
    """Return a fresh, empty summary record for one class ("stap")."""
    return {
        "aantal gevonden sleuteltermen": 0,
        "aantal tokens": 0,
        "aantal types": 0,
        "exemplarische paragraaf": {},
        "richtlijnen": {},
        "sleuteltermen": {},
    }


def _count_to_int(formatted_count):
    """Convert a count string to int after removing all "." characters."""
    return int(re.sub(r'\.', "", formatted_count))


# Group the documents of all guidelines by class and add up their counts.
paragraphs_new["stappen"] = {}
steps = paragraphs_new["stappen"]
for treatment, assigned in classes.items():
    classified_per_document = type(assigned) is dict
    # a guideline with a single class registers that class even when the
    # guideline turns out to have no documents
    if not classified_per_document and assigned not in steps:
        steps[assigned] = _empty_step()
    documents = paragraphs_new["richtlijnen"][treatment]["documenten"]
    for document, details in documents.items():
        step_name = assigned[document] if classified_per_document else assigned
        if step_name not in steps:
            steps[step_name] = _empty_step()
        step = steps[step_name]
        for count_name in ("aantal gevonden sleuteltermen", "aantal tokens", "aantal types"):
            step[count_name] += _count_to_int(details[count_name])
        if treatment not in step["richtlijnen"]:
            step["richtlijnen"][treatment] = { "documenten": {} }
        step["richtlijnen"][treatment]["documenten"][document] = dict(details)

In [102]:
# Write the HTML page again, now that paragraphs_new also holds "stappen".
print_paragraphs(paragraphs_new)