# JSON diff

Find the differences between two JSON analysis files and record them in a new version of the latest file.

## 1. Compare two json files with paragraphs

In [1]:
import datetime
import json
import re

In [2]:
def read_json_file(file_name):
    """Read a JSON file and return the decoded data.

    Args:
        file_name: path of the JSON file to read.

    Returns:
        The Python object obtained by decoding the contents of the file.
    """
    # The context manager closes the file even when reading or decoding raises;
    # JSON text is read as UTF-8 regardless of the platform default encoding.
    with open(file_name, "r", encoding="utf-8") as infile:
        return json.load(infile)

In [3]:
def remove_counts(paragraphs, delete_flag=False):
    """Add count-free aliases for keys that start with a "(<digits>) " prefix.

    The dictionary is processed recursively and modified in place: for every
    key such as "(2) name" an entry "name" with the same value is added.

    Args:
        paragraphs: (nested) dictionary to process.
        delete_flag: when True, the original prefixed keys are removed after
            their alias has been added.

    Returns:
        A shallow copy of the modified top-level dictionary.
    """
    aliases = {}
    for key in paragraphs:
        if isinstance(paragraphs[key], dict):
            paragraphs[key] = remove_counts(paragraphs[key], delete_flag)
        # Raw string: "\(" and "\d" are regex escapes, not string escapes.
        new_key = re.sub(r'^\(\d+\) ', "", key)
        if new_key != key:
            aliases[new_key] = key
    # Keys are added/removed only after the iteration over the dict has finished.
    for new_key, old_key in aliases.items():
        paragraphs[new_key] = paragraphs[old_key]
        if delete_flag:
            del paragraphs[old_key]
    return dict(paragraphs)

assert remove_counts({ "1": 1, "(2) 2": 2 }) == { "1": 1, "(2) 2": 2, "2": 2 } , "test 1 failed!"
assert remove_counts({ "0": { "1": 1, "(2) 2": 2 }}) == { "0": { "1": 1, "(2) 2": 2, "2": 2 }} , "test 2 failed!"

In [4]:
def undo_remove_counts(paragraphs):
    """Remove count-free aliases of keys that start with a "(<digits>) " prefix.

    The dictionary is processed recursively and modified in place: when both
    "(2) name" and "name" are present, the entry "name" is deleted.

    Args:
        paragraphs: (nested) dictionary to process.

    Returns:
        A shallow copy of the modified top-level dictionary.
    """
    # A set, so that an alias shared by several prefixed keys is deleted only
    # once (deleting it twice would raise KeyError).
    aliases = set()
    for key in paragraphs:
        if isinstance(paragraphs[key], dict):
            paragraphs[key] = undo_remove_counts(paragraphs[key])
        # Raw string: "\(" and "\d" are regex escapes, not string escapes.
        new_key = re.sub(r'^\(\d+\) ', "", key)
        if new_key != key and new_key in paragraphs:
            aliases.add(new_key)
    for new_key in aliases:
        del paragraphs[new_key]
    return dict(paragraphs)

assert undo_remove_counts({ "1": 1, "(2) 2": 2, "2": 2 }) == { "1": 1, "(2) 2": 2 }, "test 1 failed!"
assert undo_remove_counts({ "0": {"1": 1, "(2) 2": 2, "2": 2 }}) == { "0": { "1": 1, "(2) 2": 2 }}, "test 2 failed!"

In [5]:
def compare_items(old, new):
    """Tell whether two analysis items differ in their summary numbers.

    The keyword count is compared by value; the groups "documenten",
    "paragrafen", "richtlijnen" and "sleuteltermen" are compared by size.
    A field is only compared when it is present in both items.

    Args:
        old: item from the old analysis.
        new: item from the new analysis.

    Returns:
        True when at least one compared field differs, otherwise False.
    """
    count_name = "aantal gevonden sleuteltermen"
    if count_name in old and count_name in new and old[count_name] != new[count_name]:
        return True
    for group_name in ("documenten", "paragrafen", "richtlijnen", "sleuteltermen"):
        if group_name in old and group_name in new and len(old[group_name]) != len(new[group_name]):
            return True
    return False


In [6]:
def compare_sub_groups(paragraphs_new, paragraphs_old):
    """Recursively mark differences between the sub groups of two items.

    For each of the sub groups "documenten", "richtlijnen" and "sleuteltermen"
    present in paragraphs_new, entries whose key does not start with a
    "(<digits>) " prefix are compared with the old analysis:

    * entries missing from the old analysis get status "nieuw" (propagated
      to their descendants);
    * entries for which compare_items() reports a difference get status
      "veranderd", and their own sub groups are compared recursively;
    * entries only present in the old analysis are copied into the new one
      under the key "(<count>) <name>" with status "verwijderd".

    The new key "index.html" is matched with the old key "".

    Args:
        paragraphs_new: item from the new analysis; modified in place.
        paragraphs_old: matching item from the old analysis.
    """
    for sub_group_name in ["documenten", "richtlijnen", "sleuteltermen"]:
        if sub_group_name not in paragraphs_new:
            continue
        group_new = paragraphs_new[sub_group_name]
        # A sub group that is missing from the old analysis is treated as
        # empty, so that all its entries are reported as new instead of
        # raising a KeyError.
        group_old = paragraphs_old.get(sub_group_name, {})
        for file_name_new in group_new:
            # Keys with a count prefix are skipped; only the count-free names
            # are compared.
            if re.search(r'^\(\d+\) ', file_name_new):
                continue
            file_name_old = "" if file_name_new == "index.html" else file_name_new
            if file_name_old not in group_old:
                group_new[file_name_new]["status"] = "nieuw"
                propagate_status(group_new[file_name_new], "nieuw")
            else:
                if compare_items(group_old[file_name_old], group_new[file_name_new]):
                    group_new[file_name_new]["status"] = "veranderd"
                compare_sub_groups(group_new[file_name_new], group_old[file_name_old])
        for file_name_old in group_old:
            if re.search(r'^\(\d+\) ', file_name_old):
                continue
            file_name_new = "index.html" if file_name_old == "" else file_name_old
            if file_name_new not in group_new:
                # Removed entries are stored under a key with count prefix.
                key = f'({group_old[file_name_old]["aantal gevonden sleuteltermen"]}) {file_name_old}'
                group_new[key] = dict(group_old[file_name_old])
                group_new[key]["status"] = "verwijderd"
                propagate_status(group_new[key], "verwijderd")

In [7]:
def propagate_status(paragraphs_new, status):
    """Recursively assign a status to all descendants of an item.

    Every entry of the sub groups "documenten", "richtlijnen" and
    "sleuteltermen" whose key does not start with a "(<digits>) " prefix
    gets the given status, as do the entries of its own sub groups.

    Args:
        paragraphs_new: item whose descendants are updated in place.
        status: value stored under the key "status".
    """
    for sub_group_name in ["documenten", "richtlijnen", "sleuteltermen"]:
        if sub_group_name in paragraphs_new:
            for file_name_new, item in paragraphs_new[sub_group_name].items():
                # Raw string: "\(" and "\d" are regex escapes, not string escapes.
                if not re.search(r'^\(\d+\) ', file_name_new):
                    item["status"] = status
                    propagate_status(item, status)


In [8]:
def compare_group(paragraphs_new, paragraphs_old, group_name):
    """Mark the differences between one top-level group of two analyses.

    Entries whose key starts with a "(<digits>) " prefix are skipped. Of the
    other entries, those missing from the old analysis get status "nieuw"
    (propagated to their descendants), those for which compare_items()
    reports a difference get status "veranderd", and those only present in
    the old analysis are copied into the new analysis under the key
    "(<count>) <name>" with status "verwijderd".

    Args:
        paragraphs_new: new analysis; modified in place.
        paragraphs_old: old analysis.
        group_name: name of the top-level group to compare.
    """
    group_new = paragraphs_new[group_name]
    group_old = paragraphs_old[group_name]
    for measure in group_new:
        # Raw string: "\(" and "\d" are regex escapes, not string escapes.
        if re.search(r'^\(\d+\) ', measure):
            continue
        if measure not in group_old:
            group_new[measure]["status"] = "nieuw"
            propagate_status(group_new[measure], "nieuw")
            continue
        if compare_items(group_old[measure], group_new[measure]):
            group_new[measure]["status"] = "veranderd"
        # Entries present in both analyses are always compared in depth.
        compare_sub_groups(group_new[measure], group_old[measure])
    for measure in group_old:
        if re.search(r'^\(\d+\) ', measure):
            continue
        if measure not in group_new:
            # Removed entries are stored under a key with count prefix.
            key = f'({group_old[measure]["aantal gevonden sleuteltermen"]}) {measure}'
            group_new[key] = dict(group_old[measure])
            group_new[key]["status"] = "verwijderd"
            propagate_status(group_new[key], "verwijderd")

In [9]:
def compare(paragraphs_new, paragraphs_old):
    """Compare the groups "richtlijnen" and "sleuteltermen" of two analyses.

    Args:
        paragraphs_new: new analysis; passed on to compare_group().
        paragraphs_old: old analysis.
    """
    for group_name in ("richtlijnen", "sleuteltermen"):
        compare_group(paragraphs_new, paragraphs_old, group_name)

In [10]:
def json2html(data, outfile, counter=0, top=False):
    """Write nested data to outfile as nested, collapsible HTML lists.

    Values that are not dictionaries are written as grey text. A dictionary
    becomes a <div> holding a <ul> with one <li> per key; the background
    colour of the <li> depends on the "status" of the value.

    Args:
        data: value to write.
        outfile: writable text stream.
        counter: number used in the id of the <div> of this level.
        top: when True the <div> is shown initially and gets no toggle link.

    Returns:
        The value of the counter after processing data.
    """
    if type(data) is not dict:
        print(f"<font style=\"color:grey;\">{data}</font>", file=outfile)
        return counter

    if not top:
        print(f"<a href=\"javascript:toggle('div{counter}')\" id=\"div{counter}link\">open</a>", file=outfile)
        print(f"<div id=\"div{counter}\" style=\"display:none\">\n<ul>", file=outfile)
    else:
        print(f"<div id=\"div{counter}\" style=\"display:block\">\n<ul>", file=outfile)

    # Opening tags per status, checked in this order.
    status_tags = (
        ("nieuw", "<li style=\"background-color: lightgreen;\">"),
        ("veranderd", "<li style='background-color: yellow;'>"),
        ("verwijderd", "<li style=\"background-color: pink;\">"),
    )
    for key, value in data.items():
        if type(value) is not dict or "status" not in value:
            print("<li style=\"background-color: white;\">", key, ":", file=outfile)
        else:
            status = value["status"]
            opening_tag = next((tag for name, tag in status_tags if status == name), None)
            if opening_tag is None:
                print("<li>", key, ":", "ERROR: unexpected status:", status, file=outfile)
            else:
                print(opening_tag, key, ":", file=outfile)
        # Every key gets its own number; nested keys continue the count.
        counter = json2html(value, outfile, counter=counter + 1)
        print("</li>", file=outfile)
    print("</ul>\n</div>", file=outfile)
    return counter

In [11]:
# HTML fragment (in Dutch) with a collapsible explanation of the analysis; it is
# written to the index page by write_to_index_file(). The text mentions the
# dates of the two compared website versions literally, so it needs to be
# updated by hand when other versions are compared.
explanation = ("<p><strong>Uitleg</strong> "
"<a href=\"javascript:toggle('divexplanation')\" id=\"divexplanationlink\">open</a></p> "
"<div id=\"divexplanation\" style=\"display:none\"> "
"<p>Op deze pagina staat een analyse van de website <a href=\"https://richtlijnendatabase.nl\">richtlijnendatabase.nl</a>. De analyse betreft het voorkomen van termen over het "
"onderwerp <strong>ehealth</strong>, de zogenaamde sleuteltermen. Alle paragrafen die sleuteltermen bevatten zijn geselecteerd en de aantallen sleuteltermen zijn geteld, zowel "
"per paragraaf, per document als voor de gehele website.</p>"
"<p>De website richtlijnendatabase.nl bevat een lijst van medische richtlijnen voor het behandelen van aandoeningen. Deze analyse van de website is gedaan op twee niveau's: "
"richtlijnen en sleuteltermen. Voor elke richtlijn en voor elk sleutelterm is een analyse gemaakt, en is de paragraaf met de meeste sleuteltermen gekozen: de exemplarische "
"paragraaf. De analyse kan worden ingezien door eerst op \"richtlijnen\" of \"sleuteltermen\" te klikken en dan op de naam van de richtlijn of de sleutelterm.</p>"
"<p>Omdat de website richtlijnendatabase.nl regelmatig verandert, houden we met kleurcodes bij welke richtlijnen, documenten en sleuteltermen nieuw zijn, of zijn veranderd. "
"Nieuwe richtlijnen, documenten en sleuteltermen zijn <font style=\"background-color: lightgreen;\">groen</font> gekleurd en veranderde elementen zijn "
"<font style=\"background-color: yellow;\">geel</font>. Daarnaast zijn verwijderde richtlijnen, documenten en sleuteltermen in een <font style=\"background-color: pink;\">rode</font> "
"kleur opgenomen in de lijsten. De aantallen gevonden in verwijderde elementen worden niet meegeteld op de hogere niveau's. In deze analyse wordt de versie van de website van 12 juli "
"2021 vergeleken met de versie van 21 juni 2021.</p>"
"<p>Bij de vergelijking op basis van verandering wordt alleen gekeken naar de aantallen gevonden sleuteltermen, de aantallen relevante paragrafen, de aantallen relevante documenten "
"en de aantallen relevante richtlijnen. De documenten worden niet woord-voor-woord vergeleken omdat kleine veranderingen, zoals voor webnavigatie, voor deze analyse niet interessant "
"zijn.</p>"
"</div> ")

In [12]:
def write_to_index_file(paragraphs, file_name="index.html"):
    """Write the analysis as a collapsible HTML page.

    The page consists of a header with the generation time, links to the
    stored versions, a colour legend, the explanation text, and the analysis
    itself as written by json2html().

    Args:
        paragraphs: analysis data to write.
        file_name: path of the HTML file to (over)write.
    """
    title = "analyse richtlijnendatabase.nl"
    date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # The page declares charset utf-8, so it is written as UTF-8 regardless of
    # the platform default; the context manager closes the file on errors too.
    with open(file_name, "w", encoding="utf-8") as outfile:
        print(f"<html>\n<head>\n<meta charset=\"utf-8\"/>\n<title>{title}</title>\n<script type=\"text/javascript\">", file=outfile)
        # Javascript that shows/hides a <div> and switches its link text.
        print("function toggle(divid) {\nvar item=document.getElementById(divid); if (item) { item.style.display=(item.style.display=='none')?'block':'none'; }\n"
              "var itemlink=document.getElementById(divid+'link'); if (itemlink) { itemlink.text=(itemlink.text=='open')?'sluit':'open'; }}", file=outfile)
        print(f"</script>\n</head>\n<body><p>Generated at {date}</p>", file=outfile)
        print("<p>Versies: <a href=\"index-20210315.html\">15 maart 2021</a> | <a href=\"index-20210420.html\">20 april 2021</a> | <a href=\"index-20210621.html\">21 juni 2021</a> | <a href=\"index-20210712.html\">12 juli 2021</a></p>", file=outfile)
        print("<p>Kleurcodes: <font style=\"background-color: lightgreen;\">Nieuw</font> | <font style=\"background-color: yellow;\">Veranderd</font> | <font style=\"background-color: pink;\">Verwijderd</font></p>", file=outfile)
        print(explanation, file=outfile)
        print(f"<h2>{title}</h2>", file=outfile)
        json2html(paragraphs, outfile, top=True)
        print("<p>Klaar met laden</p></body>\n</html>", file=outfile)

Files of interest:
* paragraphs-20210315.json
* paragraphs-20210429.json
* paragraphs-20210622.json
* paragraphs-20210712.json

Note that `text_ranking.ipynb` stores its output in the file `paragraphs.json` (without the date in the name).

In [13]:
# Names of the files with the old and the new analysis.
JSON_FILE_OLD = "paragraphs-20210621.json"
JSON_FILE_NEW = "paragraphs-20210712.json"

paragraphs_old = read_json_file(JSON_FILE_OLD)
paragraphs_new = read_json_file(JSON_FILE_NEW)
# Add count-free aliases for keys like "(<n>) name", so that entries of the two
# analyses can be matched by name.
paragraphs_old = remove_counts(paragraphs_old)
paragraphs_new = remove_counts(paragraphs_new)
# Store a "status" ("nieuw", "veranderd" or "verwijderd") in the new analysis.
compare(paragraphs_new, paragraphs_old)
# Remove the count-free aliases again; the prefixed keys are kept.
paragraphs_new = undo_remove_counts(paragraphs_new)

In [14]:
# Write the new analysis, including the status marks, to the HTML index page.
write_to_index_file(paragraphs_new)

## 2. Divide documents into classes

Classes are defined in an input file.

In [15]:
import pandas as pd

In [16]:
def summarize_document_name(document_name):
    """Strip a leading "(<digits>) " count and a trailing ".html" from a name.

    Args:
        document_name: name to summarize.

    Returns:
        The name without count prefix and without ".html" suffix.
    """
    without_count = re.sub(r"^\(\d+\) ", "", document_name)
    return re.sub(r"\.html$", "", without_count)

In [17]:
def remove_counts_from_document_name(document_name):
    """Return the name without a leading "(<digits>) " count prefix."""
    return re.sub(r"^\(\d+\) ", "", document_name)

In [18]:
def remove_periods_from_numbers(number):
    """Remove all periods (thousands separators) from a number string.

    Args:
        number: string such as "1.234.567".

    Returns:
        The string without periods, e.g. "1234567".
    """
    # A literal replacement needs no regular expression (the former pattern
    # "\." relied on an invalid string escape).
    return number.replace(".", "")

In [19]:
def make_large_number_readable(number_in):
    """Format a number with periods as thousands separators.

    Periods already present in the input are removed first, so formatted
    input is accepted as well.

    Args:
        number_in: number, or string holding a number.

    Returns:
        String with a period between each group of three digits, counted from
        the right, e.g. "1.234.567".
    """
    digits = str(number_in).replace(".", "")
    # A leading sign is kept apart, so that it is not counted as a digit
    # (otherwise -123456 would become "-.123.456").
    sign = ""
    if digits[:1] in ("-", "+"):
        sign, digits = digits[0], digits[1:]
    # The first group holds the digits left over after splitting in threes.
    head_length = len(digits) % 3
    groups = [digits[:head_length]] if head_length else []
    groups.extend(digits[start:start + 3] for start in range(head_length, len(digits), 3))
    return sign + ".".join(groups)

In [20]:
# Replace keys like "(<n>) name" by their count-free form "name".
paragraphs_new = remove_counts(paragraphs_new, delete_flag=True)
# Table with the class of each document; the columns "document" and "indeling"
# are used in the cells below.
classes_from_file = pd.read_csv("csv/210519 indeling documenten richtlijnendatabse FMS.csv")

In [21]:
# Show the number of documents per class.
classes_from_file["indeling"].value_counts()

behandeling         5618
diagnostiek         2460
nazorg               469
preventie            263
screening            175
palliatieve zorg     163
revalidatie          146
Name: indeling, dtype: int64

In [22]:
# Lookup table: classes[measure][document] is the class ("indeling") of a
# document. The "document" column holds "<measure>/<document path>"; the text
# before the first slash is the measure, the remainder the document.
classes = {}
for _, row in classes_from_file.iterrows():
    measure, _separator, document = row["document"].partition("/")
    classes.setdefault(measure, {})[document] = row["indeling"]

In [23]:
# Build the group "stappen": per class (step) the summed counts of its
# documents, the documents themselves grouped by measure, and an exemplary
# paragraph. Measures and documents without a class are reported and skipped.
steps = {}
paragraphs_new["stappen"] = steps
for treatment, treatment_data in paragraphs_new["richtlijnen"].items():
    summarized_treatment = summarize_document_name(treatment)
    if summarized_treatment not in classes:
        print("treatment not found in classes:", summarized_treatment)
        continue
    for document, document_data in treatment_data["documenten"].items():
        summarized_document = summarize_document_name(document)
        if summarized_document not in classes[summarized_treatment]:
            print("document not found in classes:", summarized_treatment, summarized_document)
            continue
        step = classes[summarized_treatment][summarized_document]
        if step not in steps:
            steps[step] = {
                "aantal gevonden sleuteltermen": 0,
                "aantal tokens": 0,
                "aantal types": 0,
                "exemplarische paragraaf": {
                    "aantal gevonden sleuteltermen": 0,
                },
                "richtlijnen": {},
                "sleuteltermen": {},
            }
        step_data = steps[step]
        # The document counts are strings that may contain periods.
        for count_name in ("aantal gevonden sleuteltermen", "aantal tokens", "aantal types"):
            step_data[count_name] += int(re.sub(r'\.', "", document_data[count_name]))
        if treatment not in step_data["richtlijnen"]:
            step_data["richtlijnen"][treatment] = { "documenten": {} }
        document_copy = dict(document_data)
        step_data["richtlijnen"][treatment]["documenten"][document] = document_copy
        # Only the first paragraph of the document is a candidate for the
        # exemplary paragraph of the step.
        first_paragraph = document_copy["paragrafen"][list(document_copy["paragrafen"])[0]]
        if int(first_paragraph["aantal gevonden sleuteltermen"]) > \
           int(step_data["exemplarische paragraaf"]["aantal gevonden sleuteltermen"]):
            step_data["exemplarische paragraaf"] = dict(first_paragraph)
# Order the steps by decreasing number of found keywords.
paragraphs_new["stappen"] = dict(sorted(steps.items(),
                                        key=lambda item: item[1]["aantal gevonden sleuteltermen"],
                                        reverse=True))

document not found in classes: behandeling_van_kinderen_met_obesitas index
document not found in classes: dementie behandeling_dementie/psychosociale_non-farmacologische_interventies/domotica_en_e-health_bij_dementie
document not found in classes: dementie besluitvorming_bij_dementie/einde_van_leven
document not found in classes: dementie besluitvorming_bij_dementie/rijgeschiktheid_bij_dementie
document not found in classes: dementie besluitvorming_bij_dementie/wilsbekwaamheid_medische_besluitvorming
document not found in classes: dementie index
document not found in classes: dementie organisatie_van_zorg_bij_dementie/de_zorgstandaard_dementie
document not found in classes: dementie organisatie_van_zorg_bij_dementie/regionale_samenwerking_dementiezorg
document not found in classes: dementie organisatie_van_zorg_bij_dementie/scholing_verzorgend_personeel_bij_dementie
document not found in classes: dementie startpagina_-_dementie
document not found in classes: chronische_nierschade_cns i

In [24]:
# Fill the "sleuteltermen" group of each step: per keyword the summed counts
# and the documents in which the keyword was found, grouped by measure.
# Measures and documents without a class are reported once and skipped.
mentioned = {}
for keyword, keyword_data in paragraphs_new["sleuteltermen"].items():
    summarized_keyword = summarize_document_name(keyword)
    for treatment, treatment_data in keyword_data["richtlijnen"].items():
        summarized_treatment = summarize_document_name(treatment)
        if summarized_treatment not in classes:
            if summarized_treatment not in mentioned:
                print("treatment not found in classes:", summarized_treatment)
                mentioned[summarized_treatment] = True
            continue
        for document in treatment_data["documenten"]:
            summarized_document = summarize_document_name(document)
            if summarized_document not in classes[summarized_treatment]:
                key = f"{summarized_treatment} {summarized_document}"
                if key not in mentioned:
                    print("document not found in classes:", summarized_treatment, summarized_document)
                    mentioned[key] = True
                continue
            step = classes[summarized_treatment][summarized_document]
            target = paragraphs_new["stappen"][step]["sleuteltermen"]
            if keyword not in target:
                target[keyword] = { "aantal gevonden sleuteltermen": 0,
                                    "exemplarische paragraaf": {},
                                    "richtlijnen": {}
                                  }
            keyword_in_step = target[keyword]
            # The document data is taken from the "richtlijnen" group.
            # NOTE(review): the count added here is the one stored for the
            # document under "richtlijnen", not one read from the keyword's own
            # entry - confirm that this is intended.
            plain_document = remove_counts_from_document_name(document)
            source_document = paragraphs_new["richtlijnen"][summarized_treatment]["documenten"][plain_document]
            keyword_in_step["aantal gevonden sleuteltermen"] += \
                int(re.sub(r'\.', "", source_document["aantal gevonden sleuteltermen"]))
            if treatment not in keyword_in_step["richtlijnen"]:
                keyword_in_step["richtlijnen"][treatment] = { "documenten": {} }
            keyword_in_step["richtlijnen"][treatment]["documenten"][plain_document] = dict(source_document)

document not found in classes: behandeling_van_kinderen_met_obesitas index
document not found in classes: dementie behandeling_dementie/psychosociale_non-farmacologische_interventies/domotica_en_e-health_bij_dementie
document not found in classes: dementie besluitvorming_bij_dementie/einde_van_leven
document not found in classes: dementie besluitvorming_bij_dementie/rijgeschiktheid_bij_dementie
document not found in classes: dementie besluitvorming_bij_dementie/wilsbekwaamheid_medische_besluitvorming
document not found in classes: dementie index
document not found in classes: dementie organisatie_van_zorg_bij_dementie/de_zorgstandaard_dementie
document not found in classes: dementie organisatie_van_zorg_bij_dementie/regionale_samenwerking_dementiezorg
document not found in classes: dementie organisatie_van_zorg_bij_dementie/scholing_verzorgend_personeel_bij_dementie
document not found in classes: dementie startpagina_-_dementie
document not found in classes: ziekte_van_parkinson index


In [27]:
# Convert the summed counts of each step to strings with periods as thousands
# separators.
for step, step_data in paragraphs_new["stappen"].items():
    for count_name in ("aantal gevonden sleuteltermen", "aantal tokens", "aantal types"):
        step_data[count_name] = make_large_number_readable(step_data[count_name])

In [28]:
# Rewrite the HTML index page, now including the group "stappen".
write_to_index_file(paragraphs_new)

In [30]:
# Show the number of steps (classes) for which documents were found.
len(paragraphs_new["stappen"])

7

## 3. Remove duplicate paragraphs from counts

In [31]:
def reorder_measures(paragraphs_new):
    """Sort the measures by decreasing number of found keywords.

    The group "richtlijnen" is replaced by a new dictionary with the same
    entries; entries with equal counts keep their relative order.

    Args:
        paragraphs_new: analysis data; modified in place.
    """
    def keyword_count(item):
        count = item[1]["aantal gevonden sleuteltermen"]
        # Counts may be strings with periods as thousands separators, like the
        # keyword counts handled by reorder_keywords().
        if isinstance(count, str):
            count = count.replace(".", "")
        return int(count)

    paragraphs_new["richtlijnen"] = dict(
        sorted(paragraphs_new["richtlijnen"].items(), key=keyword_count, reverse=True))

In [32]:
def reorder_measure_duplicates(paragraphs_new):
    """Turn the duplicate counts of each measure into a ranked list.

    The dictionary text -> count stored under "niet-meegerekende
    duplicaatsleuteltermen" is replaced by a dictionary rank -> { count,
    text }, where rank 1 is the text with the highest count.

    Args:
        paragraphs_new: analysis data; modified in place.
    """
    duplicates_name = "niet-meegerekende duplicaatsleuteltermen"
    for measure_data in paragraphs_new["richtlijnen"].values():
        ranked = sorted(measure_data[duplicates_name].items(),
                        key=lambda item: item[1],
                        reverse=True)
        measure_data[duplicates_name] = {
            rank: {"aantal duplicaatparagrafen": count, "text": text}
            for rank, (text, count) in enumerate(ranked, start=1)
        }

In [33]:
def remove_duplicates_from_counts_measures(paragraphs_new):
    """Exclude repeated paragraphs from the keyword counts of the measures.

    Within each measure, paragraphs are compared on their text without the
    first space-delimited token. For every repeated paragraph its keyword
    count is subtracted from the count of the measure. The number of repeated
    paragraphs and the number of repeats per text are stored in the measure.
    Finally the measures and their duplicates are reordered.

    Args:
        paragraphs_new: analysis data; modified in place.
    """
    count_name = "aantal gevonden sleuteltermen"

    def plain_int(count):
        # Counts may be strings with periods as thousands separators, as in
        # remove_duplicates_from_counts_keywords().
        if isinstance(count, str):
            count = count.replace(".", "")
        return int(count)

    for measure_data in paragraphs_new["richtlijnen"].values():
        seen = set()
        duplicates = {}
        measure_data["aantal niet-meegerekende duplicaatsleuteltermen"] = 0
        measure_data["niet-meegerekende duplicaatsleuteltermen"] = duplicates
        for document_data in measure_data["documenten"].values():
            for paragraph_data in document_data["paragrafen"].values():
                # Drop everything up to and including the first space before
                # comparing the texts.
                text = re.sub("^[^ ]* ", "", paragraph_data["paragraaf"])
                if text in seen:
                    measure_data[count_name] = str(
                        plain_int(measure_data[count_name]) - int(paragraph_data[count_name]))
                    measure_data["aantal niet-meegerekende duplicaatsleuteltermen"] += 1
                    duplicates[text] = duplicates.get(text, 0) + 1
                seen.add(text)
    reorder_measures(paragraphs_new)
    reorder_measure_duplicates(paragraphs_new)

In [34]:
def reorder_keywords(paragraphs_new):
    """Sort the keywords by decreasing number of found keywords.

    The group "sleuteltermen" is replaced by a new dictionary with the same
    entries; entries with equal counts keep their relative order.

    Args:
        paragraphs_new: analysis data; modified in place.
    """
    def keyword_count(item):
        count = item[1]["aantal gevonden sleuteltermen"]
        # Counts are strings that may hold periods as thousands separators;
        # a literal replacement needs no regular expression (the former
        # pattern "\." relied on an invalid string escape).
        if isinstance(count, str):
            count = count.replace(".", "")
        return int(count)

    paragraphs_new["sleuteltermen"] = dict(
        sorted(paragraphs_new["sleuteltermen"].items(), key=keyword_count, reverse=True))

In [35]:
def reorder_keyword_duplicates(paragraphs_new):
    """Turn the duplicate counts of each keyword into a ranked list.

    The dictionary text -> count stored under "niet-meegerekende
    duplicaatsleuteltermen" is replaced by a dictionary rank -> { count,
    text }, where rank 1 is the text with the highest count.

    Args:
        paragraphs_new: analysis data; modified in place.
    """
    duplicates_name = "niet-meegerekende duplicaatsleuteltermen"
    for keyword_data in paragraphs_new["sleuteltermen"].values():
        ranked = sorted(keyword_data[duplicates_name].items(),
                        key=lambda item: item[1],
                        reverse=True)
        keyword_data[duplicates_name] = {
            rank: {"aantal duplicaatparagrafen": count, "text": text}
            for rank, (text, count) in enumerate(ranked, start=1)
        }

In [36]:
def remove_duplicates_from_counts_keywords(paragraphs_new):
    """Exclude repeated paragraphs from the counts of the keywords.

    Within each keyword, paragraphs are compared on their text without the
    first space-delimited token. For every repeated paragraph its keyword
    count is subtracted from the count of the keyword. The number of repeated
    paragraphs and the number of repeats per text are stored in the keyword.
    Finally the keywords and their duplicates are reordered.

    Args:
        paragraphs_new: analysis data; modified in place.
    """
    count_name = "aantal gevonden sleuteltermen"
    for keyword_data in paragraphs_new["sleuteltermen"].values():
        seen = set()
        duplicates = {}
        keyword_data["aantal niet-meegerekende duplicaatsleuteltermen"] = 0
        keyword_data["niet-meegerekende duplicaatsleuteltermen"] = duplicates
        for measure_data in keyword_data["richtlijnen"].values():
            for document_data in measure_data["documenten"].values():
                for paragraph_data in document_data["paragrafen"].values():
                    # Drop everything up to and including the first space
                    # before comparing the texts.
                    text = re.sub("^[^ ]* ", "", paragraph_data["paragraaf"])
                    if text in seen:
                        # Raw string: "\." is a regex escape, not a string
                        # escape. The keyword count may contain periods.
                        keyword_data[count_name] = str(
                            int(re.sub(r"\.", "", keyword_data[count_name])) -
                            int(paragraph_data[count_name]))
                        keyword_data["aantal niet-meegerekende duplicaatsleuteltermen"] += 1
                        duplicates[text] = duplicates.get(text, 0) + 1
                    seen.add(text)
    reorder_keywords(paragraphs_new)
    reorder_keyword_duplicates(paragraphs_new)

In [37]:
# Subtract the counts of repeated paragraphs, first per measure, then per keyword.
remove_duplicates_from_counts_measures(paragraphs_new)
remove_duplicates_from_counts_keywords(paragraphs_new)

In [38]:
# Rewrite the HTML index page with the counts without duplicates.
write_to_index_file(paragraphs_new)

In [39]:
# Reload the old analysis and compare it with the adjusted new analysis, so
# that the status marks are stored again, then rewrite the HTML index page.
paragraphs_old = read_json_file(JSON_FILE_OLD)
paragraphs_old = remove_counts(paragraphs_old)
compare(paragraphs_new, paragraphs_old)
write_to_index_file(paragraphs_new)

## 4. Add keyword counts to measure and keyword headings

In [40]:
def add_keyword_counts_to_headings(paragraphs_new):
    """Return a copy of the analysis with counts in the headings.

    In the groups "richtlijnen", "sleuteltermen", "stappen" and
    "exemplarische paragrafen" every key "name" becomes "(<count>) name",
    where <count> is the formatted number of found keywords of the entry.

    Args:
        paragraphs_new: analysis data; it is not modified.

    Returns:
        A new dictionary; the entries of the four groups are shallow copies.
    """
    paragraphs_with_counts = dict(paragraphs_new)
    for group_name in ("richtlijnen", "sleuteltermen", "stappen", "exemplarische paragrafen"):
        paragraphs_with_counts[group_name] = {
            f'({make_large_number_readable(item["aantal gevonden sleuteltermen"])}) {name}': dict(item)
            for name, item in paragraphs_new[group_name].items()
        }
    return paragraphs_with_counts

In [41]:
# Make a copy of the analysis in which the headings start with "(<count>) ".
paragraphs_with_counts = add_keyword_counts_to_headings(paragraphs_new)

In [42]:
# Write the final HTML index page, with counts in the headings.
write_to_index_file(paragraphs_with_counts)