# Content analysis

In [1]:
from bs4 import BeautifulSoup
from colorama import Fore, Back, Style
import csv
import os
import pandas as pd
import re
import sys
import przona

dummy = csv.field_size_limit(sys.maxsize)

In [2]:
CSV_DIR = "csv/"
RECOMMENDATIONS_FILE = "recommendation_web_pages.csv"

web_pages = przona.read_dict(CSV_DIR+RECOMMENDATIONS_FILE, spy=True)

155051


## Text segmentation

In [53]:
def get_paragraphs(soup):
    paragraphs = []
    for paragraph in re.sub('\\\\r',' ',re.sub('\\\\t',' ',soup.text)).split('\\n'):
        paragraph = re.sub("\s+", " ", paragraph).strip()
        if paragraph != "":
            paragraphs.append(paragraph)
    return(paragraphs)

In [54]:
if os.path.isfile(CSV_DIR+"paragraphs.csv"):
    paragraphs = przona.read_dict(CSV_DIR+"paragraphs.csv")
else:
    paragraphs = {}
    counter = 0
    for url in web_pages.keys():
        counter += 1
        if counter % 100 == 0:
            squeal(counter)
        web_page_text = web_pages[url][0]
        web_page_text_with_spaces = re.sub(">", "> ", web_page_text)
        soup = BeautifulSoup(web_page_text_with_spaces)
        paragraphs[url] = get_paragraphs(soup)
    przona.save_dict(paragraphs, CSV_DIR+"paragraphs.csv")

## Keyword search

In [19]:
KEYWORD_FILE = "../data/beeldvormende_diagnostiek.txt"

In [4]:
paragraphs = przona.read_dict(CSV_DIR+"paragraphs.csv")

In [20]:
def read_keywords(filename=KEYWORD_FILE):
    infile = open(KEYWORD_FILE, "r")
    keywords = []
    for line in infile:
        keywords.append(line.strip().lower())
    infile.close()
    return(keywords)

In [None]:
keywords = read_keywords()

In [51]:
if os.path.isfile(CSV_DIR+"matches.csv"):
    matches = przona.read_dict(CSV_DIR+"matches.csv")
else:
    matches = {}
    counter = 0
    for url in paragraphs:
        text = " ".join(paragraphs[url])
        matches[url] = []
        for keyword in keywords:
            if re.search(r'\b'+keyword+r'\b', text, flags=re.IGNORECASE):
                matches[url].append(keyword)
        counter += 1
        if counter % 100 == 0:
            przona.squeal(counter)
    przona.squeal(counter)
    przona.save_dict(matches, CSV_DIR+"matches.csv")

In [52]:
counter = 0
for url in sorted(matches.keys(), key=lambda url:len(matches[url]), reverse=True):
    print(len(matches[url]), len((" ".join(paragraphs[url])).lower()), url)
    counter += 1
    if counter >= 20:
        break

21 48293 /richtlijn/meningeoom/diagnostiek/beeldvorming.html
19 102977 /richtlijn/diabetische_voet/diagnostiek_en_behandeling_charcot/diagnostiek_van_acute_charcot-voet.html
19 45354 /richtlijn/necrotiserende_wekedeleninfecties/beeldvormende_diagnostiek_nwdi.html
18 90994 /richtlijn/klinische_postmortem_radiologie/indicatiestelling_bij_foetussen_en_neonaten.html
17 43144 /richtlijn/fractuur-gerelateerde_infecties_fri_s/beeldvormend_onderzoek.html
17 40131 /richtlijn/veneuze_pathologie_varices/varices-diagnostiek_en_onderzoek/varices-overige_beeldvormende_technieken.html
17 52384 /richtlijn/primaire_tumor_onbekend/diagnostiek/beeldvormend_onderzoek.html
17 27431 /richtlijn/psa_bij_kinderen_op_locaties_buiten_de_ok/randvoorwaarden_toedienen_psa_bij_kinderen/procedures_voor_psa_bij_kinderen.html
16 106091 /richtlijn/borstkanker/screening/screening_binnen_het_bob/bob.html
16 108900 /richtlijn/borstkanker/diagnostiek/preoperatieve_stadiering/pet_ct.html
16 67006 /richtlijn/schildkliercarcin

In [49]:
keyword = keywords[0]
url = "/richtlijn/meningeoom/diagnostiek/beeldvorming.html"
for p in paragraphs[url]:
    matches_per_p = []
    for keyword in keywords:
        if re.search(keyword, p, flags=re.IGNORECASE):
            p = re.sub(r'(\b'+keyword+r'\b)', Style.BRIGHT+"\\1"+Style.RESET_ALL, p, flags=re.IGNORECASE)
            matches_per_p.append(keyword)
    if len(set(matches_per_p)) >= 5: 
        print(Fore.GREEN+Style.BRIGHT+str(len(set(matches_per_p))), str(matches_per_p)+Style.RESET_ALL, p)

[32m[1m6 ['beeld', 'ct', 'beeldvorming', 'mr', 'mri', 'scan'][0m Indien [1mbeeldvorming[0m van een (vermoedelijk) meningeoom door [1mMRI[0m bij de diagnostiek of follow-up niet haalbaar is, bijvoorbeeld doordat een patiënt niet lang plat kan liggen of bij contra-indicaties voor een [1mMRI[0m, is een [1mCT[0m-[1mscan[0m, bij voorkeur zonder en met jodiumhoudend contrast, een redelijk alternatief. Daar staat tegenover dat follow-up van een meningeoom met [1mCT[0m een herhaalde stralenbelasting betekent; dit in tegenstelling tot [1mMRI[0m.
[32m[1m9 ['beeld', 'computer', 'ct', 'tomografie', 'beeldvormende', 'mr', 'mri', 'radiologisch', 'scan'][0m Voor het vaststellen van de aanwezigheid van een ruimte-innemende intracraniële afwijking, de nadere differentiatie hiervan en/of de follow-up staan meerdere [1mbeeldvormende[0m modaliteiten ter beschikking. Als eerste modaliteit voor patiënten met acute symptomatologie wordt vaak gebruik gemaakt van een blanco [1mCT[0m-[1

## Analysis based on treatment phase

In [13]:
defined_phases = ["diagnostiek", "behandeling", "therapie", "preventie", "interventie", "onderzoek", "nazorg", "screening", "organisatie", "beleid", "pathologie", "nacontrole",
                  "voorlichting", "begeleiding", "revalidatie", "communicatie", "revalidatie", "indicatie", "zorg", "complicatie", "medicatie", "diagnose", "opvang", "follow-up"]

In [23]:
paragraphs = przona.read_dict(CSV_DIR+"paragraphs.csv")
keywords = read_keywords()

In [65]:
def check_urls_with_word(search_word):
    counter = 0
    for url in web_pages:
        if re.search(search_word, url):
            print(url)
            counter += 1
            if counter > 10: 
                break

In [64]:
check_urls_with_word("follow")

/richtlijn/pediatrisch_delier/gevolgen_en_follow-up_bij_pediatrisch_delier.html
/richtlijn/chirurgische_behandeling_van_obesitas/medische_nazorg_follow-up_na_chirurgische_behandeling_van_obesitas.html
/richtlijn/ongeruptureerd_intracranieel_aneurysma/follow-up_beeldvorming_bij_niet_preventief_behandelde_intracranieel_aneurysmata.html
/richtlijn/borstprothesechirurgie/radiologische_follow-up_bij_borstprothesechirurgie.html
/richtlijn/lymeziekte/evaluatie_van_een_patient_met_lymeziekte/follow-up_lymeziekte.html
/richtlijn/oesofaguscarcinoom/follow_up/postoperatieve_vitamine_b12_suppletie.html
/richtlijn/oesofaguscarcinoom/follow_up/nacontrole_en_nazorg.html
/gerelateerde_documenten/f/1748/Evidencetabel%20follow-up.pdf
/richtlijn/oesofaguscarcinoom/follow_up.html
/richtlijn/staphylococcus_aureus_bacteriemie/organisatie_van_zorg_s_aureus_bacteriemie/informatiestroom_en_follow-up_bij_s_aureus.html
/richtlijn/hersenmetastasen/diagnostiek_en_beeldvorming_-_hersenmetastasen/frequentie_neurolog

In [66]:
def find_interesting_phases(web_pages):
    phases = {}
    for url in web_pages:
        if re.search("^/richtlijn/", url):
            try:
                phase = url.split("/")[3]
                if phase != "":
                    for token in phase.split("_"):
                        if token in phases: 
                            phases[token] += 1
                        else:
                            phases[token] = 1
            except:
                pass
    return(phases)

In [67]:
phases = find_interesting_phases(web_pages)
list({phase:phases[phase] for phase in sorted(phases.keys(), key=lambda phase:phases[phase], reverse=True) if not phase in defined_phases}.items())[:10]

[('bij', 2871),
 ('van', 1177),
 ('en', 1123),
 ('-', 614),
 ('de', 276),
 ('acute', 243),
 ('kinderen', 221),
 ('interventies', 193),
 ('na', 193),
 ('met', 181)]

In [123]:
def add_totals(analysis, file_counts):
    total = {}
    total_files = {}
    for topic in analysis:
        total_row = 0
        total_row_files = 0
        for phase in analysis[topic]:
            if phase not in total: 
                total[phase] = 0
                total_files[phase] = 0
            total[phase] += analysis[topic][phase]
            total_files[phase] += file_counts[topic][phase]
            total_row += analysis[topic][phase]
            total_row_files += file_counts[topic][phase]
        analysis[topic]["totaal"] = total_row
        file_counts[topic]["totaal"] = total_row_files
    analysis["totaal"] = total
    file_counts["totaal"] = total_files
    analysis["totaal"]["totaal"] = sum(analysis["totaal"].values())
    file_counts["totaal"]["totaal"] = sum(file_counts["totaal"].values())
    return(analysis, file_counts)


def sort_dict(this_dict):
    return({topic:{ phase:this_dict[topic][phase] for phase in sorted(this_dict[topic].keys(), key=lambda phase:this_dict["totaal"][phase], reverse=True)}
            for topic in sorted(this_dict.keys(), key=lambda t:this_dict[t]["totaal"], reverse=True)})


def find_keywords_in_phases_per_recommendation(paragraphs):
    counter = 0
    analysis = {}
    file_counts = {}
    keywords_found = {}
    for url in paragraphs:
        if re.search("/richtlijn", url):
            text = " ".join(paragraphs[url])
            topic = url.split("/")[2]
            matches_page = None
            for phase in defined_phases:
                if re.search(phase, url):
                    if matches_page == None:
                        matches_page = []
                        for keyword in keywords:
                            if re.search(r'\b'+keyword+r'\b', text, flags=re.IGNORECASE):
                                matches_page.append(keyword)
                    if topic not in analysis:
                        analysis[topic] = {}
                        file_counts[topic] = {}
                        keywords_found[topic] = {}
                    if phase not in analysis[topic]:
                        analysis[topic][phase] = 0
                        file_counts[topic][phase] = 0
                        keywords_found[topic][phase] = set()
                    analysis[topic][phase] += len(matches_page)
                    file_counts[topic][phase] += 1
                    keywords_found[topic][phase] = keywords_found[topic][phase].union(set(matches_page))
        counter += 1
        if counter % 100 == 0:
            przona.squeal(counter)
    przona.squeal(counter)
    analysis, file_counts = add_totals(analysis, file_counts)
    analysis = sort_dict(analysis)
    file_counts = sort_dict(file_counts)
    keywords_found = { topic:{ phase:keywords_found[topic][phase] 
                               for phase in sorted(keywords_found[topic].keys()) }
                       for topic in sorted(keywords_found.keys()) }
    for topic in keywords_found:
        for phase in keywords_found[topic]:
            keywords_found[topic][phase] = " ".join(keywords_found[topic][phase])
    return(analysis, file_counts, keywords_found)


def find_missing_recommendations(web_pages, analysis):
    missing_recommendations = []
    for url in web_pages:
        if re.search("^/richtlijn/", url):
            try:
                recommendation = url.split("/")[2]
                if recommendation not in analysis and recommendation not in missing_recommendations and not re.search("(referentie|notitie).php|^(item|in-commentaar)$", recommendation):
                    missing_recommendations.append(recommendation)
            except:
                pass
    return(missing_recommendations)


def add_missing_recommendations(analysis, file_counts, missing_recommendations):
    for recommendation in missing_recommendations:
        analysis[recommendation] = {}
        file_counts[recommendation] = {}
    return(analysis, file_counts)

In [139]:
if not os.path.isfile(CSV_DIR+"analysis.csv"):
    analysis, file_counts, keywords_found = find_keywords_in_phases_per_recommendation(paragraphs)
    missing_recommendations = find_missing_recommendations(web_pages, analysis)
    analysis, file_counts = add_missing_recommendations(analysis, file_counts, missing_recommendations)
    pd.DataFrame(analysis).T.to_csv(CSV_DIR+"analysis.csv")
    pd.DataFrame(file_counts).T.to_csv(CSV_DIR+"file_counts.csv")
    pd.DataFrame(keywords_found).T.to_csv(CSV_DIR+"keywords_found.csv")
else:
    analysis = pd.DataFrame.to_dict(pd.read_csv(CSV_DIR+"analysis.csv", index_col=0).T)
    file_counts = pd.DataFrame.to_dict(pd.read_csv(CSV_DIR+"file_counts.csv", index_col=0).T)
    keywords_found = pd.DataFrame.to_dict(pd.read_csv(CSV_DIR+"keywords_found.csv", index_col=0).T)
len(analysis)

419

In [129]:
pd.DataFrame(analysis).T

Unnamed: 0,totaal,behandeling,diagnostiek,therapie,zorg,onderzoek,pathologie,beleid,revalidatie,preventie,...,nazorg,follow-up,nacontrole,diagnose,medicatie,voorlichting,communicatie,opvang,complicatie,begeleiding
totaal,31083.0,8147.0,5059.0,2864.0,2435.0,2223.0,1030.0,977.0,966.0,875.0,...,596.0,471.0,423.0,372.0,340.0,318.0,260.0,216.0,207.0,205.0
borstkanker,1861.0,265.0,237.0,375.0,208.0,63.0,127.0,26.0,,,...,146.0,,135.0,,11.0,,,,,
prostaatcarcinoom,996.0,199.0,134.0,188.0,77.0,243.0,,,,,...,77.0,,,11.0,,11.0,,,,11.0
acute_neurologie,965.0,273.0,162.0,13.0,8.0,143.0,,16.0,,30.0,...,,,,24.0,18.0,,,121.0,61.0,
colorectaal_carcinoom_crc,961.0,279.0,69.0,78.0,161.0,16.0,80.0,,,16.0,...,65.0,58.0,57.0,,,,26.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
adhd_bij_kinderen_en_jeugdigen_-_in_ontwikkeling,,,,,,,,,,,...,,,,,,,,,,
barrett-oesofagus,,,,,,,,,,,...,,,,,,,,,,
diphencyprone_dpcp_bij_alopecia_areata_aa,,,,,,,,,,,...,,,,,,,,,,
breast_reconstruction,,,,,,,,,,,...,,,,,,,,,,


In [130]:
pd.DataFrame(file_counts).T

Unnamed: 0,totaal,behandeling,diagnostiek,therapie,zorg,onderzoek,beleid,interventie,organisatie,revalidatie,...,screening,complicatie,diagnose,follow-up,voorlichting,nazorg,opvang,communicatie,begeleiding,nacontrole
totaal,8599.0,2403.0,1030.0,809.0,789.0,441.0,432.0,384.0,322.0,320.0,...,111.0,108.0,106.0,99.0,93.0,93.0,75.0,71.0,61.0,58.0
schizofrenie,173.0,45.0,13.0,21.0,22.0,4.0,1.0,39.0,7.0,0.0,...,2.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,14.0,0.0
borstkanker,169.0,25.0,19.0,35.0,20.0,5.0,2.0,0.0,5.0,0.0,...,18.0,0.0,0.0,0.0,0.0,14.0,0.0,0.0,0.0,13.0
urine-incontinentie_ui_2e-_en_3e-lijnszorg,146.0,55.0,8.0,5.0,69.0,1.0,0.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
antitrombotisch_beleid,135.0,9.0,0.0,14.0,0.0,0.0,83.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
adhd_bij_kinderen_en_jeugdigen_-_in_ontwikkeling,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
barrett-oesofagus,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
diphencyprone_dpcp_bij_alopecia_areata_aa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
breast_reconstruction,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Topic tokens in directory structure

In [217]:
def find_interesting_tokens(web_pages):
    tokens = {}
    counter = 0
    for url in web_pages:
        counter += 1
        if re.search("^/richtlijn/", url):
            parts = url.split("/")
            for level in range(0,len(parts)):
                part = parts[level]
                for token in re.split(r'_|-|\.', part):
                    if not level in tokens:
                        tokens[level] = {}
                    if not token in tokens[level]:
                        tokens[level][token] = 0
                    tokens[level][token] += 1
    totals = {}
    for level in tokens:
        for token in tokens[level]:
            if token not in totals:
                totals[token] = 0
            totals[token] += tokens[level][token]
    tokens[0] = totals
    for level in tokens:
        tokens[level] = { token:tokens[level][token] for token in sorted(tokens[level].keys(), key=lambda token:tokens[0][token], reverse=True)}
    for level in tokens:
        if "" in tokens[level]:
            del(tokens[level][""])
        to_be_deleted = []
        for token in tokens[level]:
            if re.search(r'=|\?', token):
                to_be_deleted.append(token)
        for token in to_be_deleted:
            del(tokens[level][token])
    return(tokens)

In [218]:
folder_structure = find_interesting_tokens(web_pages)

In [219]:
pd.DataFrame(folder_structure).fillna(0).to_csv(CSV_DIR+"folder_structure.csv")
pd.DataFrame(folder_structure)

Unnamed: 0,0,1,2,3,4,5,6
richtlijn,9342,9323.0,,19.0,,,
html,8750,,,3578.0,4080.0,1050.0,42.0
bij,6928,,1616.0,2871.0,1930.0,474.0,37.0
van,2676,,819.0,1177.0,567.0,111.0,2.0
behandeling,2658,,323.0,1587.0,643.0,103.0,2.0
...,...,...,...,...,...,...,...
ptp,1,,,,,,1.0
nhtr,1,,,,,,1.0
koortsreactie,1,,,,,,1.0
antihypertisum,1,,,,,,1.0
