In [None]:
import sys
import json
# sys.path.insert(0,'/groups/dso/meijie/melanoma_data/Code_Bahadir/Wispermed/extraction_funs')
# sys.path.insert(0,'/groups/dso/meijie/melanoma_data/fhir_data/Top2000Melanomexport-2022-04-30T10:49:34')
# sys.path.insert(0,'/groups/dso/meijie/melanoma_data/fhir_data')
sys.path.insert(0,'extraction_funs')

from medication_extractor import *
from tnm_extractors import *
from extraction_baseclass import extractor, tryPath
from group_and_merge import groupResources, combineResources
from procedure_extractors import *
from os import listdir
from itertools import chain

# Directory containing the patient JSON files. File paths are built below by plain
# string concatenation (folder_jsons + file name), so the trailing slash is required.
folder_jsons = "data_new/"

In [None]:
# Names of the JSON files in the data folder.
# - Match on the real '.json' extension instead of the last four characters, so a
#   name that merely ends in "json" (without the dot) is not picked up.
# - listdir() returns entries in arbitrary order; sorting makes the scan order (and
#   every list derived from it) reproducible between runs.
jfiles = sorted(x for x in listdir(folder_jsons) if x.endswith('.json'))

In [None]:
# Scan every patient file and record which files contain which kind of information:
# stage III *and* stage IV entries, medication, radiotherapy, surgeries, examinations,
# progress records and primary-tumour properties.

ptnm_extractor = getExtractorPTNM()
ctnm_extractor = getExtractorCTNM()
medi_extractor = getMedicationAdministration()
radiotherapy_ext = getRadioTherapy()
surgeries_ext = getOperation()
examinations_ext = getExaminations()
progress_extractor = getExtractorProgress()
tproperties_extractor = getExtractorPropertiesPrimary()

jsons_with_s3_and_s4 = []
jsons_with_medication = []
jsons_with_radiotherapy = []
jsons_with_examinations = []
jsons_with_progress = []
jsons_with_surgery = []
jsons_with_tproperties = []

# (extractor, result list) pairs: a file name is appended to the list when the
# extractor returns at least one record for that file.
presence_checks = [
    (medi_extractor, jsons_with_medication),
    (radiotherapy_ext, jsons_with_radiotherapy),
    (surgeries_ext, jsons_with_surgery),
    (examinations_ext, jsons_with_examinations),
    (progress_extractor, jsons_with_progress),
    (tproperties_extractor, jsons_with_tproperties),
]

for jfile in jfiles:
    # The context manager closes the file handle right after parsing; the previous
    # json.load(open(...)) left one open handle per file. The encoding is given
    # explicitly so parsing does not depend on the platform's default locale.
    with open(folder_jsons + jfile, encoding='utf-8') as bundle_file:
        resources = json.load(bundle_file)['entry']

    # Stage labels from both the pathological and the clinical TNM extractor;
    # records without a 'tnm_stage' value count as 'NA'.
    stages = ptnm_extractor.extract(resources) + ctnm_extractor.extract(resources)
    stages = [x.get('tnm_stage','NA') for x in stages]
    has3 = any(x.startswith("III") for x in stages)
    has4 = any(x.startswith("IV") for x in stages)
    if has3 and has4:
        jsons_with_s3_and_s4.append(jfile)

    for info_extractor, files_with_info in presence_checks:
        if len(info_extractor.extract(resources)) > 0:
            files_with_info.append(jfile)
print(len(jsons_with_s3_and_s4))

In [None]:
# Is there a patient file that appears in every one of the lists collected above?
# Start from the stage III+IV files and narrow the candidate set one list at a time,
# printing how many candidates are left after each step.
other_infos = [jsons_with_medication, jsons_with_radiotherapy, jsons_with_surgery,
               jsons_with_examinations, jsons_with_progress, jsons_with_tproperties]

jsons_with_all_info = set(jsons_with_s3_and_s4)
for with_info in other_infos:
    jsons_with_all_info &= set(with_info)
    print(len(jsons_with_all_info))

# Sorted list, so that positional indexing into the result is stable.
jsons_with_all_info = sorted(jsons_with_all_info)

In [None]:
# Some functions to make the output beautiful

# -> put in a resource with stage information -> this function outputs a string easy to read for doctors
# -> will be used later
def printStage(x):
    """Format one staging record as a single human-readable line.

    Parameters
    ----------
    x : dict
        Staging record. All keys are optional: 'tstage', 'nstage', 'mstage',
        'tnm_stage', 'cat_version', 'p_or_c', 'residual_state',
        'snodes_postive' / 'snodes_examined' (sentinel nodes) and
        'rnodes_positive' / 'rnodes_examined' (regional nodes).

    Returns
    -------
    str
        "Version: <ver>, <p|c><stage> T<t>N<n>M<m><residual>", followed by the
        sentinel and regional lymph node counts when a positive count is present.
        Missing T/N/M values are shown as ' k.A. ' and a missing version as 'k.A.'.
    """
    t = x.get('tstage', ' k.A. ')
    n = x.get('nstage', ' k.A. ')
    m = x.get('mstage', ' k.A. ')
    tnm = x.get('tnm_stage', '')
    ver = x.get('cat_version')
    if ver is None:
        ver = "k.A."
    # A missing/None 'p_or_c' used to be rendered as the literal text "None".
    porc = x.get('p_or_c') or ''
    r = x.get('residual_state', '')
    # 'snodes_postive' is the key this notebook has always read.
    # NOTE(review): it looks like a misspelling (compare 'rnodes_positive') - confirm
    # which key the extractor really emits. Until then the correctly spelled key is
    # accepted as a fallback so the sentinel count is not silently dropped.
    senPos = x.get('snodes_postive', x.get('snodes_positive', ""))
    senTest = x.get('snodes_examined', "?")
    renPos = x.get('rnodes_positive', "")
    renTest = x.get('rnodes_examined', "?")

    res = f"Version: {ver}, {porc}{tnm} T{t}N{n}M{m}{r}"
    # Compare against "" (not truthiness) so that a count of 0 is still printed.
    if senPos != "":
        res += f", Sentinel: {senPos}+/{senTest}"
    if renPos != "":
        res += f", Regional Lymphnodes: {renPos}+/{renTest}"
    return res

In [None]:
# adds a field to a dictionary and returns the dictionary
def addField(x, addfun, fieldName):
    """Store ``addfun(x)`` under ``fieldName`` in ``x`` and hand ``x`` back.

    The record is modified in place; the same object is returned so the call can be
    used inside a comprehension.
    """
    computed = addfun(x)
    x[fieldName] = computed
    return x

In [None]:
# Pick one patient that has every kind of information and load that file.
jfile = jsons_with_all_info[1] # looks very rich

# Use a context manager so the file handle is closed after parsing (the previous
# json.load(open(...)) never closed it), and read with an explicit encoding.
with open(folder_jsons + jfile, encoding='utf-8') as bundle_file:
    resources = json.load(bundle_file)['entry']

# Tag every staging record with its origin extractor - "p" for records from
# ptnm_extractor, "c" for records from ctnm_extractor - then order them by record date.
stages = [addField(x, lambda x: "p","p_or_c") for x in ptnm_extractor.extract(resources)]
stages += [addField(x, lambda x: "c","p_or_c") for x in ctnm_extractor.extract(resources)]
stages.sort(key = lambda x: x['dt_record'])
stages

In [None]:
# Group the staging records by their 'dt_record' value.
# NOTE: `stages` is overwritten with the grouped result, so this cell is not
# idempotent - re-running it would pass already-grouped data to groupResources.
# Re-run the previous cell first if this one has to be repeated.
stages = groupResources(stages, "dt_record")
stages # for each date we want to select one

In [None]:
# we only want to keep one resource per date
def keepHighestPriority(resources, field, priorities):
    """Keep only the resources whose ``field`` value has the best priority.

    Parameters
    ----------
    resources : list of dict
        Candidate records (here: all records sharing one date).
    field : str
        Key whose value decides the priority. A record without this key is treated
        as having the value ``None``, so ``None`` can be listed in ``priorities`` to
        rank "not specified".
    priorities : list
        Allowed values of ``field``, ordered from most to least preferred.

    Returns
    -------
    list of dict
        Every record whose value equals the most preferred value that occurs in
        ``resources``, in the original order. An empty input yields an empty list.

    Raises
    ------
    ValueError
        If a record's value is not contained in ``priorities``.
    """
    # Nothing to choose from; the previous implementation raised IndexError here.
    if not resources:
        return []
    # .get() instead of [] so a record lacking the key counts as None rather than
    # raising KeyError.
    values = [resource.get(field) for resource in resources]
    best_rank = min(priorities.index(value) for value in values)
    best_value = priorities[best_rank]
    return [resource for resource, value in zip(resources, values) if value == best_value]

# Within each date group, first prefer records tagged 'p' over records tagged 'c',
# then - among the remaining ones - prefer 'AJCC2017' over 'AJCC2009' over None.
# The order of the two passes matters: the version filter only sees what the
# p/c filter kept.
stages = [keepHighestPriority(x, 'p_or_c', ['p','c']) for x in stages]
stages = [keepHighestPriority(x, 'cat_version', ['AJCC2017','AJCC2009',None]) for x in stages]
stages

In [None]:
# After checking the remaining records of each group by eye, they can safely be
# combined: every group is passed to combineResources and replaced by its result
# (presumably one merged record per date - confirm in group_and_merge).
stages = [combineResources(x) for x in stages]
stages

In [None]:
# Give every stage a printable summary ('val_print') and an end date ('dt_end'):
# a stage ends on the record date of the stage that follows it.
stages = [addField(stage, printStage, "val_print") for stage in stages]
for current, following in zip(stages, stages[1:]):
    current['dt_end'] = following['dt_record']
# The last stage has no successor; it is closed with a fixed date.
stages[-1]['dt_end'] = "2019-06-22" # the patient died on 2019-06-21
stages

In [None]:
# Examinations: extract, group by record date, combine each group.
examinations = [
    combineResources(group)
    for group in groupResources(examinations_ext.extract(resources), "dt_record")
]
examinations

In [None]:
# Radiotherapy: extract, group by start date, combine each group.
radio_therapy = radiotherapy_ext.extract(resources)
radio_therapy = groupResources(radio_therapy, "dt_start")
radio_therapy = [combineResources(group) for group in radio_therapy] # deletes one duplicate
radio_therapy

In [None]:
# Progress records: extract, group by record date, combine each group.
progresses = progress_extractor.extract(resources)
progresses = groupResources(progresses, "dt_record")
progresses = [combineResources(group) for group in progresses]
progresses

In [None]:
# Medication administrations, ordered by start date.
medis = medi_extractor.extract(resources)
medis.sort(key=lambda medi: medi['dt_start'])

# Per the original note, the extracted drug type is "unknown", which is not useful
# for the demonstration. The first two entries are therefore relabelled by hand.
for position, drugtype in enumerate(("Nivo+Ipi", "Pembrolizumab")):
    medis[position]['cat_drugtype'] = drugtype
medis

In [None]:
# Primary-tumour properties: extract, group by record date, combine each group.
properties_prim = tproperties_extractor.extract(resources)
properties_prim = groupResources(properties_prim, "dt_record")
properties_prim = [combineResources(group) for group in properties_prim]
properties_prim

In [None]:
# Oncogene results, restricted to the four genes that should be the most relevant.
relevant_genes = ('TERT Ergebnis', 'BRAF Ergebnis', 'NRAS Ergebnis', 'NF1 Ergebnis')
oncogenes = [
    record
    for record in getExtractorOncogenes().extract(resources)
    if record['cat_gene'] in relevant_genes
]

# make outcome nice
def printGene(x):
    """Format one oncogene record as '<gene> mutation' or '<gene> wildtype'.

    Parameters
    ----------
    x : dict
        Must contain 'cat_gene' (gene label, e.g. 'BRAF Ergebnis') and
        'flg_mutated' (truthy when a mutation was found). 'num_frequency' is
        optional and only used for mutated genes.

    Returns
    -------
    str
        The gene label without its trailing 'Ergebnis', followed by 'mutation' or
        'wildtype'. For mutations with a frequency greater than zero,
        ' (Frequency <value>)' is appended.
    """
    suffix = 'Ergebnis'
    gene = x['cat_gene']
    # Strip the suffix only when it is really there; the previous blind [:-8]
    # slice chopped eight characters off any label, suffix or not.
    if gene.endswith(suffix):
        gene = gene[:-len(suffix)]
    gene = gene.rstrip()

    if not x['flg_mutated']:
        return gene + ' wildtype'

    res = gene + ' mutation'
    freq = x.get('num_frequency')
    # The key may be absent or explicitly None; comparing None > 0 raises TypeError.
    if freq is not None and freq > 0:
        res += " (Frequency " + str(freq) + ")"
    return res
        
# Attach the printable label to every record first, then remove the raw fields it was
# built from, and finally group by record date and combine each group.
oncogenes = [addField(record, printGene, 'val_print') for record in oncogenes]
for onco in oncogenes:
    for raw_key in ('cat_gene', 'flg_mutated'):
        onco.pop(raw_key)            # these two keys must be present
    onco.pop('num_frequency', None)  # optional key
oncogenes = [combineResources(group) for group in groupResources(oncogenes, 'dt_record')]
oncogenes

In [None]:
# Take the first entry whose resource type is 'Patient' and remove the 'meta',
# 'managingOrganization' and 'identifier' fields before it goes into the export.
# NOTE(review): other fields of the Patient resource may still identify the person -
# check the displayed output before sharing the exported file.
patient_entries = [x for x in resources if x['resource']['resourceType'] == 'Patient']
if not patient_entries:
    # Fail with a clear message instead of an IndexError on [0].
    raise ValueError("the loaded file contains no Patient resource")
# A separate name for the list keeps `patientinfo` a dict throughout; the former
# no-op `patientinfo.keys()` statement (its result was never shown) is dropped.
patientinfo = patient_entries[0]['resource']
for dropped_key in ('meta', 'managingOrganization', 'identifier'):
    patientinfo.pop(dropped_key, None)
patientinfo

In [None]:
# Bundle everything prepared above into the structure that gets exported.
json4demonstration = dict(
    patient_info=patientinfo,
    stages=stages,
    examinations=examinations,
    radiotherapy=radio_therapy,
    progresses=progresses,
    medication=medis,
    properties_primary=properties_prim,
    oncogenes=oncogenes,
)

In [None]:
# Write the demonstration patient to disk as indented UTF-8 JSON; ensure_ascii=False
# keeps non-ASCII characters as they are instead of \u escapes.
with open('example_patient.json', mode='w', encoding='utf-8') as out_file:
    json.dump(json4demonstration, out_file, indent=4, ensure_ascii=False)