In [1]:
import sys
import json


# Make the extraction helpers (and the data folders) importable.
# The original wrapped these inserts in try/except ImportError, but
# sys.path.insert() never raises ImportError, so the local fallback could
# never be reached.  The cluster paths are still always added (a missing
# directory on sys.path is harmless); the local 'extraction_funs' folder is
# added as well whenever the cluster checkout is not available.
import os

_cluster_paths = [
    '/groups/dso/meijie/melanoma_data/Code_Bahadir/patient-history/extraction_funs',
    '/groups/dso/meijie/melanoma_data/fhir_data/Top2000Melanomexport-2022-04-30T10:49:34',
    '/groups/dso/meijie/melanoma_data/fhir_data',
]
for _cluster_path in _cluster_paths:
    sys.path.insert(0, _cluster_path)
if not os.path.isdir(_cluster_paths[0]):
    sys.path.insert(0, 'extraction_funs')

from medication_extractor import *
from tnm_extractors import *
from extraction_baseclass import extractor, tryPath
from group_and_merge import groupResources, combineResources
from procedure_extractors import *
from condition_extractor import *
from careplan_extractor import *
from os import listdir
from itertools import chain

# Folder holding the FHIR JSON export files that are scanned below.
# The trailing slash is required: file names are appended to this string directly.
# For a local run, point this to a local folder instead, e.g. "data_new/".
folder_jsons = "/groups/dso/meijie/melanoma_data/fhir_data/Top2000Melanomexport-2022-04-30T10:49:34/"

In [2]:
# Collect the names of all entries in the export folder that end in 'json'
jfiles = [name for name in listdir(folder_jsons) if name.endswith('json')]

In [3]:
# Find patients with both a stage III and a stage IV record, and patients that
# have medication, radiotherapy, surgery, examination, progress and
# primary-tumor-property resources.

ptnm_extractor = getExtractorPTNM()
ctnm_extractor = getExtractorCTNM()
medi_extractor = getMedicationAdministration()
radiotherapy_ext = getRadioTherapy()
surgeries_ext = getOperation()
examinations_ext = getExaminations()
progress_extractor = getExtractorProgress()
tproperties_extractor = getExtractorPropertiesPrimary()

# One list of file names per criterion; they are intersected in the next cell.
jsons_with_s3_and_s4 = []
jsons_with_medication = []
jsons_with_radiotherapy = []
jsons_with_examinations = []
jsons_with_progress = []
jsons_with_surgery = []
jsons_with_tproperties = []
for jfile in jfiles:
    # Use a context manager so every file is closed right after parsing; the
    # original json.load(open(...)) left one open handle per patient file.
    # JSON is read as UTF-8 explicitly instead of relying on the locale default.
    with open(folder_jsons + jfile, encoding='utf-8') as json_file:
        resources = json.load(json_file)['entry']
    stages = ptnm_extractor.extract(resources) + ctnm_extractor.extract(resources)
    # A missing or empty stage is treated as 'NA' so that startswith() is safe.
    stages = [x.get('tnm_stage') or 'NA' for x in stages]
    has3 = any(x.startswith("III") for x in stages)
    has4 = any(x.startswith("IV") for x in stages)
    if has3 and has4:
        jsons_with_s3_and_s4.append(jfile)
    if len(medi_extractor.extract(resources)) > 0:
        jsons_with_medication.append(jfile)
    if len(radiotherapy_ext.extract(resources)) > 0:
        jsons_with_radiotherapy.append(jfile)
    if len(surgeries_ext.extract(resources)) > 0:
        jsons_with_surgery.append(jfile)
    if len(examinations_ext.extract(resources)) > 0:
        jsons_with_examinations.append(jfile)
    if len(progress_extractor.extract(resources)) > 0:
        jsons_with_progress.append(jfile)
    if len(tproperties_extractor.extract(resources)) > 0:
        jsons_with_tproperties.append(jfile)
print(len(jsons_with_s3_and_s4))

317


In [4]:
# Is there a patient file that satisfies every criterion collected above?
# Start from the stage III+IV files and narrow the set down one criterion at a
# time, printing how many files remain after each step.
other_infos = [jsons_with_medication, jsons_with_radiotherapy, jsons_with_surgery,
               jsons_with_examinations, jsons_with_progress, jsons_with_tproperties]

jsons_with_all_info = set(jsons_with_s3_and_s4)
for with_info in other_infos:
    jsons_with_all_info &= set(with_info)
    print(len(jsons_with_all_info))
# Sorted list, so that picking a file by index is stable across runs
jsons_with_all_info = sorted(jsons_with_all_info)

285
167
166
165
158
139


In [5]:
# Some functions to make the output beautiful

def printStage(x):
    """Format one stage record as a single, easy-to-read line.

    The result is meant for display to doctors and is stored later in the
    ``val_print`` field of each stage.

    Parameters
    ----------
    x : dict
        Stage record. All keys are optional: ``tstage``, ``nstage``,
        ``mstage``, ``tnm_stage``, ``cat_version``, ``p_or_c``,
        ``residual_state``, ``snodes_postive``, ``snodes_examined``,
        ``rnodes_positive``, ``rnodes_examined``.

    Returns
    -------
    str
        For example ``'Version: k.A., pIIIB T2bN1aM0, Sentinel: 1+/1'``.
        Missing T/N/M values and a missing version are shown as "k.A.".
    """
    t, n, m = x.get('tstage',' k.A. '), x.get('nstage',' k.A. '), x.get('mstage',' k.A. ')
    tnm = x.get('tnm_stage','')
    ver = x.get('cat_version')
    if ver is None:
        ver = "k.A."
    # A missing or None prefix must not show up as the text "None" in the output.
    porc = x.get('p_or_c') or ''
    r = x.get('residual_state','')
    # 'snodes_postive' (sic): this spelling is the key used in the extracted records.
    senPos = x.get('snodes_postive',"")
    senTest = x.get('snodes_examined',"?")
    renPos = x.get('rnodes_positive',"")
    renTest = x.get('rnodes_examined',"?")

    res = f"Version: {ver}, {porc}{tnm} T{t}N{n}M{m}{r}"
    # Lymph node counts are appended only when a positive count was recorded.
    if senPos != "":
        res += f", Sentinel: {senPos}+/{senTest}"
    if renPos != "":
        res += f", Regional Lymphnodes: {renPos}+/{renTest}"
    return res

In [6]:
def addField(x, addfun, fieldName):
    """Store ``addfun(x)`` under ``fieldName`` in ``x`` and return ``x``.

    The dictionary is modified in place; the same object is returned so the
    call can be used inside a comprehension.
    """
    computed = addfun(x)
    x[fieldName] = computed
    return x

In [7]:
jfile = jsons_with_all_info[1] # looks very rich

# Read the chosen patient file; the context manager closes it right after
# parsing (the original json.load(open(...)) never closed the handle).
with open(folder_jsons + jfile, encoding='utf-8') as json_file:
    resources = json.load(json_file)['entry']

# Tag every stage with its origin: "p" for records from the pTNM extractor,
# "c" for records from the cTNM extractor, then order them by record date.
stages = [addField(x, lambda x: "p","p_or_c") for x in ptnm_extractor.extract(resources)]
stages += [addField(x, lambda x: "c","p_or_c") for x in ctnm_extractor.extract(resources)]
stages.sort(key = lambda x: x['dt_record'])




In [8]:
# Group the stage records by their record date (one inner list per date).
# NOTE: this rebinds `stages`; re-running the cell would pass the already
# grouped list back into groupResources.
stages = groupResources(stages, "dt_record")
stages # for each date we want to select one

[[{'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
   'dt_record': '2013-12',
   'cat_version': None,
   'tnm_stage': 'IIA',
   'tstage': '2b',
   'nstage': '0',
   'mstage': '0',
   'p_or_c': 'p'}],
 [{'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
   'dt_record': '2013-12-15',
   'tnm_stage': 'IIIB',
   'cat_version': 'AJCC2017',
   'tstage': '2b',
   'nstage': '1a',
   'mstage': '0',
   'p_or_c': 'c'},
  {'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
   'dt_record': '2013-12-15',
   'tnm_stage': 'IIIB',
   'cat_version': 'AJCC2009',
   'tstage': '2b',
   'nstage': '1a',
   'mstage': '0',
   'p_or_c': 'c'}],
 [{'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
   'dt_record': '2014-01',
   'cat_version': None,
   'tnm_stage': 'IIIB',
   'tstage': '2b',
   'nstage': '1a',
   'mstage': '0',
   'snodes_postive': 1,
   'snodes_examined': 1,
   'p_or_c': 'p'}],
 [{'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
   'dt_record': '2014-10-13',
  

In [9]:
# we only want to keep one resource per date
def keepHighestPriority(resources, field, priorities):
    """Keep only the resources whose ``field`` value has the best priority.

    Parameters
    ----------
    resources : list of dict
        Candidate resources (e.g. all stage records of one date).
    field : str
        Key whose value decides the priority. A missing key counts as ``None``.
    priorities : list
        Allowed values ordered from most to least preferred. Values that do
        not appear in this list are ranked below every listed value instead of
        raising an error.

    Returns
    -------
    list of dict
        All resources that share the best-ranked value, in their original
        order. An empty input yields an empty list.
    """
    def rank(resource):
        value = resource.get(field)
        if value in priorities:
            return priorities.index(value)
        return len(priorities)  # unknown values lose against every known one

    if not resources:
        return []
    best = min(rank(resource) for resource in resources)
    return [resource for resource in resources if rank(resource) == best]

# Within every date group, first prefer 'p' records over 'c' records, then
# prefer the newest classification version that is available.
for prio_field, prio_order in (('p_or_c', ['p','c']),
                               ('cat_version', ['AJCC2017','AJCC2009',None])):
    stages = [keepHighestPriority(x, prio_field, prio_order) for x in stages]
stages

[[{'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
   'dt_record': '2013-12',
   'cat_version': None,
   'tnm_stage': 'IIA',
   'tstage': '2b',
   'nstage': '0',
   'mstage': '0',
   'p_or_c': 'p'}],
 [{'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
   'dt_record': '2013-12-15',
   'tnm_stage': 'IIIB',
   'cat_version': 'AJCC2017',
   'tstage': '2b',
   'nstage': '1a',
   'mstage': '0',
   'p_or_c': 'c'}],
 [{'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
   'dt_record': '2014-01',
   'cat_version': None,
   'tnm_stage': 'IIIB',
   'tstage': '2b',
   'nstage': '1a',
   'mstage': '0',
   'snodes_postive': 1,
   'snodes_examined': 1,
   'p_or_c': 'p'}],
 [{'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
   'dt_record': '2014-10-13',
   'cat_version': None,
   'tnm_stage': 'IIIC',
   'tstage': '2b',
   'nstage': '2b',
   'mstage': '0',
   'rnodes_positive': 1,
   'rnodes_examined': 1,
   'p_or_c': 'p'}],
 [{'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70

In [10]:
# The groups were checked above, so each one can now be combined into a single record
stages = list(map(combineResources, stages))
stages

[{'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
  'dt_record': '2013-12',
  'cat_version': None,
  'tnm_stage': 'IIA',
  'tstage': '2b',
  'nstage': '0',
  'mstage': '0',
  'p_or_c': 'p'},
 {'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
  'dt_record': '2013-12-15',
  'tnm_stage': 'IIIB',
  'cat_version': 'AJCC2017',
  'tstage': '2b',
  'nstage': '1a',
  'mstage': '0',
  'p_or_c': 'c'},
 {'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
  'dt_record': '2014-01',
  'cat_version': None,
  'tnm_stage': 'IIIB',
  'tstage': '2b',
  'nstage': '1a',
  'mstage': '0',
  'snodes_postive': 1,
  'snodes_examined': 1,
  'p_or_c': 'p'},
 {'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
  'dt_record': '2014-10-13',
  'cat_version': None,
  'tnm_stage': 'IIIC',
  'tstage': '2b',
  'nstage': '2b',
  'mstage': '0',
  'rnodes_positive': 1,
  'rnodes_examined': 1,
  'p_or_c': 'p'},
 {'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
  'dt_record': '2014-11-14',

In [11]:
# Give every stage a printable summary, and let each stage end on the record
# date of the stage that follows it.
stages = [addField(x, printStage, "val_print") for x in stages]
for current_stage, next_stage in zip(stages, stages[1:]):
    current_stage['dt_end'] = next_stage['dt_record']
# The last stage has no successor: it ends one day after the patient's date of death (2019-06-21)
stages[-1]['dt_end'] = "2019-06-22"
stages



[{'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
  'dt_record': '2013-12',
  'cat_version': None,
  'tnm_stage': 'IIA',
  'tstage': '2b',
  'nstage': '0',
  'mstage': '0',
  'p_or_c': 'p',
  'val_print': 'Version: k.A., pIIA T2bN0M0',
  'dt_end': '2013-12-15'},
 {'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
  'dt_record': '2013-12-15',
  'tnm_stage': 'IIIB',
  'cat_version': 'AJCC2017',
  'tstage': '2b',
  'nstage': '1a',
  'mstage': '0',
  'p_or_c': 'c',
  'val_print': 'Version: AJCC2017, cIIIB T2bN1aM0',
  'dt_end': '2014-01'},
 {'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
  'dt_record': '2014-01',
  'cat_version': None,
  'tnm_stage': 'IIIB',
  'tstage': '2b',
  'nstage': '1a',
  'mstage': '0',
  'snodes_postive': 1,
  'snodes_examined': 1,
  'p_or_c': 'p',
  'val_print': 'Version: k.A., pIIIB T2bN1aM0, Sentinel: 1+/1',
  'dt_end': '2014-10-13'},
 {'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
  'dt_record': '2014-10-13',
  'cat_version'

In [12]:
# Read the ICD-10 cancer code of the primary tumor so it can be added to the stages.
# The whole patient file is loaded (not only its 'entry' list); the context
# manager closes the file, which the original open() call never did.
with open(folder_jsons + jfile, encoding='utf-8') as json_file:
    json_pat = json.load(json_file)
primary_stage = getPrimaryTumor(json_pat)
cancer_code = primary_stage[0]['ICD10_2019_cancer_code']

def add_to_beginning(my_list, entry):
    """Put ``entry`` in front of ``my_list`` (in place) and return the list."""
    my_list[:0] = [entry]
    return my_list

# The cancer code becomes the first entry of the stage list
stages = add_to_beginning(stages, dict(ICD10_2019_cancer_code=cancer_code))
stages

[{'ICD10_2019_cancer_code': 'C43.7'},
 {'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
  'dt_record': '2013-12',
  'cat_version': None,
  'tnm_stage': 'IIA',
  'tstage': '2b',
  'nstage': '0',
  'mstage': '0',
  'p_or_c': 'p',
  'val_print': 'Version: k.A., pIIA T2bN0M0',
  'dt_end': '2013-12-15'},
 {'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
  'dt_record': '2013-12-15',
  'tnm_stage': 'IIIB',
  'cat_version': 'AJCC2017',
  'tstage': '2b',
  'nstage': '1a',
  'mstage': '0',
  'p_or_c': 'c',
  'val_print': 'Version: AJCC2017, cIIIB T2bN1aM0',
  'dt_end': '2014-01'},
 {'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
  'dt_record': '2014-01',
  'cat_version': None,
  'tnm_stage': 'IIIB',
  'tstage': '2b',
  'nstage': '1a',
  'mstage': '0',
  'snodes_postive': 1,
  'snodes_examined': 1,
  'p_or_c': 'p',
  'val_print': 'Version: k.A., pIIIB T2bN1aM0, Sentinel: 1+/1',
  'dt_end': '2014-10-13'},
 {'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
  'dt_

In [13]:
# Extract the examinations, group them by record date and combine each group
examinations = [combineResources(x)
                for x in groupResources(examinations_ext.extract(resources), "dt_record")]
examinations

[{'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
  'dt_record': '2014-02-06',
  'cat_examination_type': ['Physical Examination', 'Laboratory Procedure'],
  'cat_reasons': ['Treatment Planning', 'Initial presentation']},
 {'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
  'dt_record': '2014-05-07',
  'cat_examination_type': ['Physical Examination',
   'Laboratory Procedure',
   'Ultrasonography'],
  'cat_reasons': 'Follow-Up'},
 {'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
  'dt_record': '2014-08-22',
  'cat_examination_type': ['Physical Examination',
   'Laboratory Procedure',
   'Computed Tomography',
   'Ultrasonography'],
  'cat_reasons': ['Diagnostic or staging procedure', 'Follow-Up']},
 {'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
  'dt_record': '2014-10-06',
  'cat_examination_type': ['Physical Examination',
   'Laboratory Procedure',
   'Computed Tomography',
   'Ultrasonography'],
  'cat_reasons': ['Diagnostic or staging procedure',

In [14]:
# Extract the radiotherapies, group them by start date and combine each group
radio_therapy = radiotherapy_ext.extract(resources)
radio_therapy = groupResources(radio_therapy, "dt_start")
radio_therapy = list(map(combineResources, radio_therapy)) # deletes one duplicate
radio_therapy

[{'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
  'dt_start': '2016-06',
  'dt_end': '2016-06-29',
  'cat_intention': ['Palliativ', 'Adjuvant'],
  'cat_status': 'completed',
  'cat_reason_end': 'Reguläres Ende'},
 {'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
  'dt_start': '2017-02-24',
  'dt_end': '2017-02-28',
  'cat_intention': 'Palliativ',
  'cat_status': 'completed',
  'cat_reason_end': 'Reguläres Ende'},
 {'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
  'dt_start': '2019-03-21',
  'dt_end': '2019-03-21',
  'cat_intention': 'Palliativ',
  'cat_status': 'completed',
  'cat_reason_end': 'Sonstiges'}]

In [15]:
# Extract the progress records, group them by record date and combine each group
progresses = groupResources(progress_extractor.extract(resources), "dt_record")
progresses = list(map(combineResources, progresses))
progresses

[{'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
  'dt_record': '2014-05-07',
  'cat_progress_nodes': 'Kein Lymphknotenbefall nachweisbar'},
 {'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
  'dt_record': '2014-08-22',
  'cat_progress_overall': 'Entfällt, da Behandlung im Rahmen eines multimodalen Konzepts und dieses noch nicht abgeschlossen',
  'cat_progress_metastases': 'Fraglicher Befund',
  'cat_progress_nodes': 'Fraglicher Befund'},
 {'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
  'dt_record': '2014-10-06',
  'cat_progress_nodes': 'Fraglicher Befund'},
 {'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
  'dt_record': '2014-10-31',
  'cat_progress_overall': 'Entfällt, da Behandlung im Rahmen eines multimodalen Konzepts und dieses noch nicht abgeschlossen',
  'cat_progress_metastases': 'Keine Fernmetastasen nachweisbar'},
 {'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
  'dt_record': '2015-02-11',
  'cat_progress_nodes': 'Kein Lymph

In [16]:
# Medication administrations, ordered by start date
medis = sorted(medi_extractor.extract(resources), key=lambda medi: medi['dt_start'])

# The extracted drug type is not meaningful for a demonstration, so the first
# two entries are overwritten with Nivo+Ipi and Pembrolizumab
for medi_index, demo_drug in enumerate(("Nivo+Ipi", "Pembrolizumab")):
    medis[medi_index]['cat_drugtype'] = demo_drug
medis

[{'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
  'cat_drugtype': 'Nivo+Ipi',
  'dt_start': '2016-07-13T19:41:57+00:00',
  'dt_end': '2016-10-21T19:41:57+00:00',
  'cat_intention': 'Palliativ',
  'cat_status': 'stopped',
  'cat_reason_end': 'Abbruch wegen Nebenwirkungen',
  'num_quantity': 5},
 {'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
  'cat_drugtype': 'Pembrolizumab',
  'dt_start': '2017-06-06T19:41:57+00:00',
  'dt_end': '2019-06-21T19:41:57+00:00',
  'cat_intention': 'Palliativ',
  'cat_status': 'stopped',
  'cat_reason_end': 'Sonstiges'}]

In [17]:
# Extract the primary-tumor properties, group them by record date and combine each group
properties_prim = groupResources(tproperties_extractor.extract(resources), "dt_record")
properties_prim = [combineResources(group) for group in properties_prim]
properties_prim

[{'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
  'dt_record': '2013-12',
  'flg_ulcerated': True,
  'flg_regression': False,
  'no_tumor_thickness': 1.9},
 {'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
  'dt_record': '2014-10-13',
  'flg_transcapsular': True},
 {'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
  'dt_record': '2016-06-01',
  'flg_transcapsular': True}]

In [18]:
# Keep only the gene results that should be the most relevant
oncogenes = [
    x for x in getExtractorOncogenes().extract(resources)
    if x['cat_gene'] in ('TERT Ergebnis', 'BRAF Ergebnis', 'NRAS Ergebnis', 'NF1 Ergebnis')
]

# make outcome nice
def printGene(x):
    """Turn one oncogene result into a short label.

    Parameters
    ----------
    x : dict
        Must contain ``cat_gene`` (e.g. ``'BRAF Ergebnis'``) and
        ``flg_mutated``; ``num_frequency`` is optional and may be ``None``.

    Returns
    -------
    str
        ``'<gene> wildtype'`` or ``'<gene> mutation'``, the latter followed by
        ``' (Frequency <n>)'`` when a positive frequency is available.
    """
    suffix = 'Ergebnis'
    gene = x['cat_gene']
    # Remove the suffix only when it is really there, instead of blindly
    # cutting off the last eight characters.
    if gene.endswith(suffix):
        gene = gene[:-len(suffix)]
    gene = gene.strip()

    if not x['flg_mutated']:
        return gene + ' wildtype'

    res = gene + ' mutation'
    freq = x.get('num_frequency')
    # A key that is present but None must not be compared against a number.
    if freq is not None and freq > 0:
        res += " (Frequency " + str(freq) + ")"
    return res
        
# First attach the printable label to every result, then drop the raw fields
# it was built from, and finally group by record date and combine each group.
for onco in oncogenes:
    addField(onco, printGene, 'val_print')
for onco in oncogenes:
    del onco['cat_gene'], onco['flg_mutated']
    onco.pop('num_frequency', None)  # optional field, may be absent
oncogenes = list(map(combineResources, groupResources(oncogenes, 'dt_record')))
oncogenes

[{'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
  'dt_record': '2013',
  'val_print': ['BRAF mutation', 'NRAS wildtype']},
 {'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
  'dt_record': '2015-12-14',
  'val_print': ['NF1 wildtype',
   'BRAF mutation (Frequency 44)',
   'NRAS wildtype',
   'TERT wildtype']}]

In [19]:
# Take the first Patient resource of the file and keep everything except the
# administrative fields.  A filtered copy is built instead of popping keys, so
# the loaded `resources` stay untouched and the cell can be re-run safely.
patient_resource = [x for x in resources if x['resource']['resourceType'] == 'Patient'][0]['resource']
patientinfo = {key: value for key, value in patient_resource.items()
               if key not in ('meta', 'managingOrganization', 'identifier')}
patientinfo

{'resourceType': 'Patient',
 'id': '02b4f296-3cfc-4e2b-a35c-70c83712160b',
 'gender': 'female',
 'birthDate': '1961-05-30',
 'deceasedDateTime': '2019-06-21T19:41:57.025+02:00'}

In [21]:
# Build the care plan entries from the complete patient file loaded earlier (json_pat)
careplan_simp = getCareplan(json_pat)
careplan_simp

[{'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
  'time_of_careplan': '2019-02-06',
  'careplan_name': 'Strahlentherapie',
  'careplan_intention': 'Palliativ',
  'careplan_type': 'Rezidivtherapie'},
 {'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
  'time_of_careplan': '2019-02-06',
  'careplan_name': 'Zielgerichtete Substanzen',
  'careplan_intention': 'Palliativ',
  'careplan_type': 'Rezidivtherapie'},
 {'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
  'time_of_careplan': '2018-02-21',
  'careplan_name': 'Zielgerichtete Substanzen',
  'careplan_intention': 'Palliativ',
  'careplan_type': 'Rezidivtherapie'},
 {'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
  'time_of_careplan': '2017-11-08',
  'careplan_name': 'Zielgerichtete Substanzen',
  'careplan_intention': 'Palliativ',
  'careplan_type': 'Rezidivtherapie'},
 {'patid': 'Patient/02b4f296-3cfc-4e2b-a35c-70c83712160b',
  'time_of_careplan': '2017-08-30',
  'careplan_name': 'Zielgerichtete Sub

In [22]:
# Collect every simplified section of the example patient in one dictionary
json4demonstration = dict(
    patient_info=patientinfo,
    stages=stages,
    examinations=examinations,
    radiotherapy=radio_therapy,
    progresses=progresses,
    medication=medis,
    properties_primary=properties_prim,
    oncogenes=oncogenes,
    careplans=careplan_simp,
)

In [23]:
# Write the simplified patient as readable UTF-8 JSON (umlauts are kept as they are)
with open('example_patient.json', 'w', encoding='utf-8') as out_file:
    out_file.write(json.dumps(json4demonstration, ensure_ascii=False, indent=4))

### We should double-check this again, but the example_patient.json file now contains all of the green-highlighted data. That means we have a working pipeline from FHIR data to simplified JSON.
