In [1]:
import re
import os
import json
from pathlib import Path
# import codecs


In [2]:
# Base directory that the module paths below are built from (presumably the
# root of a local Synthea checkout -- confirm). It can be overridden with the
# SYNTHEA_DIR environment variable so the notebook is not tied to one
# machine; the previous hard-coded location remains the default.
SYNTHEA_DIR = os.environ.get("SYNTHEA_DIR") or "/Volumes/data/synthea/"
# The paths below are built by plain string concatenation, so the base must
# end with a slash even when it comes from the environment.
if not SYNTHEA_DIR.endswith("/"):
    SYNTHEA_DIR += "/"
# Directory that is scanned recursively for module *.json files.
MODULE_DIR = SYNTHEA_DIR + "build/resources/main/modules/"
LOOKUP_TABLE_DIR = MODULE_DIR + "lookup_tables"


# Which modules use which attributes?
__NOTE__: You should probably use the gradle script `./gradlew attributes` instead; that is how `attributes.json` was generated.


In [3]:
# https://stackoverflow.com/questions/14962485/finding-a-key-recursively-in-a-dictionary

def get_recursively(search_dict, field):
    """
    Collect every value stored under the key `field` in a nested structure.

    `search_dict` is a dict whose values may themselves be dicts or lists.
    Dict values, and dict items of list values, are searched recursively.
    When a key matches `field` its value is collected as-is and is not
    searched any further. Lists nested directly inside lists are not
    descended into.

    Returns a list of the matching values in traversal order.
    """
    matches = []

    for name, child in search_dict.items():
        if name == field:
            matches.append(child)
            continue

        # Work out which nested dicts (if any) still need to be searched.
        if isinstance(child, dict):
            nested_dicts = [child]
        elif isinstance(child, list):
            nested_dicts = [entry for entry in child if isinstance(entry, dict)]
        else:
            nested_dicts = []

        for nested in nested_dicts:
            matches.extend(get_recursively(nested, field))

    return matches


In [4]:

# Every module JSON file under MODULE_DIR, as paths relative to MODULE_DIR.
module_files = [path.relative_to(MODULE_DIR) for path in Path(MODULE_DIR).rglob('*.json')]

# Maps attribute name -> list of module files (relative paths as strings)
# in which that name appears under an 'attribute' or 'assign_to_attribute' key.
attribute_module = {}
for mf in module_files:
    # Parse inside the `with`, process outside, so the file handle is
    # released as soon as the JSON has been read.
    with open(MODULE_DIR/mf, encoding='utf-8') as fh:
        data = json.load(fh)

    # Union the names found under both keys *before* recording the module.
    # Handling each key separately listed a module twice whenever it used
    # the same attribute name under both keys. (A set, despite the name.)
    attribute_list = set()
    for k in ['attribute', 'assign_to_attribute']:
        attribute_list.update(get_recursively(data, k))

    for a in attribute_list:
        attribute_module.setdefault(a, []).append(str(mf))


In [5]:
# Display the mapping from attribute name to the module files that reference it.
attribute_module

t_cancer/surgery_therapy_breast.json',
  'breast_cancer/tnm_diagnosis.json'],
 'breast_cancer_aromatase_inhibitors': ['breast_cancer.json',
  'breast_cancer/hormonetherapy_breast.json'],
 'breast_cancer_chemoCount': ['breast_cancer/chemotherapy_breast.json'],
 'breast_cancer_condition': ['breast_cancer.json',
  'pickle_pica.json',
  'covid19/determine_risk.json',
  'covid19/medications.json'],
 'breast_cancer_init_followup_counter': ['breast_cancer.json'],
 'breast_cancer_neoCount': ['breast_cancer.json'],
 'breast_cancer_neoMed': ['breast_cancer.json'],
 'breast_cancer_survival': ['breast_cancer.json'],
 'breast_cancer_tamoxifen': ['breast_cancer.json',
  'breast_cancer/hormonetherapy_breast.json'],
 'breast_cancer_treat_count': ['breast_cancer/surgery_therapy_breast.json'],
 'breast_cancer_triple_negative': ['breast_cancer.json',
  'breast_cancer/hormone_diagnosis.json',
  'breast_cancer/surgery_therapy_breast.json'],
 'breast_cancer_years_after_treatment': ['breast_cancer.json'],
 '

# What are the possible values for each attribute?

In [15]:
def get_attribute_values_recursively(search_dict):
    """
    Gather the values paired with each attribute name in a nested structure.

    Every dict that contains both an 'attribute' key and a 'value' key
    contributes its 'value' to the set kept for its 'attribute'. Dict values,
    and dict items of list values, are searched recursively; lists nested
    directly inside lists are not descended into.

    Returns a dict mapping attribute name -> set of values seen for it.
    """
    collected = {}  # attribute name -> set of values

    def merge(name, payload):
        # `payload` is either a single raw value taken from a dict, or a set
        # of values handed back by a recursive call.
        bucket = collected.setdefault(name, set())
        if isinstance(payload, set):
            bucket.update(payload)
        else:
            bucket.add(payload)

    if 'attribute' in search_dict and 'value' in search_dict:
        merge(search_dict['attribute'], search_dict['value'])

    for child in search_dict.values():
        if isinstance(child, dict):
            nested_dicts = [child]
        elif isinstance(child, list):
            nested_dicts = [entry for entry in child if isinstance(entry, dict)]
        else:
            continue

        for nested in nested_dicts:
            for name, values in get_attribute_values_recursively(nested).items():
                merge(name, values)

    return collected

In [18]:
# mf = 'asthma.json'


# One attribute -> values mapping per module file; modules that yield an
# empty mapping are left out.
attribute_values_list = []
for mf in module_files:
    with open(MODULE_DIR/mf, encoding='utf-8') as fh:
        data = json.load(fh)
    my_attribute_values = get_attribute_values_recursively(data)
    if my_attribute_values:
        attribute_values_list.append(my_attribute_values)




In [19]:
def aggregate_attribute_values(attr_val_list):
    """
    Merge a list of attribute -> values mappings into a single mapping.

    `attr_val_list` is an iterable of dicts whose values are iterables of
    attribute values. The result maps each key to a new set holding the
    union of everything seen for that key; the inputs are not modified.
    """
    merged = {}
    for mapping in attr_val_list:
        for name in mapping:
            merged.setdefault(name, set()).update(mapping[name])
    return merged

# Merge the per-module mappings into one attribute -> values mapping and show it as the cell output.
aggregate_attribute_values(attribute_values_list)

{'G155D Mutation': {True},
 'Lung Cancer Type': {'NSCLC', 'SCLC'},
 'RH_NEG': {True},
 'SSRI': {'Vicodin'},
 'alcoholism': {True},
 'anemia_pregnancy': {0, 1},
 'assessment_done': {'HOOS',
  'KOOS',
  'PROMIS-10',
  'PROMIS-29',
  'VR-12',
  'VR-36'},
 'asthma_type': {'childhood', 'lifelong'},
 'atopic': {True},
 'birth_type': {'induced', 'normal', 'premature'},
 'blindness': {True},
 'blood_pressure_controlled': {False, True},
 'bmi_percentile': {95},
 'breast_cancer_ER': {'ER-negative', 'ER-positive'},
 'breast_cancer_HER2': {'HER2-negative', 'HER2-positive'},
 'breast_cancer_Location': {'distant', 'local', 'regional'},
 'breast_cancer_M': {'M0', 'M1'},
 'breast_cancer_N': {'N0', 'N1', 'N2', 'N3'},
 'breast_cancer_PR': {'PR-negative', 'PR-positive'},
 'breast_cancer_T': {'T0', 'T1', 'T2', 'T3', 'T4'},
 'breast_cancer_aromatase_inhibitors': {True},
 'breast_cancer_chemoCount': {0, 7},
 'breast_cancer_init_followup_counter': {0, 3},
 'breast_cancer_neoCount': {0, 7},
 'breast_cancer_su