In [1]:
import pandas as pd
import json
from datetime import datetime

In [2]:
concept = pd.read_csv('/workspaces/synthea_dw/omop/seeds/CONCEPT.csv', delimiter='\t', low_memory=False)

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.expand_frame_repr', False)

In [3]:
def find_concept_id(
        concept, concept_codes=None, 
        concept_names=None, vocabulary_ids=None, 
        domain_ids=None, concept_class_ids=None, 
        invalid_reason=False, standard_concept=None
    ):
    query_components = []

    if concept_codes:
        query_components.append(f"concept_code in @concept_codes")
    if concept_names:
        query_components.append(f"concept_name in @concept_names")
    if vocabulary_ids:
        query_components.append(f"vocabulary_id in @vocabulary_ids")
    if not invalid_reason:
        query_components.append(f"invalid_reason.isnull()")
    if standard_concept:
        query_components.append(f"standard_concept == @standard_concept")
    if domain_ids:
        query_components.append(f"domain_id in @domain_ids")
    if concept_class_ids:
        query_components.append(f"concept_class_id in @concept_class_ids")

    query = " and ".join(query_components)
    
    concept_rows = concept.query(query)['concept_id'] if query else concept['concept_id']
    
    return int(concept_rows.iloc[0]) if not concept_rows.empty else 0


In [5]:
def allergyIntolerance_to_observation(data, concept):
    if data['code']['coding'][0]['code'] != '419199007':

        observations = []

        for reaction in data.get('reaction', []):
            observation = {
                'observation_id': data['id'],
                'person_id': data['patient']['reference'].split('/')[-1],
                'observation_concept_id': 4169307,
                'observation_date': data['recordedDate'].split('T')[0],
                'observation_datetime': data['recordedDate'],
                'observation_type_concept_id': 32817,
                'value_as_number': None,
                'value_as_string': data['code']['coding'][0]['display'],
                'value_as_concept_id': find_concept_id(
                    concept, 
                    concept_codes=[data['code']['coding'][0]['code']], 
                    vocabulary_ids=['SNOMED'], 
                    domain_ids=['Observation'], 
                    invalid_reason=False, 
                    standard_concept='S', 
                    concept_class_ids=['Substance']
                ),
                'qualifier_concept_id': find_concept_id(
                    concept, 
                    concept_names=[data['criticality'].capitalize()], 
                    vocabulary_ids=['SNOMED'], 
                    domain_ids=['Meas Value'], 
                    invalid_reason=False, 
                    standard_concept='S',
                    concept_class_ids=['Qualifier Value']
                ),
                'unit_concept_id': None,
                'provider_id': None,
                'visit_occurrence_id': None,
                'visit_detail_id': None,
                'observation_source_value': None,
                'observation_source_concept_id': 4169307,
                'unit_source_value': None,
                'qualifier_source_value': data['criticality'],
                'value_source_value': data['code']['coding'][0]['code'],
                'observation_event_id': None,
                'obs_event_field_concept_id': None
            }
            observations.append(observation)

        return observations

    return None

observation_rows = []

with open('/workspaces/synthea_dw/data/fhir/AllergyIntolerance.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        rows = allergyIntolerance_to_observation(data, concept)
        if rows:
            observation_rows.extend(rows)

observation = pd.DataFrame(observation_rows).drop_duplicates()
observation = observation[observation['observation_source_concept_id'] != 0]

observation.head()

Unnamed: 0,observation_id,person_id,observation_concept_id,observation_date,observation_datetime,observation_type_concept_id,value_as_number,value_as_string,value_as_concept_id,qualifier_concept_id,unit_concept_id,provider_id,visit_occurrence_id,visit_detail_id,observation_source_value,observation_source_concept_id,unit_source_value,qualifier_source_value,value_source_value,observation_event_id,obs_event_field_concept_id
0,68268edd-ab8e-517e-b225-e1791bb3940f,79d8982d-fef7-7135-181a-0fb6af4a0e63,4169307,1967-11-26,1967-11-26T04:21:04+00:00,32817,,Bee venom (substance),4122068,4267416,,,,,,4169307,,low,288328004,,
1,449cb165-2eb3-b213-87cc-e2d8a77083cf,098d2b36-b839-488e-28d1-db369b3abc6b,4169307,1979-06-04,1979-06-04T17:47:10+00:00,32817,,Animal dander (substance),4138133,4267416,,,,,,4169307,,low,264287008,,
2,516ddc8d-0eb3-fe92-1648-a068cae9929c,098d2b36-b839-488e-28d1-db369b3abc6b,4169307,1979-06-04,1979-06-04T17:47:10+00:00,32817,,Soy bean,4106307,4267416,,,,,,4169307,,low,256355007,,
3,9cf89d4f-6a70-09ab-f7ca-99ac88dee5c2,79d8982d-fef7-7135-181a-0fb6af4a0e63,4169307,1967-11-26,1967-11-26T04:21:04+00:00,32817,,Animal dander (substance),4138133,4267416,,,,,,4169307,,low,264287008,,
4,39fe9eb5-35e2-a2ff-b7c2-e3465745766a,79d8982d-fef7-7135-181a-0fb6af4a0e63,4169307,1967-11-26,1967-11-26T04:21:04+00:00,32817,,Fish (substance),42539493,4267416,,,,,,4169307,,low,735971005,,


In [6]:
def allergyIntolerance_to_condition(line):
    data = json.loads(line)

    if data['code']['coding'][0]['code'] == '419199007':
        return None

    condition_occurrences = []

    for reaction in data.get('reaction', []):
        for manifestation in reaction.get('manifestation', []):
            condition_occurrences.append({
                'condition_occurrence_id': data['id'],
                'person_id': data['patient']['reference'].split('/')[-1],
                'condition_concept_id': find_concept_id(
                    concept, 
                    concept_codes=[manifestation['coding'][0]['code']], 
                    vocabulary_ids=['SNOMED'], 
                    domain_ids=['Condition'], 
                    invalid_reason=False, 
                    standard_concept='S', 
                    concept_class_ids=['Clinical Finding']
                ),
                'condition_start_date': datetime.strptime(data['recordedDate'].split('T')[0], '%Y-%m-%d').date(),
                'condition_start_datetime': datetime.fromisoformat(data['recordedDate']),
                'condition_end_date': None,
                'condition_end_datetime': None,
                'condition_type_concept_id': 32817,
                'condition_status_concept_id': pd.NA,
                'stop_reason': None,
                'provider_id': pd.NA,
                'visit_occurrence_id': pd.NA,
                'visit_detail_id': pd.NA,
                'condition_source_concept_id': find_concept_id(
                    concept, 
                    concept_codes=[manifestation['coding'][0]['code']], 
                    vocabulary_ids=['SNOMED'], 
                    domain_ids=['Condition'], 
                    concept_class_ids=['Clinical Finding']
                ),
                'condition_status_source_value': manifestation['coding'][0]['code']
            })

    return condition_occurrences

condition_occurrences = []

with open('/workspaces/synthea_dw/data/fhir/AllergyIntolerance.ndjson', 'r') as file:
    for line in file:
        conditions = allergyIntolerance_to_condition(line)
        if conditions:
            condition_occurrences.extend(conditions)

condition_occurrence = pd.DataFrame(condition_occurrences).drop_duplicates()
condition_occurrence = condition_occurrence[condition_occurrence['condition_source_concept_id'] != 0]

condition_occurrence.head()

Unnamed: 0,condition_occurrence_id,person_id,condition_concept_id,condition_start_date,condition_start_datetime,condition_end_date,condition_end_datetime,condition_type_concept_id,condition_status_concept_id,stop_reason,provider_id,visit_occurrence_id,visit_detail_id,condition_source_concept_id,condition_status_source_value
0,68268edd-ab8e-517e-b225-e1791bb3940f,79d8982d-fef7-7135-181a-0fb6af4a0e63,140214,1967-11-26,1967-11-26 04:21:04+00:00,,,32817,,,,,,140214,271807003
1,449cb165-2eb3-b213-87cc-e2d8a77083cf,098d2b36-b839-488e-28d1-db369b3abc6b,604304,1979-06-04,1979-06-04 17:47:10+00:00,,,32817,,,,,,604304,878820003
2,516ddc8d-0eb3-fe92-1648-a068cae9929c,098d2b36-b839-488e-28d1-db369b3abc6b,196523,1979-06-04,1979-06-04 17:47:10+00:00,,,32817,,,,,,196523,62315008
3,9cf89d4f-6a70-09ab-f7ca-99ac88dee5c2,79d8982d-fef7-7135-181a-0fb6af4a0e63,140214,1967-11-26,1967-11-26 04:21:04+00:00,,,32817,,,,,,140214,271807003
4,39fe9eb5-35e2-a2ff-b7c2-e3465745766a,79d8982d-fef7-7135-181a-0fb6af4a0e63,312437,1967-11-26,1967-11-26 04:21:04+00:00,,,32817,,,,,,312437,267036007


In [7]:
def carePlan_to_note(data):
    # Extracting note_title and note_text from 'text' -> 'div'
    div_text = data['text']['div']
    note_title_end_index = div_text.find('<br/>')

    return {
        'note_id': data['id'],
        'person_id': data['subject']['reference'].split('/')[-1],
        'note_date': datetime.strptime(data['period']['start'].split('T')[0], '%Y-%m-%d').date(),
        'note_datetime': datetime.fromisoformat(data['period']['start']),
        'note_type_concept_id': 32817,
        'note_class_concept_id': 706300,
        'note_title': data['text']['div'][len('<div xmlns="http://www.w3.org/1999/xhtml">'):note_title_end_index],
        'note_text': div_text[note_title_end_index + len('<br/>'):],
        'encoding_concept_id': 32678,
        'language_concept_id': 4175745,
        'provider_id': pd.NA,
        'visit_occurrence_id': data['encounter']['reference'].split('/')[-1],
        'visit_detail_id': pd.NA,
        'note_source_value': div_text,
        'note_event_id': data['encounter']['reference'].split('/')[-1],
        'note_event_field_concept_id': pd.NA
    }

note_rows = []

with open('/workspaces/synthea_dw/data/fhir/CarePlan.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        note_row = carePlan_to_note(data)
        note_rows.append(note_row)

note = pd.DataFrame(note_rows).drop_duplicates()

note.head()

Unnamed: 0,note_id,person_id,note_date,note_datetime,note_type_concept_id,note_class_concept_id,note_title,note_text,encoding_concept_id,language_concept_id,provider_id,visit_occurrence_id,visit_detail_id,note_source_value,note_event_id,note_event_field_concept_id
0,b5940334-4e94-2047-75ca-f53f0c771a0c,79d8982d-fef7-7135-181a-0fb6af4a0e63,1967-11-09,1967-11-09 19:10:04+00:00,32817,706300,Care Plan for Self-care interventions (procedu...,Activities: <ul><li>Self-care interventions (p...,32678,4175745,,7f547fc0-ee75-bac0-0707-04ff4623b828,,"<div xmlns=""http://www.w3.org/1999/xhtml"">Care...",7f547fc0-ee75-bac0-0707-04ff4623b828,
1,81b4e1f9-4d65-b2a8-303d-56f4ecc017d5,098d2b36-b839-488e-28d1-db369b3abc6b,1979-05-19,1979-05-19 22:47:10+00:00,32817,706300,Care Plan for Self-care interventions (procedu...,Activities: <ul><li>Self-care interventions (p...,32678,4175745,,a6ce1507-9875-3f92-2847-53dda1f2d89a,,"<div xmlns=""http://www.w3.org/1999/xhtml"">Care...",a6ce1507-9875-3f92-2847-53dda1f2d89a,
2,057c0a7a-98b4-684f-2a24-dd2653b72719,5498f452-4f9e-ea6c-9587-e4bf0d97a2a0,1986-07-19,1986-07-19 11:19:35+00:00,32817,706300,Care Plan for Lifestyle education regarding hy...,Care plan is meant to treat Essential hyperten...,32678,4175745,,743816b1-6a76-2c34-00ae-18cd5337c924,,"<div xmlns=""http://www.w3.org/1999/xhtml"">Care...",743816b1-6a76-2c34-00ae-18cd5337c924,
3,35e1b2d7-c43f-f5fb-f3f9-6bb4538be596,d23456ac-957d-67ad-1ba4-34c3f8a54744,1980-11-28,1980-11-28 00:14:03+00:00,32817,706300,Care Plan for Diabetes self management plan.,Care plan is meant to treat Prediabetes.<br/>A...,32678,4175745,,bf76c911-3186-547a-fec4-f63389c5e3a2,,"<div xmlns=""http://www.w3.org/1999/xhtml"">Care...",bf76c911-3186-547a-fec4-f63389c5e3a2,
4,d95fb87a-062e-97fd-cbea-5d981c13fe39,561d73b3-1c56-ed8d-266e-70c9e8712efb,1997-01-08,1997-01-08 13:39:43+00:00,32817,706300,Care Plan for Lifestyle education regarding hy...,Care plan is meant to treat Essential hyperten...,32678,4175745,,869e0e5d-2aec-841c-44b0-9dbcb6cf92ac,,"<div xmlns=""http://www.w3.org/1999/xhtml"">Care...",869e0e5d-2aec-841c-44b0-9dbcb6cf92ac,


In [8]:
def carePlan_to_procedure_occurrence(data, concept):
    procedure_occurrences = []

    for category in data.get('category', []):
        for coding in category.get('coding', []):
            if 'display' in coding:
                procedure = {
                    'procedure_occurrence_id': data['id'],
                    'person_id': data['subject']['reference'].split('/')[-1],
                    'procedure_concept_id': find_concept_id(
                        concept, 
                        concept_codes=[coding['code']], 
                        vocabulary_ids=['SNOMED'], 
                        domain_ids=['Procedure'], 
                        invalid_reason=False, 
                        standard_concept='S', 
                        concept_class_ids=['Procedure']
                    ),
                    'procedure_date': datetime.strptime(data['period']['start'].split('T')[0], '%Y-%m-%d').date(),
                    'procedure_datetime': datetime.fromisoformat(data['period']['start']),
                    'procedure_end_date': None,
                    'procedure_end_datetime': None,
                    'procedure_type_concept_id': 32817,
                    'modifier_concept_id': pd.NA,
                    'quantity': 1,
                    'provider_id': pd.NA,
                    'visit_occurrence_id': data['encounter']['reference'].split('/')[-1],
                    'visit_detail_id': pd.NA,
                    'procedure_source_value': coding['code'],
                    'procedure_source_concept_id': find_concept_id(
                        concept, 
                        concept_codes=[coding['code']], 
                        vocabulary_ids=['SNOMED'], 
                        domain_ids=['Procedure'], 
                        concept_class_ids=['Procedure']
                    ),
                    'modifier_source_value': None
                }
                procedure_occurrences.append(procedure)

    return procedure_occurrences

procedure_occurrences = []

with open('/workspaces/synthea_dw/data/fhir/CarePlan.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        procedures = carePlan_to_procedure_occurrence(data, concept)
        if procedures:
            procedure_occurrences.extend(procedures)

procedure_occurrence = pd.DataFrame(procedure_occurrences).drop_duplicates()
procedure_occurrence = procedure_occurrence[procedure_occurrence['procedure_source_concept_id'] != 0]

procedure_occurrence.head()

Unnamed: 0,procedure_occurrence_id,person_id,procedure_concept_id,procedure_date,procedure_datetime,procedure_end_date,procedure_end_datetime,procedure_type_concept_id,modifier_concept_id,quantity,provider_id,visit_occurrence_id,visit_detail_id,procedure_source_value,procedure_source_concept_id,modifier_source_value
0,b5940334-4e94-2047-75ca-f53f0c771a0c,79d8982d-fef7-7135-181a-0fb6af4a0e63,4293157,1967-11-09,1967-11-09 19:10:04+00:00,,,32817,,1,,7f547fc0-ee75-bac0-0707-04ff4623b828,,384758001,4293157,
1,81b4e1f9-4d65-b2a8-303d-56f4ecc017d5,098d2b36-b839-488e-28d1-db369b3abc6b,4293157,1979-05-19,1979-05-19 22:47:10+00:00,,,32817,,1,,a6ce1507-9875-3f92-2847-53dda1f2d89a,,384758001,4293157,
2,057c0a7a-98b4-684f-2a24-dd2653b72719,5498f452-4f9e-ea6c-9587-e4bf0d97a2a0,40481459,1986-07-19,1986-07-19 11:19:35+00:00,,,32817,,1,,743816b1-6a76-2c34-00ae-18cd5337c924,,443402002,40481459,
4,d95fb87a-062e-97fd-cbea-5d981c13fe39,561d73b3-1c56-ed8d-266e-70c9e8712efb,40481459,1997-01-08,1997-01-08 13:39:43+00:00,,,32817,,1,,869e0e5d-2aec-841c-44b0-9dbcb6cf92ac,,443402002,40481459,
8,f7f75673-8153-d99f-6ba4-e6303055752e,1d26a818-351d-22f4-15d7-04cf0f520780,40481459,2003-11-21,2003-11-21 15:30:45+00:00,,,32817,,1,,fb708140-8873-5618-e24a-71ba1d6d7176,,443402002,40481459,


In [11]:
def carePlan_to_observation(data, concept):
    observation_rows = []

    for category in data.get('category', []):
        for coding in category.get('coding', []):
            if 'display' in coding:
                observation = {
                    'observation_id': data['id'],
                    'person_id': data['subject']['reference'].split('/')[-1],
                    'observation_concept_id': find_concept_id(
                        concept, 
                        concept_codes=[coding['code']], 
                        vocabulary_ids=['SNOMED'], 
                        domain_ids=['Observation'], 
                        invalid_reason=False, 
                        standard_concept='S'
                    ),
                    'observation_date': datetime.strptime(data['period']['start'].split('T')[0], '%Y-%m-%d').date(),
                    'observation_datetime': datetime.fromisoformat(data['period']['start']),
                    'observation_type_concept_id': 32817,
                    'value_as_number': None,
                    'value_as_string': None,
                    'value_as_concept_id': pd.NA,
                    'qualifier_concept_id': pd.NA,
                    'unit_concept_id': pd.NA,
                    'provider_id': pd.NA,
                    'visit_occurrence_id': data['encounter']['reference'].split('/')[-1],
                    'visit_detail_id': pd.NA,
                    'observation_source_value': coding['code'],
                    'observation_source_concept_id': find_concept_id(
                        concept, 
                        concept_codes=[coding['code']], 
                        vocabulary_ids=['SNOMED'], 
                        domain_ids=['Observation']
                    ),
                    'qualifier_source_value': None,
                    'value_source_value': None,
                    'observation_event_id': data['id'],
                    'obs_event_field_concept_id': None  
                }
                observation_rows.append(observation)

    return observation_rows

observation_rows = []

with open('/workspaces/synthea_dw/data/fhir/CarePlan.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        observations = carePlan_to_observation(data, concept)
        if observations:
            observation_rows.extend(observations)

observation = pd.DataFrame(observation_rows).drop_duplicates()
observation = observation[observation['observation_source_concept_id'] != 0]

observation.head()


Unnamed: 0,observation_id,person_id,observation_concept_id,observation_date,observation_datetime,observation_type_concept_id,value_as_number,value_as_string,value_as_concept_id,qualifier_concept_id,unit_concept_id,provider_id,visit_occurrence_id,visit_detail_id,observation_source_value,observation_source_concept_id,qualifier_source_value,value_source_value,observation_event_id,obs_event_field_concept_id
13,06025c2f-1402-31aa-4a16-1fc00ac2aa40,561d73b3-1c56-ed8d-266e-70c9e8712efb,4047564,2013-12-04,2013-12-04 13:39:43+00:00,32817,,,,,,,78780d5c-aaff-7bcd-3dae-c9d09aaae3ac,,134435003,4047564,,,06025c2f-1402-31aa-4a16-1fc00ac2aa40,
16,8617ba57-b1b3-d2a6-369b-f3e51c4c47f6,5498f452-4f9e-ea6c-9587-e4bf0d97a2a0,4047564,2001-09-29,2001-09-29 11:19:35+00:00,32817,,,,,,,cb42e515-3f71-d2e4-4f18-0ae602c69dc9,,134435003,4047564,,,8617ba57-b1b3-d2a6-369b-f3e51c4c47f6,
21,cf24fe5e-4d38-ab08-c879-e7b219f2f6aa,fffe0830-f71e-bd50-e90d-fc5f23c55433,4021315,2015-09-23,2015-09-23 13:08:35+00:00,32817,,,,,,,a5cba29c-4d9e-9f16-a985-364a1627c23a,,225358003,4021315,,,cf24fe5e-4d38-ab08-c879-e7b219f2f6aa,
22,0a9efc99-11c2-ffcb-5e86-74f17d302ee9,8d3c566e-e2f0-3f11-eee2-dce3c68c498d,46272846,2021-04-16,2021-04-16 18:00:56+00:00,32817,,,,,,,a33ad2b8-fc4a-355d-bd4c-6501c2dedb3d,,711282006,46272846,,,0a9efc99-11c2-ffcb-5e86-74f17d302ee9,
26,e2a4ae8f-677f-b7cc-90c1-97e896b1e332,1d26a818-351d-22f4-15d7-04cf0f520780,4047564,2016-02-05,2016-02-05 15:30:45+00:00,32817,,,,,,,088054ee-86a5-9143-8d4f-86ec3a97e0cf,,134435003,4047564,,,e2a4ae8f-677f-b7cc-90c1-97e896b1e332,
