In [1]:
import pandas as pd
import json
from datetime import datetime

In [2]:
concept = pd.read_csv('/workspaces/synthea_dw/omop/seeds/CONCEPT.csv', delimiter='\t', low_memory=False)

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.expand_frame_repr', False)

In [3]:
def find_concept_id(
        concept, concept_codes=None, 
        concept_names=None, vocabulary_ids=None, 
        domain_ids=None, concept_class_ids=None, 
        invalid_reason=False, standard_concept=None
    ):
    query_components = []

    if concept_codes:
        query_components.append(f"concept_code in @concept_codes")
    if concept_names:
        query_components.append(f"concept_name in @concept_names")
    if vocabulary_ids:
        query_components.append(f"vocabulary_id in @vocabulary_ids")
    if not invalid_reason:
        query_components.append(f"invalid_reason.isnull()")
    if standard_concept:
        query_components.append(f"standard_concept == @standard_concept")
    if domain_ids:
        query_components.append(f"domain_id in @domain_ids")
    if concept_class_ids:
        query_components.append(f"concept_class_id in @concept_class_ids")

    query = " and ".join(query_components)
    
    concept_rows = concept.query(query)['concept_id'] if query else concept['concept_id']
    
    return int(concept_rows.iloc[0]) if not concept_rows.empty else 0


### person

In [30]:
def patient_to_person(data, concept):
    person_rows = []

    # Extracting the year, month, and day of birth
    birth_year = int(data['birthDate'].split('-')[0])
    birth_month = int(data['birthDate'].split('-')[1])
    birth_day = int(data['birthDate'].split('-')[2])
    birth_datetime = datetime.fromisoformat(data['birthDate']).strftime('%d-%m-%Y %H:%M:%S')

    race_code = None
    ethnicity_code = None
    gender_source_value = None
    for ext in data.get('extension', []):
        if ext.get('url') == 'http://hl7.org/fhir/us/core/StructureDefinition/us-core-race':
            for sub_ext in ext.get('extension', []):
                if 'valueCoding' in sub_ext and 'display' in sub_ext['valueCoding']:
                    race_code = sub_ext['valueCoding']['display']
                    break
        if ext.get('url') == 'http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity':
            for sub_ext in ext.get('extension', []):
                if 'valueCoding' in sub_ext and 'display' in sub_ext['valueCoding']:
                    ethnicity_code = sub_ext['valueCoding']['display']
                    break
        if ext.get('url') == 'http://hl7.org/fhir/us/core/StructureDefinition/us-core-birthsex':
            gender_source_value = ext.get('valueCode')
            break

    race_concept_id = find_concept_id(concept, concept_names=[race_code], vocabulary_ids=['Race'])
    ethnicity_concept_id = find_concept_id(concept, concept_names=[ethnicity_code], vocabulary_ids=['Ethnicity'])
    gender_concept_id = find_concept_id(concept, concept_codes=[gender_source_value], vocabulary_ids=['Gender'])

    person = {
        'person_id': data['id'],
        'gender_concept_id': gender_concept_id,
        'year_of_birth': birth_year,
        'month_of_birth': birth_month,
        'day_of_birth': birth_day,
        'birth_datetime': birth_datetime,
        'race_concept_id': race_concept_id,
        'ethnicity_concept_id': ethnicity_concept_id,
        'location_id': data['id'],
        'provider_id': pd.NA,
        'care_site_id': pd.NA,
        'person_source_value': data['id'],
        'gender_source_value': gender_source_value,
        'gender_source_concept_id': gender_concept_id,
        'race_source_value': race_code,
        'race_source_concept_id': race_concept_id,
        'ethnicity_source_value': ethnicity_code,
        'ethnicity_source_concept_id': ethnicity_concept_id
    }

    person_rows.append(person)

    return person_rows

person_rows = []

with open('/workspaces/synthea_dw/data/fhir/Patient.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        persons = patient_to_person(data, concept)
        person_rows.extend(persons)

person = pd.DataFrame(person_rows).drop_duplicates()

person.sample(5)

Unnamed: 0,person_id,gender_concept_id,year_of_birth,month_of_birth,day_of_birth,birth_datetime,race_concept_id,ethnicity_concept_id,location_id,provider_id,care_site_id,person_source_value,gender_source_value,gender_source_concept_id,race_source_value,race_source_concept_id,ethnicity_source_value,ethnicity_source_concept_id
9,3558b674-952f-aa9d-9e66-b839f6a16316,8507,1969,11,16,16-11-1969 00:00:00,8527,38003564,3558b674-952f-aa9d-9e66-b839f6a16316,,,3558b674-952f-aa9d-9e66-b839f6a16316,M,8507,White,8527,Not Hispanic or Latino,38003564
4,7a82833f-fae1-d69a-2cbf-69279dac746f,8532,1967,5,20,20-05-1967 00:00:00,8527,38003564,7a82833f-fae1-d69a-2cbf-69279dac746f,,,7a82833f-fae1-d69a-2cbf-69279dac746f,F,8532,White,8527,Not Hispanic or Latino,38003564
1,c95d085d-2249-b616-7668-88cc9a0c11bd,8532,1958,8,17,17-08-1958 00:00:00,8527,38003564,c95d085d-2249-b616-7668-88cc9a0c11bd,,,c95d085d-2249-b616-7668-88cc9a0c11bd,F,8532,White,8527,Not Hispanic or Latino,38003564
5,c86bea4c-5647-c8c2-35c5-cb08246ded70,8507,1937,2,12,12-02-1937 00:00:00,8527,38003564,c86bea4c-5647-c8c2-35c5-cb08246ded70,,,c86bea4c-5647-c8c2-35c5-cb08246ded70,M,8507,White,8527,Not Hispanic or Latino,38003564
6,a5a02d31-a93c-7b72-7e1c-a9cbfa64874d,8532,1958,1,8,08-01-1958 00:00:00,8527,38003564,a5a02d31-a93c-7b72-7e1c-a9cbfa64874d,,,a5a02d31-a93c-7b72-7e1c-a9cbfa64874d,F,8532,White,8527,Not Hispanic or Latino,38003564


### observation_period

### visit_occurrence

In [4]:
def careTeam_to_visit_occurrence(data):
    visit_occurrences = []

    person_id, provider_id, care_site_id = None, None, None
    for participant in data.get('participant', []):
        for role in participant.get('role', []):
            for coding in role.get('coding', []):
                if coding.get('code') == '116154003':
                    person_id = participant['member']['reference'].split('/')[-1]
                elif coding.get('code') == '223366009': 
                    provider_id = participant['member']['reference'].split('/')[-1]
                elif coding.get('code') == '224891009': 
                    care_site_id = participant['member']['reference'].split('/')[-1]

    if person_id:
        visit_occurrence = {
            'visit_occurrence_id': data['encounter']['reference'].split('/')[-1],
            'person_id': person_id,
            'visit_concept_id': 9201,
            'visit_start_date': datetime.strptime(data['period']['start'].split('T')[0], '%Y-%m-%d').date(),
            'visit_start_datetime': datetime.fromisoformat(data['period']['start']),
            'visit_end_date': datetime.strptime(data['period']['end'].split('T')[0], '%Y-%m-%d').date() if 'end' in data['period'] else None,
            'visit_end_datetime': datetime.fromisoformat(data['period']['end']) if 'end' in data['period'] else None,
            'visit_type_concept_id': 32817,
            'provider_id': provider_id,
            'care_site_id': care_site_id,
            'visit_source_value': 'IP',
            'visit_source_concept_id': 9201,
            'admitted_from_concept_id': pd.NA,
            'admitted_from_source_value': None,
            'discharged_to_concept_id': pd.NA,
            'discharged_to_source_value': None,
            'preceding_visit_occurrence_id': pd.NA
        }
        visit_occurrences.append(visit_occurrence)

    return visit_occurrences

visit_occurrences = []

with open('/workspaces/synthea_dw/data/fhir/CareTeam.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        visits = careTeam_to_visit_occurrence(data)
        visit_occurrences.extend(visits)

visit_occurrence = pd.DataFrame(visit_occurrences).drop_duplicates()

visit_occurrence.sample(5)

Unnamed: 0,visit_occurrence_id,person_id,visit_concept_id,visit_start_date,visit_start_datetime,visit_end_date,visit_end_datetime,visit_type_concept_id,provider_id,care_site_id,visit_source_value,visit_source_concept_id,admitted_from_concept_id,admitted_from_source_value,discharged_to_concept_id,discharged_to_source_value,preceding_visit_occurrence_id
10,8607bcdc-61b1-c954-32a1-ec8cfb7d6896,c86bea4c-5647-c8c2-35c5-cb08246ded70,9201,1991-03-08,1991-03-08 16:03:44+00:00,,NaT,32817,60b23852-314f-3aeb-b0b7-967947697497,8d0f0e66-b1dc-37fd-8b4e-0dcb6b3b446e,IP,9201,,,,,
11,62466cd0-243f-4804-552b-8434c01c644c,a5a02d31-a93c-7b72-7e1c-a9cbfa64874d,9201,1992-03-25,1992-03-25 08:52:48+00:00,,NaT,32817,9b630ef9-f2d7-36f1-a579-b412643bae3f,93c3d861-e05d-33b9-91eb-1768470229d7,IP,9201,,,,,
24,1b1f2cda-ec02-ee21-723b-71eda63f5801,c95d085d-2249-b616-7668-88cc9a0c11bd,9201,2016-09-18,2016-09-18 09:41:51+00:00,2016-11-30,2016-11-30 09:41:51+00:00,32817,2c4082be-6419-366c-b664-98ca2250b9a4,76b7dbb5-0a02-32a4-9a01-3511f7e45016,IP,9201,,,,,
23,99cadc54-d5cc-769a-5e5c-e742e81a5f55,a5a02d31-a93c-7b72-7e1c-a9cbfa64874d,9201,2009-11-29,2009-11-29 04:08:38+00:00,2010-02-24,2010-02-24 08:52:48+00:00,32817,4eafa94f-4896-3e78-a837-0c12f5491935,440fa4b8-c731-3cf5-81d9-aa3f30a37b95,IP,9201,,,,,
16,cc25f815-bccf-17b3-acd2-f506a13dbd74,32ee64c2-1585-d7ad-c53f-9ad739c676cf,9201,2008-12-24,2008-12-24 13:44:30+00:00,2009-02-27,2009-02-27 13:44:30+00:00,32817,3d336284-20e1-386c-b13d-dd0b4e639693,390a2aa4-70b4-3a64-a12f-9bd777834c8d,IP,9201,,,,,


In [5]:
def encounter_to_visit_occurrence(data):
    person_id = data['subject']['reference'].split('/')[-1]
    visit_class_code = data['class']['code'] if 'class' in data else None
    visit_concept_id_map = {'IMP': 9201, 'EMER': 9203, 'AMB': 9202}
    visit_concept_id = visit_concept_id_map.get(visit_class_code, None)
    provider_id = data['participant'][0]['individual']['reference'].split('/')[-1] if 'participant' in data and 'individual' in data['participant'][0] else None
    care_site_id = data['serviceProvider']['reference'].split('/')[-1] if 'serviceProvider' in data else None

    visit_occurrence = {
        'visit_occurrence_id': data['id'],
        'person_id': person_id,
        'visit_concept_id': visit_concept_id,
        'visit_start_date': datetime.strptime(data['period']['start'].split('T')[0], '%Y-%m-%d').date(),
        'visit_start_datetime': datetime.fromisoformat(data['period']['start']),
        'visit_end_date': datetime.strptime(data['period']['end'].split('T')[0], '%Y-%m-%d').date() if 'end' in data['period'] else None,
        'visit_end_datetime': datetime.fromisoformat(data['period']['end']) if 'end' in data['period'] else None,
        'visit_type_concept_id': 32817,
        'provider_id': provider_id,
        'care_site_id': care_site_id,
        'visit_source_value': visit_class_code,
        'visit_source_concept_id': visit_concept_id,
        'admitted_from_concept_id': pd.NA,
        'admitted_from_source_value': None,
        'discharged_to_concept_id': pd.NA,
        'discharged_to_source_value': None,
        'preceding_visit_occurrence_id': pd.NA
    }

    return [visit_occurrence]

visit_occurrences = []

with open('/workspaces/synthea_dw/data/fhir/Encounter.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        visit_occurrence = encounter_to_visit_occurrence(data)
        visit_occurrences.extend(visit_occurrence)

visit_occurrence = pd.DataFrame(visit_occurrences).drop_duplicates()

visit_occurrence.sample(5)

Unnamed: 0,visit_occurrence_id,person_id,visit_concept_id,visit_start_date,visit_start_datetime,visit_end_date,visit_end_datetime,visit_type_concept_id,provider_id,care_site_id,visit_source_value,visit_source_concept_id,admitted_from_concept_id,admitted_from_source_value,discharged_to_concept_id,discharged_to_source_value,preceding_visit_occurrence_id
284,a5093a94-54da-e844-5d90-1b73a5bb434e,7a82833f-fae1-d69a-2cbf-69279dac746f,9202.0,2016-05-07,2016-05-07 01:46:00+00:00,2016-05-07,2016-05-07 02:24:13+00:00,32817,8f9e4de4-e070-3058-9aea-4c9a8e0e7ac1,41e2a44c-477c-3511-96f9-12c476aa3b6a,AMB,9202.0,,,,,
725,8a91df2c-2481-2e87-0f4b-17fcf633bd04,7a7b7fba-a005-3736-91ef-218a0d2824c5,9202.0,2018-12-04,2018-12-04 02:16:03+00:00,2018-12-04,2018-12-04 04:51:03+00:00,32817,299bc447-29c4-3c98-948b-ea0891c97d89,497f39dd-280e-3d58-af5b-c5e3a3a09b10,AMB,9202.0,,,,,
600,9dabb7fb-d6da-9011-9544-8a696d99976c,7a7b7fba-a005-3736-91ef-218a0d2824c5,9202.0,2017-08-15,2017-08-15 05:06:03+00:00,2017-08-15,2017-08-15 08:05:03+00:00,32817,299bc447-29c4-3c98-948b-ea0891c97d89,497f39dd-280e-3d58-af5b-c5e3a3a09b10,AMB,9202.0,,,,,
405,82575e97-c41d-fd32-a16a-cefd2ef33041,7a82833f-fae1-d69a-2cbf-69279dac746f,9202.0,2020-10-10,2020-10-10 01:46:00+00:00,2020-10-10,2020-10-10 02:35:15+00:00,32817,8f9e4de4-e070-3058-9aea-4c9a8e0e7ac1,41e2a44c-477c-3511-96f9-12c476aa3b6a,AMB,9202.0,,,,,
516,11727698-28cb-37d2-108b-f7ba8e7d5fa6,7a7b7fba-a005-3736-91ef-218a0d2824c5,9203.0,2016-08-21,2016-08-21 12:15:03+00:00,2016-08-21,2016-08-21 13:15:03+00:00,32817,3d336284-20e1-386c-b13d-dd0b4e639693,390a2aa4-70b4-3a64-a12f-9bd777834c8d,EMER,9203.0,,,,,


### condition_occurrence

In [7]:
def allergyIntolerance_to_condition_occurrence(line):
    data = json.loads(line)

    if data['code']['coding'][0]['code'] == '419199007':
        return None

    condition_occurrences = []

    for reaction in data.get('reaction', []):
        for manifestation in reaction.get('manifestation', []):
            condition_occurrences.append({
                'condition_occurrence_id': data['id'],
                'person_id': data['patient']['reference'].split('/')[-1],
                'condition_concept_id': find_concept_id(
                    concept, 
                    concept_codes=[manifestation['coding'][0]['code']], 
                    vocabulary_ids=['SNOMED'], 
                    domain_ids=['Condition'], 
                    invalid_reason=False, 
                    standard_concept='S', 
                    concept_class_ids=['Clinical Finding']
                ),
                'condition_start_date': datetime.strptime(data['recordedDate'].split('T')[0], '%Y-%m-%d').date(),
                'condition_start_datetime': datetime.fromisoformat(data['recordedDate']),
                'condition_end_date': None,
                'condition_end_datetime': None,
                'condition_type_concept_id': 32817,
                'condition_status_concept_id': pd.NA,
                'stop_reason': None,
                'provider_id': pd.NA,
                'visit_occurrence_id': pd.NA,
                'visit_detail_id': pd.NA,
                'condition_source_concept_id': find_concept_id(
                    concept, 
                    concept_codes=[manifestation['coding'][0]['code']], 
                    vocabulary_ids=['SNOMED'], 
                    domain_ids=['Condition'], 
                    concept_class_ids=['Clinical Finding']
                ),
                'condition_status_source_value': manifestation['coding'][0]['code']
            })

    return condition_occurrences

condition_occurrences = []

with open('/workspaces/synthea_dw/data/fhir/AllergyIntolerance.ndjson', 'r') as file:
    for line in file:
        conditions = allergyIntolerance_to_condition_occurrence(line)
        if conditions:
            condition_occurrences.extend(conditions)

condition_occurrence = pd.DataFrame(condition_occurrences).drop_duplicates()
condition_occurrence = condition_occurrence[condition_occurrence['condition_source_concept_id'] != 0]

condition_occurrence.sample(3)

Unnamed: 0,condition_occurrence_id,person_id,condition_concept_id,condition_start_date,condition_start_datetime,condition_end_date,condition_end_datetime,condition_type_concept_id,condition_status_concept_id,stop_reason,provider_id,visit_occurrence_id,visit_detail_id,condition_source_concept_id,condition_status_source_value
1,aac94664-849e-be6d-e8e0-72f51fe20102,a5a02d31-a93c-7b72-7e1c-a9cbfa64874d,4082588,1959-04-27,1959-04-27 13:52:48+00:00,,,32817,,,,,,4082588,247472004
3,833cb8b3-752e-1b37-9aea-8dd9fe9ecfd9,4390395b-5a78-2005-80b7-5ebd62b595c9,4270861,1969-06-13,1969-06-13 09:11:38+00:00,,,32817,,,,,,4270861,402387002
2,833cb8b3-752e-1b37-9aea-8dd9fe9ecfd9,4390395b-5a78-2005-80b7-5ebd62b595c9,140214,1969-06-13,1969-06-13 09:11:38+00:00,,,32817,,,,,,140214,271807003


In [10]:
def careTeam_to_condition_occurrence(data, concept):
    condition_occurrences = []

    if 'reasonCode' in data:
        person_id = None
        provider_id = None
        for participant in data.get('participant', []):
            for role in participant.get('role', []):
                for coding in role.get('coding', []):
                    if coding.get('code') == '116154003':
                        person_id = participant['member']['reference'].split('/')[-1]
                    elif coding.get('code') == '223366009':
                        provider_id = participant['member']['reference'].split('/')[-1]

        if person_id:
            for reasonCode in data['reasonCode']:
                for coding in reasonCode.get('coding', []):
                    condition_occurrence = {
                        'condition_occurrence_id': data['id'],
                        'person_id': person_id,
                        'condition_concept_id': find_concept_id(
                            concept, 
                            concept_codes=[coding['code']], 
                            vocabulary_ids=['SNOMED'], 
                            domain_ids=['Condition'],
                            invalid_reason=False, 
                            standard_concept='S'
                        ),
                        'condition_start_date': datetime.strptime(data['period']['start'].split('T')[0], '%Y-%m-%d').date(),
                        'condition_start_datetime': datetime.fromisoformat(data['period']['start']),
                        'condition_end_date': None,
                        'condition_end_datetime': None,
                        'condition_type_concept_id': 32817,
                        'condition_status_concept_id': pd.NA,
                        'stop_reason': None,
                        'provider_id': provider_id,
                        'visit_occurrence_id': data['encounter']['reference'].split('/')[-1],
                        'visit_detail_id': pd.NA,
                        'condition_source_value': coding['code'],
                        'condition_source_concept_id': find_concept_id(
                            concept, 
                            concept_codes=[coding['code']], 
                            vocabulary_ids=['SNOMED'], 
                            domain_ids=['Condition'],
                            invalid_reason=True
                        ),
                        'condition_status_source_value': None
                    }
                    condition_occurrences.append(condition_occurrence)

    return condition_occurrences

condition_occurrences = []

with open('/workspaces/synthea_dw/data/fhir/CareTeam.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        conditions = careTeam_to_condition_occurrence(data, concept)
        if conditions:
            condition_occurrences.extend(conditions)

condition_occurrence = pd.DataFrame(condition_occurrences).drop_duplicates()

condition_occurrence.sample(5)

Unnamed: 0,condition_occurrence_id,person_id,condition_concept_id,condition_start_date,condition_start_datetime,condition_end_date,condition_end_datetime,condition_type_concept_id,condition_status_concept_id,stop_reason,provider_id,visit_occurrence_id,visit_detail_id,condition_source_value,condition_source_concept_id,condition_status_source_value
18,2316005a-f89a-6c86-7dd5-fe802f8e7e56,c95d085d-2249-b616-7668-88cc9a0c11bd,79740,2019-10-25,2019-10-25 17:24:15+00:00,,,32817,,,f3954a8e-f971-3d58-9bd5-b5282043fb08,be07a258-3ff5-d8af-aa9e-28573169689a,,109838007,79740,
4,c401e5ae-9097-865f-2454-8412325a9a00,c86bea4c-5647-c8c2-35c5-cb08246ded70,432867,1979-03-16,1979-03-16 16:03:44+00:00,,,32817,,,60b23852-314f-3aeb-b0b7-967947697497,e0e727ba-d5ce-4bc9-272c-b2259deb260f,,55822004,432867,
26,7f0b4284-9667-2348-c17f-880a83e95d49,3558b674-952f-aa9d-9e66-b839f6a16316,320128,1998-01-25,1998-01-25 12:15:03+00:00,,,32817,,,3d336284-20e1-386c-b13d-dd0b4e639693,5296ed0f-148b-e3ed-941d-9fd360122d1f,,59621000,320128,
22,1099fd92-1a51-0b2e-1023-4bcfcfcd7a1e,c95d085d-2249-b616-7668-88cc9a0c11bd,432867,2022-01-02,2022-01-02 09:16:36+00:00,,,32817,,,f3954a8e-f971-3d58-9bd5-b5282043fb08,ae8b90d4-b52c-710b-b30d-888e2c062f4c,,55822004,432867,
2,e3c70dd3-c050-4475-c3c6-2069ad347b13,c95d085d-2249-b616-7668-88cc9a0c11bd,0,2007-09-23,2007-09-23 09:16:36+00:00,,,32817,,,94b7f786-dca0-3d77-ae95-20ef40618473,70269899-768b-a394-cfd8-3294c4850342,,15777000,40316773,


In [11]:
def claim_to_condition_occurrence(data, concept):
    condition_occurrences = []

    if any(coding['code'] in ['professional', 'institutional'] for coding in data['type']['coding']):

        for diagnosis in data.get('diagnosis', []):
            condition_ref = diagnosis['diagnosisReference']['reference']
            condition_id = condition_ref.split('/')[-1]

            for item in data.get('item', []):
                if 'productOrService' in item and 'coding' in item['productOrService']:
                    for coding in item['productOrService']['coding']:
                        condition_occurrence = {
                            'condition_occurrence_id': condition_id,
                            'person_id': data['patient']['reference'].split('/')[-1],
                            'condition_concept_id': find_concept_id(
                                concept, 
                                concept_codes=[coding['code']], 
                                vocabulary_ids=['SNOMED'], 
                                domain_ids=['Condition'], 
                                invalid_reason=False, 
                                standard_concept='S'
                            ),
                            'condition_start_date': datetime.strptime(data['billablePeriod']['start'].split('T')[0], '%Y-%m-%d').date(),
                            'condition_start_datetime': datetime.fromisoformat(data['billablePeriod']['start']),
                            'condition_end_date': datetime.strptime(data['billablePeriod']['end'].split('T')[0], '%Y-%m-%d').date(),
                            'condition_end_datetime': datetime.fromisoformat(data['billablePeriod']['end']),
                            'condition_type_concept_id': 32817,
                            'condition_status_concept_id': pd.NA,
                            'stop_reason': None,
                            'provider_id': pd.NA,
                            'visit_occurrence_id': item['encounter'][0]['reference'].split('/')[-1] if 'encounter' in item else None,
                            'visit_detail_id': pd.NA,
                            'condition_source_concept_id': find_concept_id(
                                concept, 
                                concept_codes=[coding['code']], 
                                vocabulary_ids=['SNOMED'], 
                                domain_ids=['Condition']
                            ),
                            'condition_status_source_value': coding['code']
                        }
                        condition_occurrences.append(condition_occurrence)

    return condition_occurrences

condition_occurrences = []

with open('/workspaces/synthea_dw/data/fhir/Claim.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        conditions = claim_to_condition_occurrence(data, concept)
        condition_occurrences.extend(conditions)

condition_occurrence = pd.DataFrame(condition_occurrences).drop_duplicates()
condition_occurrence = condition_occurrence[condition_occurrence['condition_source_concept_id'] != 0]

condition_occurrence.sample(5)

Unnamed: 0,condition_occurrence_id,person_id,condition_concept_id,condition_start_date,condition_start_datetime,condition_end_date,condition_end_datetime,condition_type_concept_id,condition_status_concept_id,stop_reason,provider_id,visit_occurrence_id,visit_detail_id,condition_source_concept_id,condition_status_source_value
5933,1a0e2b95-6e87-0d52-a47c-2e0e293fc454,7a7b7fba-a005-3736-91ef-218a0d2824c5,4251306,2020-04-26,2020-04-26 12:15:03+00:00,2020-04-26,2020-04-26 13:13:48+00:00,32817,,,,,,4251306,73595000
4552,602ff68c-5c65-006c-8e5b-cf0662b3925b,7a7b7fba-a005-3736-91ef-218a0d2824c5,4251306,2014-09-07,2014-09-07 12:15:03+00:00,2014-09-07,2014-09-07 13:08:14+00:00,32817,,,,,,4251306,73595000
1227,789bbaed-eca2-cf58-0119-61f6fc9d4c96,7a82833f-fae1-d69a-2cbf-69279dac746f,4251306,2014-12-13,2014-12-13 01:46:00+00:00,2014-12-13,2014-12-13 02:39:06+00:00,32817,,,,,,4251306,73595000
3111,94dc8cea-91ed-1470-b340-71a8797c806e,408a95f4-02aa-3003-2f09-0241ac3343fb,4251306,2022-03-30,2022-03-30 08:52:48+00:00,2022-03-30,2022-03-30 09:49:02+00:00,32817,,,,,,4251306,73595000
672,0fcabed9-8219-0e4f-282f-cfe1415b246e,c95d085d-2249-b616-7668-88cc9a0c11bd,4218389,2016-09-18,2016-09-18 09:41:51+00:00,2016-09-18,2016-09-18 10:41:51+00:00,32817,,,,,,4218389,39848009


In [12]:
def condition_to_condition_occurrence(data, concept):
    condition_occurrences = []

    person_id = data['subject']['reference'].split('/')[-1]
    condition_code = data['code']['coding'][0]['code']

    condition_occurrence = {
        'condition_occurrence_id': data['id'],
        'person_id': person_id,
        'condition_concept_id': find_concept_id(
            concept, 
            concept_codes=[condition_code], 
            vocabulary_ids=['SNOMED'], 
            domain_ids=['Condition'], 
            invalid_reason=False, 
            standard_concept='S'
        ),
        'condition_start_date': datetime.strptime(data['onsetDateTime'].split('T')[0], '%Y-%m-%d').date(),
        'condition_start_datetime': datetime.fromisoformat(data['onsetDateTime']),
        'condition_end_date': datetime.strptime(data['abatementDateTime'].split('T')[0], '%Y-%m-%d').date() if 'abatementDateTime' in data else None,
        'condition_end_datetime': datetime.fromisoformat(data['abatementDateTime']) if 'abatementDateTime' in data else None,
        'condition_type_concept_id': 32817,
        'condition_status_concept_id': 37109701 if data['clinicalStatus']['coding'][0]['code'] == 'resolved' else 9181 if data['clinicalStatus']['coding'][0]['code'] == 'active' else pd.NA,
        'stop_reason': None,
        'provider_id': pd.NA,
        'visit_occurrence_id': data['encounter']['reference'].split('/')[-1],
        'visit_detail_id': pd.NA,
        'condition_source_concept_id': find_concept_id(
            concept, 
            concept_codes=[condition_code], 
            vocabulary_ids=['SNOMED'], 
            domain_ids=['Condition']
        ),
        'condition_status_source_value': condition_code
    }
    condition_occurrences.append(condition_occurrence)

    return condition_occurrences

condition_occurrences = []

with open('/workspaces/synthea_dw/data/fhir/Condition.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        conditions = condition_to_condition_occurrence(data, concept)
        condition_occurrences.extend(conditions)

condition_occurrence = pd.DataFrame(condition_occurrences).drop_duplicates()
condition_occurrence = condition_occurrence[condition_occurrence['condition_source_concept_id'] != 0]

condition_occurrence.sample(5)

Unnamed: 0,condition_occurrence_id,person_id,condition_concept_id,condition_start_date,condition_start_datetime,condition_end_date,condition_end_datetime,condition_type_concept_id,condition_status_concept_id,stop_reason,provider_id,visit_occurrence_id,visit_detail_id,condition_source_concept_id,condition_status_source_value
35,9b8e9f23-2323-afe8-6f73-33494b863102,32ee64c2-1585-d7ad-c53f-9ad739c676cf,4251306,2001-01-28,2001-01-28 13:10:12+00:00,2007-02-04,2007-02-04 13:02:24+00:00,32817,37109701,,,9b9425c0-e4cf-6117-97b7-d7487adbfd0b,,4251306,73595000
106,885a5e82-2323-872b-a93a-e08550c083f9,c86bea4c-5647-c8c2-35c5-cb08246ded70,201826,2011-07-01,2011-07-01 16:03:44+00:00,,NaT,32817,9181,,,cc314745-780a-03a8-32e2-d1ffd9a4a284,,201826,44054006
67,36ab7a7b-ea87-810d-03b2-d769877bed5c,32ee64c2-1585-d7ad-c53f-9ad739c676cf,40481087,2008-10-11,2008-10-11 05:15:03+00:00,2008-10-27,2008-10-27 05:15:03+00:00,32817,37109701,,,9fcbe701-abc0-f358-8872-e0e304e5bce9,,40481087,444814009
461,02d05fd9-924d-401c-6ff6-1d439af696c4,7a7b7fba-a005-3736-91ef-218a0d2824c5,4309238,2015-11-29,2015-11-29 13:06:20+00:00,2015-12-13,2015-12-13 12:55:49+00:00,32817,37109701,,,d5c9249c-1742-149e-f182-33b81054b7a8,,4309238,422650009
52,299c5529-a66f-13ba-4c6e-bea4822c5cb0,7a82833f-fae1-d69a-2cbf-69279dac746f,45768458,2012-07-28,2012-07-28 02:51:31+00:00,2018-03-31,2018-03-31 02:16:15+00:00,32817,37109701,,,be3992bb-5992-1dc5-8258-843207ac7304,,45768458,706893006


### drug_exposure

In [13]:
def claim_to_drug_exposure(data, concept):
    drug_exposures = []

    if any(coding['code'] == 'pharmacy' for coding in data['type']['coding']):

        for item in data.get('item', []):
            drug_exposure = {
                'drug_exposure_id': data['prescription']['reference'].split('/')[-1],
                'person_id': data['patient']['reference'].split('/')[-1],
                'drug_concept_id': find_concept_id(
                    concept, 
                    concept_codes=[item['productOrService']['coding'][0]['code']], 
                    vocabulary_ids=['RxNorm'], 
                    domain_ids=['Drug'], 
                    invalid_reason=False, 
                    standard_concept='S'
                ),
                'drug_exposure_start_date': datetime.strptime(data['billablePeriod']['start'].split('T')[0], '%Y-%m-%d').date(),
                'drug_exposure_start_datetime': datetime.fromisoformat(data['billablePeriod']['start']),
                'drug_exposure_end_date': datetime.strptime(data['billablePeriod']['end'].split('T')[0], '%Y-%m-%d').date(),
                'drug_exposure_end_datetime': datetime.fromisoformat(data['billablePeriod']['end']),
                'verbatim_end_date': datetime.strptime(data['billablePeriod']['end'].split('T')[0], '%Y-%m-%d').date(),
                'drug_type_concept_id': 32817,
                'stop_reason': None,
                'refills': 0,
                'quantity': None,
                'days_supply': ((datetime.strptime(data['billablePeriod']['start'].split('T')[0], '%Y-%m-%d').date()) - (datetime.strptime(data['billablePeriod']['start'].split('T')[0], '%Y-%m-%d').date())).days or 1,
                'sig': None,
                'route_concept_id': None,
                'lot_number': None,
                'provider_id': None,
                'visit_occurrence_id': item['encounter'][0]['reference'].split('/')[-1] if 'encounter' in item else None,
                'visit_detail_id': None,
                'drug_source_value': item['productOrService']['coding'][0]['code'],
                'drug_source_concept_id': find_concept_id(
                    concept, 
                    concept_codes=[item['productOrService']['coding'][0]['code']], 
                    vocabulary_ids=['RxNorm'], 
                    domain_ids=['Drug']
                ),
                'route_source_value': None,
                'dose_unit_source_value': None
            }
            drug_exposures.append(drug_exposure)

    return drug_exposures

drug_exposures = []

with open('/workspaces/synthea_dw/data/fhir/Claim.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        exposures = claim_to_drug_exposure(data, concept)
        drug_exposures.extend(exposures)

drug_exposure = pd.DataFrame(drug_exposures).drop_duplicates()

drug_exposure.sample(5)

Unnamed: 0,drug_exposure_id,person_id,drug_concept_id,drug_exposure_start_date,drug_exposure_start_datetime,drug_exposure_end_date,drug_exposure_end_datetime,verbatim_end_date,drug_type_concept_id,stop_reason,refills,quantity,days_supply,sig,route_concept_id,lot_number,provider_id,visit_occurrence_id,visit_detail_id,drug_source_value,drug_source_concept_id,route_source_value,dose_unit_source_value
164,331e8714-36b0-211e-6451-178c6d90d159,c95d085d-2249-b616-7668-88cc9a0c11bd,40220871,2020-02-15,2020-02-15 04:24:15+00:00,2020-02-15,2020-02-15 04:39:15+00:00,2020-02-15,32817,,0,,1,,,,,f7e9109c-5922-bdaf-1a7c-c0d760ff22c2,,1803932,40220871,,
388,f922d8be-e4a7-01aa-b81a-c61321d90c64,7a82833f-fae1-d69a-2cbf-69279dac746f,19080128,2021-01-09,2021-01-09 01:46:00+00:00,2021-01-09,2021-01-09 02:01:00+00:00,2021-01-09,32817,,0,,1,,,,,d77efe0f-5bc6-c5e3-52ca-cae4af74e0e8,,314076,19080128,,
700,e875d32b-468f-ee4f-998a-af55bfb9d980,7a7b7fba-a005-3736-91ef-218a0d2824c5,19080128,2019-08-11,2019-08-11 12:15:03+00:00,2019-08-11,2019-08-11 12:51:23+00:00,2019-08-11,32817,,0,,1,,,,,b038ffbb-2f62-ae7f-46fa-5785495bde8c,,314076,19080128,,
466,0089fe69-40d3-3c94-e9b1-1005394e9dd9,7a82833f-fae1-d69a-2cbf-69279dac746f,19009384,2023-12-30,2023-12-30 01:46:00+00:00,2023-12-30,2023-12-30 02:34:18+00:00,2023-12-30,32817,,0,,1,,,,,fc7f979a-8393-8d3e-6452-c8eaf187f992,,106892,19009384,,
913,525b3927-328d-22e2-fc59-1278222f5342,3558b674-952f-aa9d-9e66-b839f6a16316,19080128,2014-04-27,2014-04-27 12:15:03+00:00,2014-04-27,2014-04-27 12:57:39+00:00,2014-04-27,32817,,0,,1,,,,,c33b0931-74a5-eb1d-c071-5e110de1cd97,,314076,19080128,,


In [14]:
def immunization_to_drug_exposure(data, concept):
    drug_exposures = []

    person_id = data['patient']['reference'].split('/')[-1]
    occurrence_date = datetime.strptime(data['occurrenceDateTime'].split('T')[0], '%Y-%m-%d').date()
    occurrence_datetime = datetime.fromisoformat(data['occurrenceDateTime'])
    visit_occurrence_id = data['encounter']['reference'].split('/')[-1] if 'encounter' in data else None

    for vaccineCode in data.get('vaccineCode', {}).get('coding', []):
        drug_exposure = {
            'drug_exposure_id': data['id'],
            'person_id': person_id,
            'drug_concept_id': find_concept_id(
                concept, 
                concept_codes=[vaccineCode['code']], 
                vocabulary_ids=['CVX'], 
                domain_ids=['Drug'], 
                invalid_reason=False, 
                standard_concept='S'
            ),
            'drug_exposure_start_date': occurrence_date,
            'drug_exposure_start_datetime': occurrence_datetime,
            'drug_exposure_end_date': occurrence_date,
            'drug_exposure_end_datetime': occurrence_datetime,
            'verbatim_end_date': occurrence_date,
            'drug_type_concept_id': 32817,
            'stop_reason': 'completed' if data['status'] == 'completed' else None,
            'refills': 0,
            'quantity': 1,
            'days_supply': 1,
            'sig': None,
            'route_concept_id': None,
            'lot_number': None,
            'provider_id': None,
            'visit_occurrence_id': visit_occurrence_id,
            'visit_detail_id': None, 
            'drug_source_value': vaccineCode['code'],
            'drug_source_concept_id': find_concept_id(
                concept, 
                concept_codes=[vaccineCode['code']], 
                vocabulary_ids=['CVX'], 
                domain_ids=['Drug']
            ),
            'route_source_value': None,
            'dose_unit_source_value': None
        }
        drug_exposures.append(drug_exposure)

    return drug_exposures

drug_exposures = []

with open('/workspaces/synthea_dw/data/fhir/Immunization.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        exposures = immunization_to_drug_exposure(data, concept)
        drug_exposures.extend(exposures)

drug_exposure = pd.DataFrame(drug_exposures).drop_duplicates()

drug_exposure.sample(5)

Unnamed: 0,drug_exposure_id,person_id,drug_concept_id,drug_exposure_start_date,drug_exposure_start_datetime,drug_exposure_end_date,drug_exposure_end_datetime,verbatim_end_date,drug_type_concept_id,stop_reason,refills,quantity,days_supply,sig,route_concept_id,lot_number,provider_id,visit_occurrence_id,visit_detail_id,drug_source_value,drug_source_concept_id,route_source_value,dose_unit_source_value
93,5caedc7b-d1bc-9695-39bf-d37431d25447,7a82833f-fae1-d69a-2cbf-69279dac746f,40213154,2019-12-21,2019-12-21 01:46:00+00:00,2019-12-21,2019-12-21 01:46:00+00:00,2019-12-21,32817,completed,0,1,1,,,,,631a6f3a-d7e7-feea-c68e-c42450a3139f,,140,40213154,,
115,3730c7f5-4e6d-9feb-a99a-ccfd847d8703,7a7b7fba-a005-3736-91ef-218a0d2824c5,40213154,2015-06-21,2015-06-21 12:15:03+00:00,2015-06-21,2015-06-21 12:15:03+00:00,2015-06-21,32817,completed,0,1,1,,,,,a3f6c64f-ff9c-5c56-e862-2a67fad87e5b,,140,40213154,,
100,5a0735f2-f97b-6fc8-7048-21d55be7b3f3,408a95f4-02aa-3003-2f09-0241ac3343fb,40213227,2019-03-13,2019-03-13 08:52:48+00:00,2019-03-13,2019-03-13 08:52:48+00:00,2019-03-13,32817,completed,0,1,1,,,,,1833117e-a1ea-b17d-21d6-c5efebb1a23e,,113,40213227,,
133,f32d34f0-6410-1428-4af7-fb08c2ee73e7,3558b674-952f-aa9d-9e66-b839f6a16316,40213154,2018-05-20,2018-05-20 12:15:03+00:00,2018-05-20,2018-05-20 12:15:03+00:00,2018-05-20,32817,completed,0,1,1,,,,,5811fe8c-d0b6-90d6-83d8-cb01048e7388,,140,40213154,,
135,2e4f186e-23f8-7509-fe4a-a57d20252d77,3558b674-952f-aa9d-9e66-b839f6a16316,40213260,2020-05-31,2020-05-31 12:15:03+00:00,2020-05-31,2020-05-31 12:15:03+00:00,2020-05-31,32817,completed,0,1,1,,,,,b5b7200e-e775-7da7-7a06-527c24b6338f,,121,40213260,,


In [15]:
def medicationAdministration_to_drug_exposure(data, concept):
    person_id = data['subject']['reference'].split('/')[-1]
    visit_occurrence_id = data['context']['reference'].split('/')[-1] if 'context' in data else None
    effective_date = datetime.strptime(data['effectiveDateTime'].split('T')[0], '%Y-%m-%d').date()

    drug_exposures = []

    for coding in data['medicationCodeableConcept']['coding']:
        drug_exposure = {
            'drug_exposure_id': data['id'],
            'person_id': person_id,
            'drug_concept_id': find_concept_id(
                concept, 
                concept_codes=[coding['code']], 
                vocabulary_ids=['RxNorm'], 
                domain_ids=['Drug'], 
                invalid_reason=False, 
                standard_concept='S'
            ),
            'drug_exposure_start_date': effective_date,
            'drug_exposure_start_datetime': datetime.fromisoformat(data['effectiveDateTime']),
            'drug_exposure_end_date': effective_date,
            'drug_exposure_end_datetime': datetime.fromisoformat(data['effectiveDateTime']),
            'verbatim_end_date': effective_date,
            'drug_type_concept_id': 32817,
            'stop_reason': 'completed' if data['status'] == 'completed' else None,
            'refills': 0,
            'quantity': 1,
            'days_supply': 1,
            'sig': None,
            'route_concept_id': None,
            'lot_number': None,
            'provider_id': None,
            'visit_occurrence_id': visit_occurrence_id,
            'visit_detail_id': None,
            'drug_source_value': coding['code'],
            'drug_source_concept_id': find_concept_id(
                concept, 
                concept_codes=[coding['code']], 
                vocabulary_ids=['RxNorm'], 
                domain_ids=['Drug']
            ),
            'route_source_value': None,
            'dose_unit_source_value': None
        }
        drug_exposures.append(drug_exposure)

    return drug_exposures

drug_exposures = []

with open('/workspaces/synthea_dw/data/fhir/MedicationAdministration.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        exposure = medicationAdministration_to_drug_exposure(data, concept)
        drug_exposures.extend(exposure)

drug_exposure = pd.DataFrame(drug_exposures).drop_duplicates()

drug_exposure.sample(5)

Unnamed: 0,drug_exposure_id,person_id,drug_concept_id,drug_exposure_start_date,drug_exposure_start_datetime,drug_exposure_end_date,drug_exposure_end_datetime,verbatim_end_date,drug_type_concept_id,stop_reason,refills,quantity,days_supply,sig,route_concept_id,lot_number,provider_id,visit_occurrence_id,visit_detail_id,drug_source_value,drug_source_concept_id,route_source_value,dose_unit_source_value
16,4a0d10df-4176-d3ee-dd72-d45d79755325,32ee64c2-1585-d7ad-c53f-9ad739c676cf,40220871,2014-09-27,2014-09-27 02:24:05+00:00,2014-09-27,2014-09-27 02:24:05+00:00,2014-09-27,32817,completed,0,1,1,,,,,77914142-910f-4c80-86b1-b561ed23c11a,,1803932,40220871,,
23,8b67c040-1f03-8d02-51a1-a0759effd771,c95d085d-2249-b616-7668-88cc9a0c11bd,35603924,2020-04-15,2020-04-15 05:24:15+00:00,2020-04-15,2020-04-15 05:24:15+00:00,2020-04-15,32817,completed,0,1,1,,,,,99106bf2-e55d-9789-b7de-9574b1f3d9a6,,1736776,35603924,,
20,ac34e95f-5607-1622-9660-26fda39e241b,32ee64c2-1585-d7ad-c53f-9ad739c676cf,40220871,2014-10-28,2014-10-28 18:24:05+00:00,2014-10-28,2014-10-28 18:24:05+00:00,2014-10-28,32817,completed,0,1,1,,,,,e6aad50e-aba3-e7aa-b57e-26966bcb194d,,1803932,40220871,,
29,8526cf37-5480-a771-1e48-ef28218fb0e3,7a7b7fba-a005-3736-91ef-218a0d2824c5,19010292,2023-02-06,2023-02-06 14:40:47+00:00,2023-02-06,2023-02-06 14:40:47+00:00,2023-02-06,32817,completed,0,1,1,,,,,6421627e-7678-2429-082e-624c6e7c6c8a,,108515,19010292,,
2,119d7569-eec2-214e-d1fc-38109ec1aa9b,c95d085d-2249-b616-7668-88cc9a0c11bd,40220871,2019-11-15,2019-11-15 13:24:15+00:00,2019-11-15,2019-11-15 13:24:15+00:00,2019-11-15,32817,completed,0,1,1,,,,,9286d8fc-e2fc-9d91-7b25-6bd61131e0d4,,1803932,40220871,,


In [16]:
def medicationRequest_to_drug_exposure(data, concept):
    drug_exposures = []

    if 'medicationCodeableConcept' in data and 'coding' in data['medicationCodeableConcept']:
        person_id = data['subject']['reference'].split('/')[-1]
        provider_id = data['requester']['reference'].split('/')[-1] if 'requester' in data else None
        visit_occurrence_id = data['encounter']['reference'].split('/')[-1] if 'encounter' in data else None
        authored_date = datetime.strptime(data['authoredOn'].split('T')[0], '%Y-%m-%d').date()

        for coding in data['medicationCodeableConcept']['coding']:
            drug_exposure = {
                'drug_exposure_id': data['id'],
                'person_id': person_id,
                'drug_concept_id': find_concept_id(
                    concept, 
                    concept_codes=[coding['code']], 
                    vocabulary_ids=['RxNorm'], 
                    domain_ids=['Drug'], 
                    invalid_reason=False, 
                    standard_concept='S'
                ),
                'drug_exposure_start_date': authored_date,
                'drug_exposure_start_datetime': datetime.fromisoformat(data['authoredOn']),
                'drug_exposure_end_date': pd.NA,
                'drug_exposure_end_datetime': pd.NA,
                'verbatim_end_date': pd.NA,
                'drug_type_concept_id': 32817,
                'stop_reason': 'stopped' if data['status'] == 'stopped' else None,
                'refills': pd.NA,
                'quantity': pd.NA,
                'days_supply': pd.NA,
                'sig': None,
                'route_concept_id': None,
                'lot_number': None,
                'provider_id': provider_id,
                'visit_occurrence_id': visit_occurrence_id,
                'visit_detail_id': None,
                'drug_source_value': coding['code'],
                'drug_source_concept_id': find_concept_id(
                    concept, 
                    concept_codes=[coding['code']], 
                    vocabulary_ids=['RxNorm'], 
                    domain_ids=['Drug']
                ),
                'route_source_value': None,
                'dose_unit_source_value': None
            }
            drug_exposures.append(drug_exposure)

    return drug_exposures

drug_exposures = []

with open('/workspaces/synthea_dw/data/fhir/MedicationRequest.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        exposure = medicationRequest_to_drug_exposure(data, concept)
        drug_exposures.extend(exposure)

drug_exposure = pd.DataFrame(drug_exposures).drop_duplicates()

drug_exposure.sample(5)

Unnamed: 0,drug_exposure_id,person_id,drug_concept_id,drug_exposure_start_date,drug_exposure_start_datetime,drug_exposure_end_date,drug_exposure_end_datetime,verbatim_end_date,drug_type_concept_id,stop_reason,refills,quantity,days_supply,sig,route_concept_id,lot_number,provider_id,visit_occurrence_id,visit_detail_id,drug_source_value,drug_source_concept_id,route_source_value,dose_unit_source_value
302,fdd33659-b816-fd0b-9db0-93f05b164cb9,7a82833f-fae1-d69a-2cbf-69279dac746f,19009384,2019-12-21,2019-12-21 01:46:00+00:00,,,,32817,stopped,,,,,,,14f7c5ac-ea56-30d9-ac3c-d6e2ca22366f,631a6f3a-d7e7-feea-c68e-c42450a3139f,,106892,19009384,,
661,9ed3ebbe-df25-73f9-4af3-dd63129763e7,7a7b7fba-a005-3736-91ef-218a0d2824c5,19009384,2019-05-26,2019-05-26 12:15:03+00:00,,,,32817,stopped,,,,,,,3d336284-20e1-386c-b13d-dd0b4e639693,9922eab4-63d9-426a-757b-5b551ffbf27f,,106892,19009384,,
478,8c5d32e7-51f0-5b86-1660-34dde75a9c6e,7a7b7fba-a005-3736-91ef-218a0d2824c5,19009384,2014-08-24,2014-08-24 12:15:03+00:00,,,,32817,stopped,,,,,,,299bc447-29c4-3c98-948b-ea0891c97d89,6f064951-af37-9f08-b94b-cd7e08606773,,106892,19009384,,
508,120f94a6-df91-9c19-d322-67322a428f17,7a7b7fba-a005-3736-91ef-218a0d2824c5,19009384,2015-08-30,2015-08-30 12:15:03+00:00,,,,32817,stopped,,,,,,,299bc447-29c4-3c98-948b-ea0891c97d89,db754312-1df1-1d26-cacf-262b99179dfd,,106892,19009384,,
599,ac11ccf0-773d-b6cc-2683-b14282f7c3b3,7a7b7fba-a005-3736-91ef-218a0d2824c5,19041324,2017-05-21,2017-05-21 12:15:03+00:00,,,,32817,stopped,,,,,,,299bc447-29c4-3c98-948b-ea0891c97d89,aefa52ba-1a7e-60f7-3650-3122ae68d02f,,209387,19041324,,


### procedure_occurrence

In [17]:
def carePlan_to_procedure_occurrence(data, concept):
    procedure_occurrences = []

    for category in data.get('category', []):
        for coding in category.get('coding', []):
            if 'display' in coding:
                procedure = {
                    'procedure_occurrence_id': data['id'],
                    'person_id': data['subject']['reference'].split('/')[-1],
                    'procedure_concept_id': find_concept_id(
                        concept, 
                        concept_codes=[coding['code']], 
                        vocabulary_ids=['SNOMED'], 
                        domain_ids=['Procedure'], 
                        invalid_reason=False, 
                        standard_concept='S', 
                        concept_class_ids=['Procedure']
                    ),
                    'procedure_date': datetime.strptime(data['period']['start'].split('T')[0], '%Y-%m-%d').date(),
                    'procedure_datetime': datetime.fromisoformat(data['period']['start']),
                    'procedure_end_date': None,
                    'procedure_end_datetime': None,
                    'procedure_type_concept_id': 32817,
                    'modifier_concept_id': pd.NA,
                    'quantity': 1,
                    'provider_id': pd.NA,
                    'visit_occurrence_id': data['encounter']['reference'].split('/')[-1],
                    'visit_detail_id': pd.NA,
                    'procedure_source_value': coding['code'],
                    'procedure_source_concept_id': find_concept_id(
                        concept, 
                        concept_codes=[coding['code']], 
                        vocabulary_ids=['SNOMED'], 
                        domain_ids=['Procedure'], 
                        concept_class_ids=['Procedure']
                    ),
                    'modifier_source_value': None
                }
                procedure_occurrences.append(procedure)

    return procedure_occurrences

procedure_occurrences = []

with open('/workspaces/synthea_dw/data/fhir/CarePlan.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        procedures = carePlan_to_procedure_occurrence(data, concept)
        if procedures:
            procedure_occurrences.extend(procedures)

procedure_occurrence = pd.DataFrame(procedure_occurrences).drop_duplicates()
procedure_occurrence = procedure_occurrence[procedure_occurrence['procedure_source_concept_id'] != 0]

procedure_occurrence.sample(5)

Unnamed: 0,procedure_occurrence_id,person_id,procedure_concept_id,procedure_date,procedure_datetime,procedure_end_date,procedure_end_datetime,procedure_type_concept_id,modifier_concept_id,quantity,provider_id,visit_occurrence_id,visit_detail_id,procedure_source_value,procedure_source_concept_id,modifier_source_value
34,74198e69-03a8-295c-d1f1-359e3ce3ddaa,c95d085d-2249-b616-7668-88cc9a0c11bd,4188631,2021-09-22,2021-09-22 09:41:51+00:00,,,32817,,1,,9f7b7f00-d0a0-9adc-e47b-b69a21c0dfb6,,47387005,4188631,
43,97b05a3b-8747-6c15-5585-099bf4c86ef6,3558b674-952f-aa9d-9e66-b839f6a16316,40481459,1998-01-25,1998-01-25 12:15:03+00:00,,,32817,,1,,5296ed0f-148b-e3ed-941d-9fd360122d1f,,443402002,40481459,
22,68809caa-a35f-2c5b-8b48-4c3e2083cd77,32ee64c2-1585-d7ad-c53f-9ad739c676cf,4203780,2009-04-09,2009-04-09 19:58:58+00:00,,,32817,,1,,eadeb17c-ec14-a5b9-66d1-2ebf911b0028,,53950000,4203780,
23,fbb1d178-84ae-727e-3d8e-cf0804f8408a,a5a02d31-a93c-7b72-7e1c-a9cbfa64874d,4203780,2009-11-29,2009-11-29 04:08:38+00:00,,,32817,,1,,99cadc54-d5cc-769a-5e5c-e742e81a5f55,,53950000,4203780,
4,b3eeab01-02ef-1c02-70b3-8b84ad3e41d0,a5a02d31-a93c-7b72-7e1c-a9cbfa64874d,37397658,1964-12-30,1964-12-30 08:52:48+00:00,,,32817,,1,,da8a1dc0-6bc8-9f29-4b96-887b590f2c8c,,718361005,37397658,


In [19]:
def imagingStudy_to_procedure_occurrence(data, concept):
    procedure_occurrences = []

    person_id = data['subject']['reference'].split('/')[-1]
    procedure_date = datetime.strptime(data['started'].split('T')[0], '%Y-%m-%d').date()
    procedure_datetime = datetime.fromisoformat(data['started'])
    visit_occurrence_id = data['encounter']['reference'].split('/')[-1] if 'encounter' in data else None
    numberOfInstances = data['numberOfInstances'] if 'numberOfInstances' in data else None

    for procedureCode in data.get('procedureCode', []):
        for coding in procedureCode.get('coding', []):
            modifier_code = data['series'][0]['bodySite']['code'] if 'series' in data and 'bodySite' in data['series'][0] else None

            procedure_occurrence = {
                'procedure_occurrence_id': data['id'],
                'person_id': person_id,
                'procedure_concept_id': find_concept_id(
                    concept, 
                    concept_codes=[coding['code']], 
                    vocabulary_ids=['SNOMED'], 
                    domain_ids=['Procedure'], 
                    invalid_reason=False, 
                    standard_concept='S'
                ),
                'procedure_date': procedure_date,
                'procedure_datetime': procedure_datetime,
                'procedure_end_date': None,
                'procedure_end_datetime': None,
                'procedure_type_concept_id': 32817,
                'modifier_concept_id': find_concept_id(
                    concept, 
                    concept_codes=[modifier_code], 
                    vocabulary_ids=['SNOMED'], 
                    domain_ids=['Spec Anatomic Site'], 
                    invalid_reason=False, 
                    standard_concept='S'
                ),
                'quantity': numberOfInstances,
                'provider_id': pd.NA,
                'visit_occurrence_id': visit_occurrence_id,
                'visit_detail_id': pd.NA,
                'procedure_source_value': coding['display'] if 'display' in coding else None,
                'procedure_source_concept_id': find_concept_id(
                    concept, 
                    concept_codes=[coding['code']], 
                    vocabulary_ids=['SNOMED'], 
                    domain_ids=['Procedure'], 
                    invalid_reason=False, 
                    standard_concept='S'
                ),
                'modifier_source_value': modifier_code
            }

            procedure_occurrences.append(procedure_occurrence)

    return procedure_occurrences

procedure_occurrences = []

with open('/workspaces/synthea_dw/data/fhir/ImagingStudy.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        procedures = imagingStudy_to_procedure_occurrence(data, concept)
        procedure_occurrences.extend(procedures)

procedure_occurrence = pd.DataFrame(procedure_occurrences).drop_duplicates()

procedure_occurrence.sample(3)

Unnamed: 0,procedure_occurrence_id,person_id,procedure_concept_id,procedure_date,procedure_datetime,procedure_end_date,procedure_end_datetime,procedure_type_concept_id,modifier_concept_id,quantity,provider_id,visit_occurrence_id,visit_detail_id,procedure_source_value,procedure_source_concept_id,modifier_source_value
2,45e6a931-2818-70ff-6ffe-722cccfa3806,a5a02d31-a93c-7b72-7e1c-a9cbfa64874d,4163872,2017-02-23,2017-02-23 09:21:18+00:00,,,32817,4199473,1,,8e869ca3-2eb9-fefc-4cb4-8dbba3a9e3bb,,Plain chest X-ray (procedure),4163872,51185008
1,2b8846b2-f8c5-c0c0-258b-f008583eeae4,a5a02d31-a93c-7b72-7e1c-a9cbfa64874d,4230911,2016-08-27,2016-08-27 09:21:18+00:00,,,32817,4217142,1,,2b03e44b-e68c-a115-c1ce-60bea93772ab,,Echocardiography (procedure),4230911,80891009
0,c7fdab53-eff8-78fe-69f2-27aae766a2fb,a5a02d31-a93c-7b72-7e1c-a9cbfa64874d,4163872,2016-08-08,2016-08-08 09:01:29+00:00,,,32817,4199473,1,,86f37e05-ec6f-1a50-7488-7dcd7206c736,,Plain chest X-ray (procedure),4163872,51185008


In [None]:
def procedure_to_procedure_occurrence(data, concept):
    procedure_occurrence_rows = []

    person_id = data['subject']['reference'].split('/')[-1]
    procedure_code = data['code']['coding'][0]['code']
    procedure_date = datetime.strptime(data['performedPeriod']['start'].split('T')[0], '%Y-%m-%d').date()
    procedure_datetime = datetime.fromisoformat(data['performedPeriod']['start'])
    procedure_end_date = datetime.strptime(data['performedPeriod']['end'].split('T')[0], '%Y-%m-%d').date() if 'end' in data['performedPeriod'] else None
    procedure_end_datetime = datetime.fromisoformat(data['performedPeriod']['end']) if 'end' in data['performedPeriod'] else None
    visit_occurrence_id = data['encounter']['reference'].split('/')[-1] if 'encounter' in data else None

    procedure_occurrence = {
        'procedure_occurrence_id': data['id'],
        'person_id': person_id,
        'procedure_concept_id': find_concept_id(
            concept, 
            concept_codes=[procedure_code], 
            vocabulary_ids=['SNOMED'], 
            domain_ids=['Procedure'], 
            invalid_reason=False, 
            standard_concept='S'
        ),
        'procedure_date': procedure_date,
        'procedure_datetime': procedure_datetime,
        'procedure_end_date': procedure_end_date,
        'procedure_end_datetime': procedure_end_datetime,
        'procedure_type_concept_id': 32817,
        'modifier_concept_id': 0,
        'quantity': 1,
        'provider_id': pd.NA,
        'visit_occurrence_id': visit_occurrence_id,
        'visit_detail_id': pd.NA,
        'procedure_source_value': procedure_code,
        'procedure_source_concept_id': find_concept_id(
            concept, 
            concept_codes=[procedure_code], 
            vocabulary_ids=['SNOMED'], 
            domain_ids=['Procedure']
        ),
        'modifier_source_value': None
    }

    procedure_occurrence_rows.append(procedure_occurrence)

    return procedure_occurrence_rows

procedure_occurrence_rows = []

with open('/workspaces/synthea_dw/data/fhir/Procedure.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        procedure_occurrences = procedure_to_procedure_occurrence(data, concept)
        procedure_occurrence_rows.extend(procedure_occurrences)

procedure_occurrence = pd.DataFrame(procedure_occurrence_rows).drop_duplicates()
procedure_occurrence = procedure_occurrence[procedure_occurrence['procedure_source_concept_id'] != 0]

procedure_occurrence.sample(5)


### device_exposure

In [20]:
def device_to_device_exposure(data, concept):
    device_exposures = []

    person_id = data['patient']['reference'].split('/')[-1]
    device_code = data['type']['coding'][0]['code']

    device_exposure = {
        'device_exposure_id': data['id'],
        'person_id': person_id,
        'device_concept_id': find_concept_id(
            concept, 
            concept_codes=[device_code], 
            vocabulary_ids=['SNOMED'], 
            domain_ids=['Device'], 
            invalid_reason=False, 
            standard_concept='S'
        ),
        'device_exposure_start_date': datetime.strptime(data['manufactureDate'].split('T')[0], '%Y-%m-%d').date(),
        'device_exposure_start_datetime': datetime.fromisoformat(data['manufactureDate']),
        'device_exposure_end_date': None,
        'device_exposure_end_datetime': None,
        'device_type_concept_id': 32817,
        'unique_device_id': data.get('distinctIdentifier', None),
        'production_id': data['udiCarrier'][0]['carrierHRF'] if data.get('udiCarrier') else None,
        'quantity': 1,
        'provider_id': pd.NA,
        'visit_occurrence_id': pd.NA,
        'visit_detail_id': pd.NA,
        'device_source_value': device_code,
        'device_source_concept_id': find_concept_id(
            concept, 
            concept_codes=[device_code], 
            vocabulary_ids=['SNOMED'], 
            domain_ids=['Device']
        ),
        'unit_concept_id': pd.NA,
        'unit_source_value': None,
        'unit_source_concept_id': pd.NA
    }
    device_exposures.append(device_exposure)

    return device_exposures

device_exposures = []

with open('/workspaces/synthea_dw/data/fhir/Device.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        exposures = device_to_device_exposure(data, concept)
        device_exposures.extend(exposures)

device_exposure = pd.DataFrame(device_exposures).drop_duplicates()

device_exposure.sample(5)

Unnamed: 0,device_exposure_id,person_id,device_concept_id,device_exposure_start_date,device_exposure_start_datetime,device_exposure_end_date,device_exposure_end_datetime,device_type_concept_id,unique_device_id,production_id,quantity,provider_id,visit_occurrence_id,visit_detail_id,device_source_value,device_source_concept_id,unit_concept_id,unit_source_value,unit_source_concept_id
17,a61a17f9-7e9b-6e6c-ccef-d5564e2c6b83,4390395b-5a78-2005-80b7-5ebd62b595c9,4045112,2019-07-17,2019-07-17 07:11:38+00:00,,,32817,90520771158622,(01)90520771158622(11)190717(17)440731(10)4736...,1,,,,228869008,4045112,,,
21,f8351991-9345-b048-2d68-4e8eb746f95a,a5a02d31-a93c-7b72-7e1c-a9cbfa64874d,45768176,2017-02-15,2017-02-15 08:52:48+00:00,,,32817,14700645843961,(01)14700645843961(11)170215(17)420302(10)1560...,1,,,,706112002,45768176,,,
19,41e6905a-fb29-7628-764e-c9b6766f24ec,c86bea4c-5647-c8c2-35c5-cb08246ded70,45768203,2019-07-26,2019-07-26 16:31:24+00:00,,,32817,73127704662188,(01)73127704662188(11)190726(17)440809(10)4589...,1,,,,706180003,45768203,,,
4,d5b3e3ba-3862-f231-0c08-ed60d239d90a,32ee64c2-1585-d7ad-c53f-9ad739c676cf,4224372,2001-01-07,2001-01-07 12:15:03+00:00,,,32817,71173175741080,(01)71173175741080(11)010107(17)260122(10)2084...,1,,,,337414009,4224372,,,
16,265ea37b-4cd5-1925-895e-5ba59832ea74,c95d085d-2249-b616-7668-88cc9a0c11bd,4045112,2016-08-28,2016-08-28 09:41:51+00:00,,,32817,34387367351296,(01)34387367351296(11)160828(17)410912(10)1692...,1,,,,228869008,4045112,,,


In [50]:
def supplyDelivery_to_device_exposure(data, concept):
    device_exposure_rows = []

    person_id = data['patient']['reference'].split('/')[-1]
    device_code = data['suppliedItem']['itemCodeableConcept']['coding'][0]['code']
    quantity = data['suppliedItem']['quantity']['value'] if 'quantity' in data['suppliedItem'] else None
    occurrence_date = datetime.strptime(data['occurrenceDateTime'].split('T')[0], '%Y-%m-%d').date()
    occurrence_datetime = datetime.fromisoformat(data['occurrenceDateTime'])

    device_exposure = {
        'device_exposure_id': data['id'],
        'person_id': person_id,
        'device_concept_id': find_concept_id(
            concept, 
            concept_codes=[device_code], 
            vocabulary_ids=['SNOMED'], 
            domain_ids=['Device'], 
            invalid_reason=False, 
            standard_concept='S'
        ),
        'device_exposure_start_date': occurrence_date,
        'device_exposure_start_datetime': occurrence_datetime,
        'device_exposure_end_date': occurrence_date,
        'device_exposure_end_datetime': occurrence_datetime,
        'device_type_concept_id': 32817,
        'unique_device_id': None,
        'production_id': None,
        'quantity': quantity,
        'provider_id': pd.NA,
        'visit_occurrence_id': pd.NA,
        'visit_detail_id': pd.NA,
        'device_source_value': device_code,
        'device_source_concept_id': find_concept_id(
            concept, 
            concept_codes=[device_code], 
            vocabulary_ids=['SNOMED'], 
            domain_ids=['Device']
        ),
        'unit_concept_id': pd.NA,
        'unit_source_value': None,
        'unit_source_concept_id': pd.NA
    }

    device_exposure_rows.append(device_exposure)

    return device_exposure_rows

# Example usage
device_exposure_rows = []

with open('/workspaces/synthea_dw/data/fhir/SupplyDelivery.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        exposures = supplyDelivery_to_device_exposure(data, concept)
        device_exposure_rows.extend(exposures)

device_exposure = pd.DataFrame(device_exposure_rows).drop_duplicates()

device_exposure.sample(5)

Unnamed: 0,device_exposure_id,person_id,device_concept_id,device_exposure_start_date,device_exposure_start_datetime,device_exposure_end_date,device_exposure_end_datetime,device_type_concept_id,unique_device_id,production_id,quantity,provider_id,visit_occurrence_id,visit_detail_id,device_source_value,device_source_concept_id,unit_concept_id,unit_source_value,unit_source_concept_id
17,c8b44962-4b65-508b-1170-67a495baed73,c86bea4c-5647-c8c2-35c5-cb08246ded70,45758780,1992-03-13,1992-03-13 16:14:05+00:00,1992-03-13,1992-03-13 16:14:05+00:00,32817,,,1,,,,463659001,45758780,,,
9,c1c173fa-0b6b-278f-771d-226a0757c82b,c86bea4c-5647-c8c2-35c5-cb08246ded70,4322046,1992-02-24,1992-02-24 23:43:10+00:00,1992-02-24,1992-02-24 23:43:10+00:00,32817,,,5,,,,972002,4322046,,,
33,d0f7cb61-3aeb-dbd7-f315-8ebcf2bd925d,c86bea4c-5647-c8c2-35c5-cb08246ded70,4322046,2009-06-19,2009-06-19 16:22:45+00:00,2009-06-19,2009-06-19 16:22:45+00:00,32817,,,5,,,,972002,4322046,,,
35,63c31a71-e158-1e8e-a857-58235d33011b,c86bea4c-5647-c8c2-35c5-cb08246ded70,45768222,2009-06-19,2009-06-19 16:22:45+00:00,2009-06-19,2009-06-19 16:22:45+00:00,32817,,,1,,,,706226000,45768222,,,
138,29bd7ea6-5244-9132-ddd5-acdbd813892c,c86bea4c-5647-c8c2-35c5-cb08246ded70,45761494,2022-09-02,2022-09-02 16:15:55+00:00,2022-09-02,2022-09-02 16:15:55+00:00,32817,,,1,,,,467645007,45761494,,,


### measurement

In [49]:
def observation_to_measurement(data, concept):
    measurement_rows = []
    person_id = data['subject']['reference'].split('/')[-1]
    measurement_date = datetime.strptime(data['effectiveDateTime'].split('T')[0], '%Y-%m-%d').date()
    measurement_datetime = datetime.fromisoformat(data['effectiveDateTime'])
    measurement_time = data['effectiveDateTime'].split('T')[1] if 'effectiveDateTime' in data else None
    visit_occurrence_id = data['encounter']['reference'].split('/')[-1] if 'encounter' in data else None

    # Function to process a single measurement
    def process_measurement(code, valueQuantity, valueCodeableConcept=None):
        value_as_number = float(valueQuantity['value']) if valueQuantity and 'value' in valueQuantity else pd.NA
        value_as_concept_id = None
        if valueCodeableConcept:
            value_as_concept_id = find_concept_id(
                concept, 
                concept_codes=[valueCodeableConcept['coding'][0]['code']], 
                vocabulary_ids=['LOINC'], 
                domain_ids=['Observation'], 
                invalid_reason=False, 
                standard_concept='S'
            )

        return {
            'measurement_id': data['id'],
            'person_id': person_id,
            'measurement_concept_id': find_concept_id(
                concept, 
                concept_codes=[code], 
                vocabulary_ids=['LOINC'], 
                domain_ids=['Measurement'], 
                invalid_reason=False, 
                standard_concept='S'
            ),
            'measurement_date': measurement_date,
            'measurement_datetime': measurement_datetime,
            'measurement_time': measurement_time,
            'measurement_type_concept_id': 32817,
            'operator_concept_id': 4172703,
            'value_as_number': value_as_number,
            'value_as_concept_id': value_as_concept_id,
            'unit_concept_id': find_concept_id(
                concept, 
                concept_codes=[valueQuantity['code']] if valueQuantity and 'code' in valueQuantity else None, 
                vocabulary_ids=['UCUM'], 
                domain_ids=['Unit'], 
                invalid_reason=False, 
                standard_concept='S'
            ),
            'range_low': pd.NA,
            'range_high': pd.NA,
            'provider_id': pd.NA,
            'visit_occurrence_id': visit_occurrence_id,
            'visit_detail_id': pd.NA,
            'measurement_source_value': code,
            'measurement_source_concept_id': find_concept_id(
                concept, 
                concept_codes=[code], 
                vocabulary_ids=['LOINC'], 
                domain_ids=['Measurement']
            ),
            'unit_source_value': valueQuantity['code'] if valueQuantity and 'code' in valueQuantity else None,
            'unit_source_concept_id': find_concept_id(
                concept, 
                concept_codes=[valueQuantity['code']] if valueQuantity and 'code' in valueQuantity else None, 
                vocabulary_ids=['UCUM'], 
                domain_ids=['Unit']
            ),
            'value_source_value': valueQuantity['value'] if valueQuantity and 'value' in valueQuantity else None,
            'measurement_event_id': pd.NA,
            'meas_event_field_concept_id': pd.NA
        }

    # Check if the observation is in the 'component' format
    if 'component' in data:
        for comp in data['component']:
            code = comp['code']['coding'][0]['code']
            valueQuantity = comp.get('valueQuantity', None)
            valueCodeableConcept = comp.get('valueCodeableConcept', None)
            measurement = process_measurement(code, valueQuantity, valueCodeableConcept)
            measurement_rows.append(measurement)
    else:
        # Process the single measurement
        code = data['code']['coding'][0]['code']
        valueQuantity = data.get('valueQuantity', None)
        valueCodeableConcept = data.get('valueCodeableConcept', None)
        measurement = process_measurement(code, valueQuantity, valueCodeableConcept)
        measurement_rows.append(measurement)

    return measurement_rows

# Example usage
measurement_rows = []

with open('/workspaces/synthea_dw/data/fhir/Observation.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        measurements = observation_to_measurement(data, concept)
        measurement_rows.extend(measurements)

measurement = pd.DataFrame(measurement_rows).drop_duplicates()
measurement = measurement[measurement['measurement_source_concept_id'] != 0]

measurement.sample(5)

Unnamed: 0,measurement_id,person_id,measurement_concept_id,measurement_date,measurement_datetime,measurement_time,measurement_type_concept_id,operator_concept_id,value_as_number,value_as_concept_id,unit_concept_id,range_low,range_high,provider_id,visit_occurrence_id,visit_detail_id,measurement_source_value,measurement_source_concept_id,unit_source_value,unit_source_concept_id,value_source_value,measurement_event_id,meas_event_field_concept_id
12541,fcd6d2b1-fced-257b-1a6f-ab74febca45d,7a7b7fba-a005-3736-91ef-218a0d2824c5,3016723,2014-08-24,2014-08-24 12:15:03+00:00,12:15:03+00:00,32817,4172703,1.923,,8840,,,,6f064951-af37-9f08-b94b-cd7e08606773,,2160-0,3016723,mg/dL,8840,1.923,,
2114,3404432a-e59c-7c16-9fc4-7cdf0e5bac50,c86bea4c-5647-c8c2-35c5-cb08246ded70,3009966,2017-08-04,2017-08-04 16:03:44+00:00,16:03:44+00:00,32817,4172703,83.31,,8840,,,,ba72e4e6-3689-b835-5122-f53c884d7ffb,,18262-6,3009966,mg/dL,8840,83.31,,
13168,a5660373-e330-980a-7346-45516fae314e,7a7b7fba-a005-3736-91ef-218a0d2824c5,3006513,2015-08-02,2015-08-02 12:15:03+00:00,12:15:03+00:00,32817,4172703,0.38503,,8840,,,,4fea9d59-edde-104c-7777-08d3c5107b04,,20505-4,3006513,mg/dL,8840,0.38503,,
16885,60f86c33-3ba6-a072-4de7-cd0660e551b1,7a7b7fba-a005-3736-91ef-218a0d2824c5,3035350,2018-06-17,2018-06-17 12:15:03+00:00,12:15:03+00:00,32817,4172703,,0.0,8554,,,,26aeae98-7c52-12d7-c292-08316dfc9d99,,2514-8,3035350,,8554,,,
15808,a16f6f53-9712-83cb-8e63-a40fc56e5524,7a7b7fba-a005-3736-91ef-218a0d2824c5,3022192,2017-09-03,2017-09-03 12:15:03+00:00,12:15:03+00:00,32817,4172703,166.41,,8840,,,,2395be97-79d6-6ba0-5948-f53558b2cc87,,2571-8,3022192,mg/dL,8840,166.41,,


In [None]:
def procedure_to_measurement(data, concept):
    measurement_rows = []

    person_id = data['subject']['reference'].split('/')[-1]
    procedure_code = data['code']['coding'][0]['code']
    measurement_date = datetime.strptime(data['performedPeriod']['start'].split('T')[0], '%Y-%m-%d').date()
    measurement_datetime = datetime.fromisoformat(data['performedPeriod']['start'])
    measurement_time = data['performedPeriod']['start'].split('T')[1] if 'performedPeriod' in data and 'start' in data['performedPeriod'] else None
    visit_occurrence_id = data['encounter']['reference'].split('/')[-1] if 'encounter' in data else None

    measurement = {
        'measurement_id': data['id'],
        'person_id': person_id,
        'measurement_concept_id': find_concept_id(
            concept, 
            concept_codes=[procedure_code], 
            vocabulary_ids=['SNOMED'], 
            domain_ids=['Measurement'], 
            invalid_reason=False, 
            standard_concept='S'
        ),
        'measurement_date': measurement_date,
        'measurement_datetime': measurement_datetime,
        'measurement_time': measurement_time,
        'measurement_type_concept_id': 32817,
        'operator_concept_id': 4172703,
        'value_as_number': pd.NA,
        'value_as_concept_id': 0,
        'unit_concept_id': 0,
        'range_low': pd.NA,
        'range_high': pd.NA,
        'provider_id': pd.NA,
        'visit_occurrence_id': visit_occurrence_id,
        'visit_detail_id': pd.NA,
        'measurement_source_value': procedure_code,
        'measurement_source_concept_id': find_concept_id(
            concept, 
            concept_codes=[procedure_code], 
            vocabulary_ids=['SNOMED'], 
            domain_ids=['Measurement']
        ),
        'unit_source_value': None,
        'unit_source_concept_id': 0,
        'value_source_value': None,
        'measurement_event_id': pd.NA,
        'meas_event_field_concept_id': pd.NA
    }
    measurement_rows.append(measurement)

    return measurement_rows

measurement_rows = []

with open('/workspaces/synthea_dw/data/fhir/Procedure.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        measurements = procedure_to_measurement(data, concept)
        measurement_rows.extend(measurements)

measurement = pd.DataFrame(measurement_rows).drop_duplicates()
measurement = measurement[measurement['measurement_source_concept_id'] != 0]

measurement.sample(5)

### observation

In [None]:
def allergyIntolerance_to_observation(data, concept):
    if data['code']['coding'][0]['code'] != '419199007':

        observations = []

        for reaction in data.get('reaction', []):
            observation = {
                'observation_id': data['id'],
                'person_id': data['patient']['reference'].split('/')[-1],
                'observation_concept_id': 4169307,
                'observation_date': datetime.strptime(data['recordedDate'].split('T')[0], '%Y-%m-%d').date(),
                'observation_datetime': datetime.fromisoformat(data['recordedDate']),
                'observation_type_concept_id': 32817,
                'value_as_number': None,
                'value_as_string': None,
                'value_as_concept_id': find_concept_id(
                    concept, 
                    concept_codes=[data['code']['coding'][0]['code']], 
                    vocabulary_ids=['SNOMED'], 
                    domain_ids=['Observation'], 
                    invalid_reason=False, 
                    standard_concept='S', 
                    concept_class_ids=['Substance']
                ),
                'qualifier_concept_id': find_concept_id(
                    concept, 
                    concept_names=[data['criticality'].capitalize()], 
                    vocabulary_ids=['SNOMED'], 
                    domain_ids=['Meas Value'], 
                    invalid_reason=False, 
                    standard_concept='S',
                    concept_class_ids=['Qualifier Value']
                ),
                'unit_concept_id': pd.NA,
                'provider_id': pd.NA,
                'visit_occurrence_id': None,
                'visit_detail_id': None,
                'observation_source_value': None,
                'observation_source_concept_id': 4169307,
                'unit_source_value': None,
                'qualifier_source_value': data['criticality'],
                'value_source_value': data['code']['coding'][0]['code'],
                'observation_event_id': None,
                'obs_event_field_concept_id': None
            }
            observations.append(observation)

        return observations

    return None

observation_rows = []

with open('/workspaces/synthea_dw/data/fhir/AllergyIntolerance.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        rows = allergyIntolerance_to_observation(data, concept)
        if rows:
            observation_rows.extend(rows)

observation = pd.DataFrame(observation_rows).drop_duplicates()
observation = observation[observation['observation_source_concept_id'] != 0]

observation.sample(5)

Unnamed: 0,observation_id,person_id,observation_concept_id,observation_date,observation_datetime,observation_type_concept_id,value_as_number,value_as_string,value_as_concept_id,qualifier_concept_id,unit_concept_id,provider_id,visit_occurrence_id,visit_detail_id,observation_source_value,observation_source_concept_id,unit_source_value,qualifier_source_value,value_source_value,observation_event_id,obs_event_field_concept_id
15,55432010-264f-b18a-db85-a3343f889306,20485b01-bb18-b47a-0c59-faa30afd0af1,4169307,1998-02-06,1998-02-06 10:15:39+00:00,32817,,,4138133,4267416,,,,,,4169307,,low,264287008,,
10,20f213c0-cd90-624d-244f-f22934a2821e,8d3c566e-e2f0-3f11-eee2-dce3c68c498d,4169307,2021-04-14,2021-04-14 20:00:56+00:00,32817,,,42536288,4267416,,,,,,4169307,,low,735029006,,
8,38ddacbe-ac7b-e213-7ed0-8fcdbdb1ea67,8d3c566e-e2f0-3f11-eee2-dce3c68c498d,4169307,2021-04-14,2021-04-14 20:00:56+00:00,32817,,,4138133,4267416,,,,,,4169307,,low,264287008,,
19,fadc4099-13de-7fa7-0e17-96c540630ab6,20485b01-bb18-b47a-0c59-faa30afd0af1,4169307,1998-02-06,1998-02-06 10:15:39+00:00,32817,,,42538933,4267416,,,,,,4169307,,low,762952008,,
4,39fe9eb5-35e2-a2ff-b7c2-e3465745766a,79d8982d-fef7-7135-181a-0fb6af4a0e63,4169307,1967-11-26,1967-11-26 04:21:04+00:00,32817,,,42539493,4267416,,,,,,4169307,,low,735971005,,


In [None]:
def carePlan_to_observation(data, concept):
    observation_rows = []

    for category in data.get('category', []):
        for coding in category.get('coding', []):
            if 'display' in coding:
                observation = {
                    'observation_id': data['id'],
                    'person_id': data['subject']['reference'].split('/')[-1],
                    'observation_concept_id': find_concept_id(
                        concept, 
                        concept_codes=[coding['code']], 
                        vocabulary_ids=['SNOMED'], 
                        domain_ids=['Observation'], 
                        invalid_reason=False, 
                        standard_concept='S'
                    ),
                    'observation_date': datetime.strptime(data['period']['start'].split('T')[0], '%Y-%m-%d').date(),
                    'observation_datetime': datetime.fromisoformat(data['period']['start']),
                    'observation_type_concept_id': 32817,
                    'value_as_number': None,
                    'value_as_string': None,
                    'value_as_concept_id': pd.NA,
                    'qualifier_concept_id': pd.NA,
                    'unit_concept_id': pd.NA,
                    'provider_id': pd.NA,
                    'visit_occurrence_id': data['encounter']['reference'].split('/')[-1],
                    'visit_detail_id': pd.NA,
                    'observation_source_value': coding['code'],
                    'observation_source_concept_id': find_concept_id(
                        concept, 
                        concept_codes=[coding['code']], 
                        vocabulary_ids=['SNOMED'], 
                        domain_ids=['Observation']
                    ),
                    'qualifier_source_value': None,
                    'value_source_value': None,
                    'observation_event_id': data['id'],
                    'obs_event_field_concept_id': None  
                }
                observation_rows.append(observation)

    return observation_rows

observation_rows = []

with open('/workspaces/synthea_dw/data/fhir/CarePlan.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        observations = carePlan_to_observation(data, concept)
        if observations:
            observation_rows.extend(observations)

observation = pd.DataFrame(observation_rows).drop_duplicates()
observation = observation[observation['observation_source_concept_id'] != 0]

observation.sample(5)

Unnamed: 0,observation_id,person_id,observation_concept_id,observation_date,observation_datetime,observation_type_concept_id,value_as_number,value_as_string,value_as_concept_id,qualifier_concept_id,unit_concept_id,provider_id,visit_occurrence_id,visit_detail_id,observation_source_value,observation_source_concept_id,qualifier_source_value,value_source_value,observation_event_id,obs_event_field_concept_id
50,7f9d9b8e-2d00-37e1-584f-b0f2e73dc3bf,3f3b5b8d-be50-86b2-7dbf-066c16eb75e2,4047564,2019-01-21,2019-01-21 10:33:33+00:00,32817,,,,,,,510561e1-7c84-1498-6be2-fedae31ae53d,,134435003,4047564,,,7f9d9b8e-2d00-37e1-584f-b0f2e73dc3bf,
82,3bad52b4-0f09-6045-78e9-e674b1474cee,1c8c7995-e8ca-be32-411c-d8d1d55155d0,4305449,2008-05-26,2008-05-26 02:49:08+00:00,32817,,,,,,,318fbb11-3c9b-2024-1d63-6ce0cd800387,,133901003,4305449,,,3bad52b4-0f09-6045-78e9-e674b1474cee,
33,52a1b6e7-43a5-447a-a164-a5bdd6807163,5498f452-4f9e-ea6c-9587-e4bf0d97a2a0,4047564,2006-12-02,2006-12-02 11:19:35+00:00,32817,,,,,,,c1af9d5f-0763-06fa-e6c5-35e8cffc7dc1,,134435003,4047564,,,52a1b6e7-43a5-447a-a164-a5bdd6807163,
21,cf24fe5e-4d38-ab08-c879-e7b219f2f6aa,fffe0830-f71e-bd50-e90d-fc5f23c55433,4021315,2015-09-23,2015-09-23 13:08:35+00:00,32817,,,,,,,a5cba29c-4d9e-9f16-a985-364a1627c23a,,225358003,4021315,,,cf24fe5e-4d38-ab08-c879-e7b219f2f6aa,
53,72cda26c-be1a-394a-cf63-e4fd43c6c9ba,3f3b5b8d-be50-86b2-7dbf-066c16eb75e2,4237462,2023-07-06,2023-07-06 10:33:33+00:00,32817,,,,,,,d47d184a-76d1-461a-b4ac-35cfc71eaf53,,408869004,4237462,,,72cda26c-be1a-394a-cf63-e4fd43c6c9ba,


In [None]:
def claim_to_observation(data, concept):
    observation_rows = []

    if any(coding['code'] in ['professional', 'institutional'] for coding in data['type']['coding']):
        person_id = data['patient']['reference'].split('/')[-1]

        for item in data.get('item', []):
            if 'productOrService' in item and 'coding' in item['productOrService']:
                for coding in item['productOrService']['coding']:
                    observation = {
                        'observation_id': data['id'],
                        'person_id': person_id,
                        'observation_concept_id': find_concept_id(
                            concept, 
                            concept_codes=[coding['code']], 
                            vocabulary_ids=['SNOMED'], 
                            domain_ids=['Observation'], 
                            invalid_reason=False, 
                            standard_concept='S'
                        ),
                        'observation_date': datetime.strptime(data['created'].split('T')[0], '%Y-%m-%d').date(),
                        'observation_datetime': datetime.fromisoformat(data['created']),
                        'observation_type_concept_id': 32817,
                        'value_as_number': pd.NA,
                        'value_as_string': None,
                        'value_as_concept_id': pd.NA,
                        'qualifier_concept_id': pd.NA,
                        'unit_concept_id': None,
                        'provider_id': pd.NA,
                        'visit_occurrence_id': item['encounter'][0]['reference'].split('/')[-1] if 'encounter' in item else None,
                        'visit_detail_id': pd.NA,
                        'observation_source_value': coding['code'],
                        'observation_source_concept_id': find_concept_id(
                            concept, 
                            concept_codes=[coding['code']], 
                            vocabulary_ids=['SNOMED'], 
                            domain_ids=['Observation'], 
                            invalid_reason=False, 
                            standard_concept='S'
                        ),
                        'unit_source_value': None,
                        'qualifier_source_value': None,
                        'value_source_value': None,
                        'observation_event_id': data['id'],
                        'obs_event_field_concept_id': None
                    }
                    observation_rows.append(observation)

    return observation_rows

observation_rows = []

with open('/workspaces/synthea_dw/data/fhir/Claim.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        observations = claim_to_observation(data, concept)
        observation_rows.extend(observations)

observation = pd.DataFrame(observation_rows).drop_duplicates()
observation = observation[observation['observation_source_concept_id'] != 0]

observation.sample(5)

Unnamed: 0,observation_id,person_id,observation_concept_id,observation_date,observation_datetime,observation_type_concept_id,value_as_number,value_as_string,value_as_concept_id,qualifier_concept_id,unit_concept_id,provider_id,visit_occurrence_id,visit_detail_id,observation_source_value,observation_source_concept_id,unit_source_value,qualifier_source_value,value_source_value,observation_event_id,obs_event_field_concept_id
1605,6fbdca0e-083d-7787-7fcf-389d2f21e3a4,561d73b3-1c56-ed8d-266e-70c9e8712efb,4203722,2020-06-17,2020-06-17 13:55:11+00:00,32817,,,,,,,440aeb51-5849-6b07-0b2a-eaa8a3641ddb,,308335008,4203722,,,,6fbdca0e-083d-7787-7fcf-389d2f21e3a4,
7068,4fe5e74f-bf0d-4f90-483e-c06b8cff951a,c847a2de-6501-c49f-7988-9aaf6cf59ba5,4176269,2000-12-22,2000-12-22 07:40:05+00:00,32817,,,,,,,c12efff1-6588-c9f5-f0d4-646247e8e3ff,,50849002,4176269,,,,4fe5e74f-bf0d-4f90-483e-c06b8cff951a,
5110,79ac7b6c-1a13-71e6-5478-7867c203604b,1c8c7995-e8ca-be32-411c-d8d1d55155d0,42538208,2004-12-03,2004-12-03 01:00:51+00:00,32817,,,,,,,,,741062008,42538208,,,,79ac7b6c-1a13-71e6-5478-7867c203604b,
3753,4f83a854-f9b7-fa76-c107-f84da4b108d2,12d577de-3ce4-46cc-ad4b-769748ba110e,4085799,1996-06-14,1996-06-14 00:53:21+00:00,32817,,,,,,,05c5c408-802e-d859-0b0b-db7315f6ca76,,185349003,4085799,,,,4f83a854-f9b7-fa76-c107-f84da4b108d2,
4573,42266417-f017-0a3d-043e-227b4f70063a,0c69f458-1547-5034-0dc4-5b73f8d841c8,4072733,1948-01-16,1948-01-16 01:11:45+00:00,32817,,,,,,,,,224295006,4072733,,,,42266417-f017-0a3d-043e-227b4f70063a,


In [37]:
def observation_to_observation(data, concept):
    observation_rows = []
    person_id = data['subject']['reference'].split('/')[-1]
    observation_date = datetime.strptime(data['effectiveDateTime'].split('T')[0], '%Y-%m-%d').date()
    observation_datetime = datetime.fromisoformat(data['effectiveDateTime'])
    visit_occurrence_id = data['encounter']['reference'].split('/')[-1] if 'encounter' in data else None

    def process_observation(code, valueQuantity, valueCodeableConcept):
        value_as_concept_id = None
        if valueCodeableConcept:
            value_as_concept_id = find_concept_id(
                concept, 
                concept_codes=[valueCodeableConcept['coding'][0]['code']], 
                vocabulary_ids=['LOINC'], 
                domain_ids=['Meas Value']
            )

        return {
            'observation_id': data['id'],
            'person_id': person_id,
            'observation_concept_id': find_concept_id(
                concept, 
                concept_codes=[code], 
                vocabulary_ids=['LOINC'], 
                domain_ids=['Observation'], 
                invalid_reason=False, 
                standard_concept='S'
            ),
            'observation_date': observation_date,
            'observation_datetime': observation_datetime,
            'observation_type_concept_id': 32817,
            'value_as_number': float(valueQuantity['value']) if valueQuantity and 'value' in valueQuantity else pd.NA,
            'value_as_string': valueCodeableConcept['coding'][0]['display'] if valueCodeableConcept else None,
            'value_as_concept_id': value_as_concept_id if valueCodeableConcept else 0,
            'qualifier_concept_id': pd.NA,
            'unit_concept_id': find_concept_id(
                concept, 
                concept_codes=[valueQuantity['code']] if valueQuantity and 'code' in valueQuantity else None, 
                vocabulary_ids=['UCUM'], 
                domain_ids=['Unit'], 
                invalid_reason=False, 
                standard_concept='S'
            ),
            'provider_id': pd.NA,
            'visit_occurrence_id': visit_occurrence_id,
            'visit_detail_id': pd.NA,
            'observation_source_value': code,
            'observation_source_concept_id': find_concept_id(
                concept, 
                concept_codes=[code], 
                vocabulary_ids=['LOINC'], 
                domain_ids=['Observation']
            ),
            'unit_source_value': valueQuantity['code'] if valueQuantity and 'code' in valueQuantity else None,
            'qualifier_source_value': None,
            'value_source_value': valueQuantity['value'] if valueQuantity and 'value' in valueQuantity else None,
            'observation_event_id': data['id'],
            'obs_event_field_concept_id': pd.NA
        }

    # Check if the observation is in the 'component' format
    if 'component' in data:
        for comp in data['component']:
            code = comp['code']['coding'][0]['code']
            valueQuantity = comp.get('valueQuantity', None)
            valueCodeableConcept = comp.get('valueCodeableConcept', None)
            observation = process_observation(code, valueQuantity, valueCodeableConcept)
            observation_rows.append(observation)
    else:
        # Process the single observation
        code = data['code']['coding'][0]['code']
        valueQuantity = data.get('valueQuantity', None)
        valueCodeableConcept = data.get('valueCodeableConcept', None)
        observation = process_observation(code, valueQuantity, valueCodeableConcept)
        observation_rows.append(observation)

    return observation_rows

observation_rows = []

with open('/workspaces/synthea_dw/data/fhir/Observation.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        observations = observation_to_observation(data, concept)
        observation_rows.extend(observations)

observation = pd.DataFrame(observation_rows).drop_duplicates()
observation = observation[observation['observation_source_concept_id'] != 0]

observation.sample(5)

Unnamed: 0,observation_id,person_id,observation_concept_id,observation_date,observation_datetime,observation_type_concept_id,value_as_number,value_as_string,value_as_concept_id,qualifier_concept_id,unit_concept_id,provider_id,visit_occurrence_id,visit_detail_id,observation_source_value,observation_source_concept_id,unit_source_value,qualifier_source_value,value_source_value,observation_event_id,obs_event_field_concept_id
5035,30e7f956-eb5f-6005-3a1a-033f1a8cfcda,7a82833f-fae1-d69a-2cbf-69279dac746f,40759172,2017-01-21,2017-01-21 02:38:08+00:00,32817,,No,45878245,,8554,,f8ee7951-3e72-2eb5-4e0e-905f7b842a4f,,56051-6,40759172,,,,30e7f956-eb5f-6005-3a1a-033f1a8cfcda,
8773,58a8f4b6-76b1-90ef-747d-9a5539b9e525,7a82833f-fae1-d69a-2cbf-69279dac746f,42869557,2020-11-07,2020-11-07 02:35:35+00:00,32817,,I have housing,37079501,,8554,,2684ea4d-5d46-0726-8fbf-ca148116e1cc,,71802-3,42869557,,,,58a8f4b6-76b1-90ef-747d-9a5539b9e525,
7610,78a73447-39c8-71fa-558f-3d6dc7cbc843,408a95f4-02aa-3003-2f09-0241ac3343fb,3046853,2015-02-18,2015-02-18 09:49:34+00:00,32817,,White,45877987,,8554,,73f1c0d7-f865-4fda-1c59-928530cf8198,,32624-9,3046853,,,,78a73447-39c8-71fa-558f-3d6dc7cbc843,
22542,e9872acb-911a-95a0-1baf-70efabb0c32a,3558b674-952f-aa9d-9e66-b839f6a16316,40770471,2015-05-03,2015-05-03 13:14:37+00:00,32817,,Full-time work,37079092,,8554,,cdbbfd32-977c-efc5-e57c-84b182778329,,67875-5,40770471,,,,e9872acb-911a-95a0-1baf-70efabb0c32a,
3948,121f3e5e-a296-df17-ac26-7a8d658f2d72,c95d085d-2249-b616-7668-88cc9a0c11bd,40759918,2021-12-12,2021-12-12 09:53:26+00:00,32817,,,0,,8554,,f6e3d0c4-2faf-27fc-d45f-13d1b51ede06,,56799-0,40759918,,,,121f3e5e-a296-df17-ac26-7a8d658f2d72,


In [None]:
def procedure_to_observation(data, concept):
    observation_rows = []

    person_id = data['subject']['reference'].split('/')[-1]
    procedure_code = data['code']['coding'][0]['code']
    observation_date = datetime.strptime(data['performedPeriod']['start'].split('T')[0], '%Y-%m-%d').date()
    observation_datetime = datetime.fromisoformat(data['performedPeriod']['start'])
    visit_occurrence_id = data['encounter']['reference'].split('/')[-1] if 'encounter' in data else None

    observation = {
        'observation_id': data['id'],
        'person_id': person_id,
        'observation_concept_id': find_concept_id(
            concept, 
            concept_codes=[procedure_code], 
            vocabulary_ids=['SNOMED'], 
            domain_ids=['Observation'], 
            invalid_reason=False, 
            standard_concept='S'
        ),
        'observation_date': observation_date,
        'observation_datetime': observation_datetime,
        'observation_type_concept_id': 32817,
        'value_as_number': pd.NA,
        'value_as_string': None,
        'value_as_concept_id': 0,
        'unit_concept_id': 0,
        'provider_id': pd.NA,
        'visit_occurrence_id': visit_occurrence_id,
        'visit_detail_id': None,
        'observation_source_value': procedure_code,
        'observation_source_concept_id': find_concept_id(
            concept, 
            concept_codes=[procedure_code], 
            vocabulary_ids=['SNOMED'], 
            domain_ids=['Observation']
        ),
        'unit_source_value': None,
        'qualifier_source_value': None,
        'value_source_value': None,
        'observation_event_id': None,
        'obs_event_field_concept_id': None
    }

    observation_rows.append(observation)

    return observation_rows

observation_rows = []

with open('/workspaces/synthea_dw/data/fhir/Procedure.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        observations = procedure_to_observation(data, concept)
        observation_rows.extend(observations)

observation = pd.DataFrame(observation_rows).drop_duplicates()
observation = observation[observation['observation_source_concept_id'] != 0]

observation.sample(5)

### death

In [39]:
def patient_to_death(data):
    death_rows = []

    # Check if 'deceasedDateTime' exists
    if 'deceasedDateTime' in data:
        person_id = data['id']
        death_date = datetime.strptime(data['deceasedDateTime'].split('T')[0], '%Y-%m-%d').date()

        death_info = {
            'person_id': person_id,
            'death_date': death_date,
            'death_type_concept_id': 32817,
            'cause_concept_id': 0,
            'cause_source_value': None,
            'cause_source_concept_id': 0
        }

        death_rows.append(death_info)

    return death_rows

death_rows = []

with open('/workspaces/synthea_dw/data/fhir/Patient.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        deaths = patient_to_death(data)
        death_rows.extend(deaths)

death = pd.DataFrame(death_rows).drop_duplicates()

death.sample(3)

Unnamed: 0,person_id,death_date,death_type_concept_id,cause_concept_id,cause_source_value,cause_source_concept_id
0,32ee64c2-1585-d7ad-c53f-9ad739c676cf,2017-01-17,32817,0,,0
2,7a7b7fba-a005-3736-91ef-218a0d2824c5,2023-11-16,32817,0,,0
1,a5a02d31-a93c-7b72-7e1c-a9cbfa64874d,2017-03-31,32817,0,,0


### note

In [40]:
def carePlan_to_note(data):
    # Extracting note_title and note_text from 'text' -> 'div'
    div_text = data['text']['div']
    note_title_end_index = div_text.find('<br/>')

    return {
        'note_id': data['id'],
        'person_id': data['subject']['reference'].split('/')[-1],
        'note_date': datetime.strptime(data['period']['start'].split('T')[0], '%Y-%m-%d').date(),
        'note_datetime': datetime.fromisoformat(data['period']['start']),
        'note_type_concept_id': 32817,
        'note_class_concept_id': 706300,
        'note_title': data['text']['div'][len('<div xmlns="http://www.w3.org/1999/xhtml">'):note_title_end_index],
        'note_text': div_text[note_title_end_index + len('<br/>'):],
        'encoding_concept_id': 32678,
        'language_concept_id': 4175745,
        'provider_id': pd.NA,
        'visit_occurrence_id': data['encounter']['reference'].split('/')[-1],
        'visit_detail_id': pd.NA,
        'note_source_value': None,
        'note_event_id': data['encounter']['reference'].split('/')[-1],
        'note_event_field_concept_id': pd.NA
    }

note_rows = []

with open('/workspaces/synthea_dw/data/fhir/CarePlan.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        note_row = carePlan_to_note(data)
        note_rows.append(note_row)

note = pd.DataFrame(note_rows).drop_duplicates()

note.sample(5)

Unnamed: 0,note_id,person_id,note_date,note_datetime,note_type_concept_id,note_class_concept_id,note_title,note_text,encoding_concept_id,language_concept_id,provider_id,visit_occurrence_id,visit_detail_id,note_source_value,note_event_id,note_event_field_concept_id
41,ad76c020-07bf-4ac3-465b-66f83142938f,7a7b7fba-a005-3736-91ef-218a0d2824c5,2016-11-16,2016-11-16 15:27:03+00:00,32817,706300,Care Plan for Dialysis care plan (record artif...,Activities: <ul><li>Dialysis care plan (record...,32678,4175745,,564768d7-61a2-b87c-bafd-0fef0e370245,,,564768d7-61a2-b87c-bafd-0fef0e370245,
21,8c1a29e7-26c4-112b-2f30-69dda36658aa,05a5b076-68d9-702e-d1cc-1132e2a30ebb,2020-07-02,2020-07-02 09:43:32+00:00,32817,706300,Care Plan for Infectious disease care plan (re...,Activities: <ul><li>Infectious disease care pl...,32678,4175745,,31b00858-cff5-84db-8c09-7faaada3468d,,,31b00858-cff5-84db-8c09-7faaada3468d,
27,ac7c3963-1dd7-42e3-1c3b-da482ca0a1f4,a5a02d31-a93c-7b72-7e1c-a9cbfa64874d,2014-01-31,2014-01-31 08:52:48+00:00,32817,706300,Care Plan for Therapy (regime/therapy).,Activities: <ul><li>Therapy (regime/therapy)</...,32678,4175745,,157a6f39-7ef2-e210-ee9c-b5beed398dd4,,,157a6f39-7ef2-e210-ee9c-b5beed398dd4,
35,015e3c28-b8d7-34aa-c5c5-2acd32ce344e,a5a02d31-a93c-7b72-7e1c-a9cbfa64874d,2017-02-23,2017-02-23 09:21:18+00:00,32817,706300,Care Plan for Inpatient care plan (record arti...,Care plan is meant to treat Chronic congestive...,32678,4175745,,e4475818-0bb8-d366-92ef-11a70b80ced3,,,e4475818-0bb8-d366-92ef-11a70b80ced3,
1,b4789ac2-80f4-311f-e507-e5e1083124a6,32ee64c2-1585-d7ad-c53f-9ad739c676cf,1975-10-22,1975-10-22 16:41:57+00:00,32817,706300,Care Plan for Self-care interventions (procedu...,Activities: <ul><li>Self-care interventions (p...,32678,4175745,,f3ef8b65-28b8-a735-55c1-175321d55eb9,,,f3ef8b65-28b8-a735-55c1-175321d55eb9,


In [41]:
def diagnosticReport_to_note(data):
    note_rows = []

    person_id = data['subject']['reference'].split('/')[-1]
    provider_id = data['performer'][0]['reference'].split('/')[-1] if 'performer' in data and data['performer'] else None
    note_text = data['presentedForm'][0]['data'] if 'presentedForm' in data and data['presentedForm'] else None
    note_code_display = data['code']['coding'][0]['display'] if 'code' in data and 'coding' in data['code'] else None

    note = {
        'note_id': data['id'],
        'person_id': person_id,
        'note_date': datetime.strptime(data['issued'].split('T')[0], '%Y-%m-%d').date(),
        'note_datetime': datetime.fromisoformat(data['issued']),
        'note_type_concept_id': 32817,
        'note_class_concept_id': 42868493,
        'note_title': note_code_display,
        'note_text': note_text,
        'encoding_concept_id': 32678,
        'language_concept_id': 4175745,
        'provider_id': provider_id,
        'visit_occurrence_id': data['encounter']['reference'].split('/')[-1] if 'encounter' in data else None,
        'visit_detail_id': pd.NA,
        'note_source_value': None,
        'note_event_id': None,
        'note_event_field_concept_id': pd.NA
    }
    note_rows.append(note)

    return note_rows

note_rows = []

with open('/workspaces/synthea_dw/data/fhir/DiagnosticReport.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        notes = diagnosticReport_to_note(data)
        note_rows.extend(notes)

note = pd.DataFrame(note_rows).drop_duplicates()

note.sample(5)

Unnamed: 0,note_id,person_id,note_date,note_datetime,note_type_concept_id,note_class_concept_id,note_title,note_text,encoding_concept_id,language_concept_id,provider_id,visit_occurrence_id,visit_detail_id,note_source_value,note_event_id,note_event_field_concept_id
2077,416b2a6c-9522-20cd-3ffc-cae1571a8753,7a7b7fba-a005-3736-91ef-218a0d2824c5,2018-07-07,2018-07-07 02:00:03.694000+00:00,32817,42868493,History and physical note,CjIwMTgtMDctMDcKCiMgQ2hpZWYgQ29tcGxhaW50Ci0gSH...,32678,4175745,299bc447-29c4-3c98-948b-ea0891c97d89,122f6044-981c-1cb7-8e73-302b6f0eeaa3,,,,
2921,9d64cd11-2eb2-eb42-2cf2-22b15aab2aaf,3558b674-952f-aa9d-9e66-b839f6a16316,2016-01-24,2016-01-24 13:07:51.694000+00:00,32817,42868493,Generalized anxiety disorder 7 item (GAD-7),,32678,4175745,,32e25874-954a-e152-f042-ea4ac1d6e14a,,,,
87,d7936d98-c4bc-fa18-d3dd-7fffebbcb158,c95d085d-2249-b616-7668-88cc9a0c11bd,2015-01-31,2015-01-31 07:16:36.481000+00:00,32817,42868493,History and physical note,CjIwMTUtMDEtMzEKCiMgQ2hpZWYgQ29tcGxhaW50Ck5vIG...,32678,4175745,f3954a8e-f971-3d58-9bd5-b5282043fb08,c3a1b8dc-c243-1089-3fed-62afef74a755,,,,
2911,c9238cb8-7c5b-26c3-8470-0b1e76b4c183,3558b674-952f-aa9d-9e66-b839f6a16316,2015-05-03,2015-05-03 12:15:03.694000+00:00,32817,42868493,Basic metabolic panel - Blood,,32678,4175745,390a2aa4-70b4-3a64-a12f-9bd777834c8d,cdbbfd32-977c-efc5-e57c-84b182778329,,,,
2501,961331e7-976b-1fd4-1691-d407fa6e39b4,7a7b7fba-a005-3736-91ef-218a0d2824c5,2021-01-03,2021-01-03 13:33:26.694000+00:00,32817,42868493,Generalized anxiety disorder 7 item (GAD-7),,32678,4175745,,f4d5afca-3840-1e3c-cb6e-39e14462f801,,,,


In [42]:
def documentReference_to_note(data, concept):
    note_rows = []

    person_id = data['subject']['reference'].split('/')[-1]
    provider_id = data['author'][0]['reference'].split('/')[-1] if 'author' in data else None
    note_text = data['content'][0]['attachment']['data'] if 'content' in data and 'attachment' in data['content'][0] else None
    note_title = data['category'][0]['coding'][0]['display'] if 'category' in data and 'coding' in data['category'][0] else None
    note_type_code = data['type']['coding'][0]['code'] if 'type' in data and 'coding' in data['type'] else None

    note = {
        'note_id': data['id'],
        'person_id': person_id,
        'note_date': datetime.strptime(data['date'].split('T')[0], '%Y-%m-%d').date(),
        'note_datetime': datetime.fromisoformat(data['date']),
        'note_type_concept_id': 32817,
        'note_class_concept_id': find_concept_id(
            concept, 
            concept_codes=[note_type_code], 
            vocabulary_ids=['LOINC'], 
            domain_ids=['Note'], 
            invalid_reason=False, 
            standard_concept='S'
        ),
        'note_title': note_title,
        'note_text': note_text,
        'encoding_concept_id': 32678,
        'language_concept_id': 4175745,
        'provider_id': provider_id,
        'visit_occurrence_id': data['context']['encounter'][0]['reference'].split('/')[-1] if 'context' in data and 'encounter' in data['context'] else None,
        'visit_detail_id': pd.NA,
        'note_source_value': None,
        'note_event_id': None,
        'note_event_field_concept_id': pd.NA
    }
    note_rows.append(note)

    return note_rows

note_rows = []

with open('/workspaces/synthea_dw/data/fhir/DocumentReference.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        notes = documentReference_to_note(data, concept)
        note_rows.extend(notes)

note = pd.DataFrame(note_rows).drop_duplicates()

note.sample(5)

Unnamed: 0,note_id,person_id,note_date,note_datetime,note_type_concept_id,note_class_concept_id,note_title,note_text,encoding_concept_id,language_concept_id,provider_id,visit_occurrence_id,visit_detail_id,note_source_value,note_event_id,note_event_field_concept_id
128,3f029f11-acc9-5bef-6a96-956053452ca0,05a5b076-68d9-702e-d1cc-1132e2a30ebb,2021-03-08,2021-03-08 15:24:43.757000+00:00,32817,3030023,Clinical Note,CjIwMjEtMDMtMDgKCiMgQ2hpZWYgQ29tcGxhaW50Ck5vIG...,32678,4175745,d2149f29-18e2-365f-8f32-7ea6f00ec0b3,bf1e7ec7-f4c7-5ca5-b597-101229f0088a,,,,
124,7dadc08d-f8ea-66d6-2817-1ca00538e814,c86bea4c-5647-c8c2-35c5-cb08246ded70,2012-07-13,2012-07-13 16:03:44.884000+00:00,32817,3030023,Clinical Note,CjIwMTItMDctMTMKCiMgQ2hpZWYgQ29tcGxhaW50Ci0gQm...,32678,4175745,60b23852-314f-3aeb-b0b7-967947697497,e3c575ed-efab-e49d-306f-3ad24031b618,,,,
651,52d6d2f6-eeee-3bee-2311-33f322313b37,7a7b7fba-a005-3736-91ef-218a0d2824c5,2018-02-28,2018-02-28 15:06:03.694000+00:00,32817,3030023,Clinical Note,CjIwMTgtMDItMjgKCiMgQ2hpZWYgQ29tcGxhaW50Ci0gSH...,32678,4175745,299bc447-29c4-3c98-948b-ea0891c97d89,9e8dea9d-270a-bd38-92c1-76cb7c4dfcbb,,,,
874,3526d4c4-e2f5-b268-f71f-3b4ae18752dc,7a7b7fba-a005-3736-91ef-218a0d2824c5,2020-07-24,2020-07-24 22:38:03.694000+00:00,32817,3030023,Clinical Note,CjIwMjAtMDctMjQKCiMgQ2hpZWYgQ29tcGxhaW50Ci0gSH...,32678,4175745,299bc447-29c4-3c98-948b-ea0891c97d89,9fbc69b5-c0e5-4066-38b0-02fe588e584a,,,,
410,6627a2fd-4207-c30a-334c-07b780cb3094,7a82833f-fae1-d69a-2cbf-69279dac746f,2020-11-14,2020-11-14 01:46:00.675000+00:00,32817,3030023,Clinical Note,CjIwMjAtMTEtMTQKCiMgQ2hpZWYgQ29tcGxhaW50Ci0gSH...,32678,4175745,5c7231f0-eba5-3d45-b837-3415f3e0aafd,19911cfb-0d9d-0df7-092d-915b6f61e318,,,,


### specimen

In [51]:
def procedure_to_specimen(data, concept):
    specimen_rows = []

    person_id = data['subject']['reference'].split('/')[-1]
    procedure_code = data['code']['coding'][0]['code']
    specimen_date = datetime.strptime(data['performedPeriod']['start'].split('T')[0], '%Y-%m-%d').date()
    specimen_datetime = datetime.fromisoformat(data['performedPeriod']['start'])

    specimen = {
        'specimen_id': data['id'],
        'person_id': person_id,
        'specimen_concept_id': find_concept_id(
            concept, 
            concept_codes=[procedure_code], 
            vocabulary_ids=['SNOMED'], 
            domain_ids=['Specimen'], 
            invalid_reason=False, 
            standard_concept='S'
        ),
        'specimen_type_concept_id': 32817,
        'specimen_date': specimen_date,
        'specimen_datetime': specimen_datetime,
        'quantity': 1,
        'unit_concept_id': 0,
        'anatomic_site_concept_id': 0,
        'disease_status_concept_id': 0,
        'specimen_source_id': find_concept_id(
            concept, 
            concept_codes=[procedure_code], 
            vocabulary_ids=['SNOMED'], 
            domain_ids=['Specimen']
        ),
        'specimen_source_value': procedure_code,
        'unit_source_value': None,
        'anatomic_site_source_value': None,
        'disease_status_source_value': None
    }

    specimen_rows.append(specimen)

    return specimen_rows

specimen_rows = []

with open('/workspaces/synthea_dw/data/fhir/Procedure.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        specimens = procedure_to_specimen(data, concept)
        specimen_rows.extend(specimens)

specimen = pd.DataFrame(specimen_rows).drop_duplicates()
specimen = specimen[specimen['specimen_source_id'] != 0]

specimen.sample(2)

ValueError: a must be greater than 0 unless no samples are taken

### location

In [43]:
def location_to_location(data):
    location = {
        'location_id': data['id'],
        'address_1': data['address']['line'][0] if 'address' in data and 'line' in data['address'] and data['address']['line'] else None,
        'address_2': None,
        'city': data['address']['city'] if 'address' in data and 'city' in data['address'] else None,
        'state': data['address']['state'] if 'address' in data and 'state' in data['address'] else None,
        'zip': data['address']['postalCode'] if 'address' in data and 'postalCode' in data['address'] else None,
        'county': None,
        'location_source_value': data['name'] if 'name' in data else None,
        'country_concept_id': 42046186 if ('address' in data and 'country' in data['address'] and data['address']['country'] == 'US') else 0,
        'country_source_value': data['address']['country'] if 'address' in data and 'country' in data['address'] else None,
        'latitude': data['position']['latitude'] if 'position' in data and 'latitude' in data['position'] else None,
        'longitude': data['position']['longitude'] if 'position' in data and 'longitude' in data['position'] else None
    }

    return location

locations = []

with open('/workspaces/synthea_dw/data/fhir/Location.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        location = location_to_location(data)
        locations.append(location)

location = pd.DataFrame(locations).drop_duplicates()

location.sample(5)

Unnamed: 0,location_id,address_1,address_2,city,state,zip,county,location_source_value,country_concept_id,country_source_value,latitude,longitude
3,974e1a12-5930-3bae-8ef9-60dc1f0eebc5,20 POND MEADOW DR,,READING,MA,18673261.0,,READING INTERNAL MEDICINE ASSOCIATES INC,42046186,US,42.516645,-71.099176
25,bb1ad573-19b8-9cd8-68fb-0e6f684df992,,,,,,,,0,,,
17,16373638-5b1d-3258-a6c4-46a5c9180011,111 HEADWATERS DR,,HARWICH,MA,26451028.0,,REGALCARE AT HARWICH LLC,42046186,US,41.705946,-70.089044
15,44a084b6-8342-3ce2-92b7-026cfcba9227,655 DEDHAM ST,,WRENTHAM,MA,20931135.0,,SERENITY HILL NURSING CENTER,42046186,US,42.084901,-71.306892
12,c5de81bc-8f47-357b-af0b-ca4321dcab24,30 NEW CROSSING RD,,READING,MA,18673270.0,,HALLMARK HEALTH URGENT CARE PHYSICIANS LLC,42046186,US,42.518096,-71.095908


In [44]:
def patient_to_location(data):
    location_rows = []

    # Extracting address, city, state, zip, and country
    address_1 = data['address'][0]['line'][0] if 'address' in data and 'line' in data['address'][0] else None
    city = data['address'][0]['city'] if 'address' in data and 'city' in data['address'][0] else None
    state = data['address'][0]['state'] if 'address' in data and 'state' in data['address'][0] else None
    zip_code = data['address'][0]['postalCode'] if 'address' in data and 'postalCode' in data['address'][0] else None
    country = data['address'][0]['country'] if 'address' in data and 'country' in data['address'][0] else None

    # Setting country concept ID based on the country
    country_concept_id = 42046186 if country == 'US' else 0

    # Extracting latitude and longitude
    latitude = None
    longitude = None
    if 'address' in data and 'extension' in data['address'][0]:
        for ext in data['address'][0]['extension']:
            if ext['url'] == 'latitude':
                latitude = ext['valueDecimal']
            elif ext['url'] == 'longitude':
                longitude = ext['valueDecimal']

    location = {
        'location_id': data['id'],
        'address_1': address_1,
        'address_2': None,
        'city': city,
        'state': state,
        'zip': zip_code,
        'county': None,
        'location_source_value': data['id'],
        'country_concept_id': country_concept_id,
        'country_source_value': country,
        'latitude': latitude,
        'longitude': longitude
    }

    location_rows.append(location)

    return location_rows

location_rows = []

with open('/workspaces/synthea_dw/data/fhir/Patient.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        locations = patient_to_location(data)
        location_rows.extend(locations)

location = pd.DataFrame(location_rows).drop_duplicates()

location.sample(5)

Unnamed: 0,location_id,address_1,address_2,city,state,zip,county,location_source_value,country_concept_id,country_source_value,latitude,longitude
1,c95d085d-2249-b616-7668-88cc9a0c11bd,182 Hansen Burg,,Scituate,MA,2066,,c95d085d-2249-b616-7668-88cc9a0c11bd,42046186,US,,
4,7a82833f-fae1-d69a-2cbf-69279dac746f,759 Gottlieb Landing,,Boston,MA,2119,,7a82833f-fae1-d69a-2cbf-69279dac746f,42046186,US,,
7,408a95f4-02aa-3003-2f09-0241ac3343fb,784 Toy Ramp Unit 16,,Danvers,MA,1923,,408a95f4-02aa-3003-2f09-0241ac3343fb,42046186,US,,
0,32ee64c2-1585-d7ad-c53f-9ad739c676cf,696 Schaefer Divide Suite 50,,Harwich,MA,0,,32ee64c2-1585-d7ad-c53f-9ad739c676cf,42046186,US,,
9,3558b674-952f-aa9d-9e66-b839f6a16316,836 Hintz Crossroad,,Harwich,MA,0,,3558b674-952f-aa9d-9e66-b839f6a16316,42046186,US,,


### care_site

In [45]:
def organization_to_care_site(data):
    care_site_rows = []

    care_site = {
        'care_site_id': data['id'],
        'care_site_name': data['name'],
        'place_of_service_concept_id': 32693,
        'location_id': pd.NA,
        'care_site_source_value': data['id'],
        'place_of_service_source_value': 'Healthcare Provider'
    }

    care_site_rows.append(care_site)

    return care_site_rows

care_site_rows = []

with open('/workspaces/synthea_dw/data/fhir/Organization.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        care_sites = organization_to_care_site(data)
        care_site_rows.extend(care_sites)

care_site = pd.DataFrame(care_site_rows).drop_duplicates()

care_site.sample(5)

Unnamed: 0,care_site_id,care_site_name,place_of_service_concept_id,location_id,care_site_source_value,place_of_service_source_value
8,7b20d1c4-c56b-3647-801c-5e4e13679073,BETH ISRAEL DEACONESS HOSPITAL PLYMOUTH INC,32693,,7b20d1c4-c56b-3647-801c-5e4e13679073,Healthcare Provider
12,3e97ce4f-3ad7-3cea-883f-e2bb4ca89d83,HALLMARK HEALTH URGENT CARE PHYSICIANS LLC,32693,,3e97ce4f-3ad7-3cea-883f-e2bb4ca89d83,Healthcare Provider
10,390a2aa4-70b4-3a64-a12f-9bd777834c8d,OUTER CAPE HEALTH SERVICES INC,32693,,390a2aa4-70b4-3a64-a12f-9bd777834c8d,Healthcare Provider
20,0592d4e5-a0fb-3143-83cd-8e5a6b066a96,SOUTH SHORE PRIMARY AND URGENT CARE LLC,32693,,0592d4e5-a0fb-3143-83cd-8e5a6b066a96,Healthcare Provider
4,bb3d48eb-0c30-34f4-8950-790fc984741b,SOUTH END COMMUNITY HEALTH CENTER INC,32693,,bb3d48eb-0c30-34f4-8950-790fc984741b,Healthcare Provider


### provider

In [46]:
def practitioner_to_provider(practitioner_file_path, practitioner_role_file_path, concept):
    provider_rows = []

    # Load practitioner data into a dictionary with NPI as the key
    with open(practitioner_file_path, 'r') as file:
        practitioners_dict = {
            identifier['value']: {
                'provider_name': " ".join(
                    data['name'][0].get('prefix', []) + 
                    data['name'][0].get('given', []) + 
                    [data['name'][0]['family']]
                ),
                'gender': data['gender']
            }
            for line in file
            for data in [json.loads(line)]
            for identifier in data['identifier']
            if identifier['system'] == "http://hl7.org/fhir/sid/us-npi"
        }

    # Process PractitionerRoles and build provider rows
    with open(practitioner_role_file_path, 'r') as file:
        for line in file:
            data = json.loads(line)
            npi = data['practitioner']['identifier']['value']
            specialty_code = data['specialty'][0]['coding'][0]['code'] if 'specialty' in data else None

            provider_info = {
                'provider_id': data['id'],
                'provider_name': practitioners_dict.get(npi, {}).get('provider_name', ''),
                'npi': npi,
                'dea': None,
                'specialty_concept_id': find_concept_id(concept, concept_codes=[specialty_code], vocabulary_ids=['NUCC'], domain_ids=['Provider'], invalid_reason=False, standard_concept='S'),
                'care_site_id': pd.NA,
                'year_of_birth': pd.NA,
                'gender_concept_id': find_concept_id(concept, concept_names=[practitioners_dict.get(npi, {}).get('gender', '').upper()], vocabulary_ids=['Gender'], domain_ids=['Gender'], invalid_reason=False, standard_concept='S'),
                'provider_source_value': data['id'],
                'specialty_source_value': specialty_code,
                'specialty_source_concept_id': find_concept_id(concept, concept_codes=[specialty_code], vocabulary_ids=['NUCC'], domain_ids=['Provider']),
                'gender_source_value': practitioners_dict.get(npi, {}).get('gender', ''),
                'gender_source_concept_id': find_concept_id(concept, concept_names=[practitioners_dict.get(npi, {}).get('gender', '').upper()], vocabulary_ids=['Gender'], domain_ids=['Gender'])
            }

            provider_rows.append(provider_info)

    provider_df = pd.DataFrame(provider_rows).drop_duplicates()

    return provider_df

provider = practitioner_to_provider('/workspaces/synthea_dw/data/fhir/Practitioner.ndjson', '/workspaces/synthea_dw/data/fhir/PractitionerRole.ndjson', concept)

provider.sample(5)

Unnamed: 0,provider_id,provider_name,npi,dea,specialty_concept_id,care_site_id,year_of_birth,gender_concept_id,provider_source_value,specialty_source_value,specialty_source_concept_id,gender_source_value,gender_source_concept_id
21,2e8db9eb-8d77-a944-3d49-372a15b0b8dd,Dr. Emerita401 Satterfield305,9999891192,,0,,,8532,2e8db9eb-8d77-a944-3d49-372a15b0b8dd,208D00000X,38004022,female,8532
15,78113c54-172c-33a3-70cb-1c1944c5c422,Dr. Darlene91 Collier206,9999888198,,0,,,8532,78113c54-172c-33a3-70cb-1c1944c5c422,208D00000X,38004022,female,8532
2,a21e2e7f-5db5-85cc-51d5-3ab97fd02a47,Dr. Chris95 Kub800,9999965897,,0,,,8532,a21e2e7f-5db5-85cc-51d5-3ab97fd02a47,208D00000X,38004022,female,8532
3,87bd4af0-366a-f703-21a8-894cb489cb3d,Dr. Philip822 Beatty507,9999959692,,0,,,8507,87bd4af0-366a-f703-21a8-894cb489cb3d,208D00000X,38004022,male,8507
22,e66bfda0-6cf9-89a3-473f-f70a30005285,Dr. Cecil300 Fahey393,9999900993,,0,,,8507,e66bfda0-6cf9-89a3-473f-f70a30005285,208D00000X,38004022,male,8507


### episode

In [47]:
def careTeam_to_episode(data, concept):
    episode_rows = []

    # Check if reasonCode exists
    if 'reasonCode' in data:
        person_id = None
        for participant in data.get('participant', []):
            for role in participant.get('role', []):
                for coding in role.get('coding', []):
                    if coding.get('code') == '116154003':
                        person_id = participant['member']['reference'].split('/')[-1]
                        break
                if person_id:
                    break

        if person_id:
            for reasonCode in data['reasonCode']:
                for coding in reasonCode.get('coding', []):
                    episode = {
                        'episode_id': data['id'],
                        'person_id': person_id,
                        'episode_concept_id': 32533,
                        'episode_start_date': datetime.strptime(data['period']['start'].split('T')[0], '%Y-%m-%d').date(),
                        'episode_start_datetime': datetime.fromisoformat(data['period']['start']),
                        'episode_end_date': None,
                        'episode_end_datetime': None,
                        'episode_parent_id': pd.NA,
                        'episode_number': 1,
                        'episode_object_concept_id': find_concept_id(
                            concept, 
                            concept_codes=[coding['code']], 
                            vocabulary_ids=['SNOMED'], 
                            domain_ids=['Condition'],
                            invalid_reason=False, 
                            standard_concept='S'
                        ),
                        'episode_type_concept_id': 32817,
                        'episode_source_value': coding['code'],
                        'episode_source_concept_id': find_concept_id(
                            concept, 
                            concept_codes=[coding['code']], 
                            vocabulary_ids=['SNOMED'], 
                            domain_ids=['Condition'],
                            invalid_reason=True, 
                        ),
                    }
                    episode_rows.append(episode)

    return episode_rows

episode_rows = []

with open('/workspaces/synthea_dw/data/fhir/CareTeam.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        episodes = careTeam_to_episode(data, concept)
        if episodes:
            episode_rows.extend(episodes)

episode = pd.DataFrame(episode_rows).drop_duplicates()

episode.sample(5)

Unnamed: 0,episode_id,person_id,episode_concept_id,episode_start_date,episode_start_datetime,episode_end_date,episode_end_datetime,episode_parent_id,episode_number,episode_object_concept_id,episode_type_concept_id,episode_source_value,episode_source_concept_id
15,a78d44e2-c8c9-e66e-df70-183d73e91cca,32ee64c2-1585-d7ad-c53f-9ad739c676cf,32533,2014-05-16,2014-05-16 01:48:04+00:00,,,,1,4311765,32817,94260004,4311765
17,7e1770f4-cd81-1882-d158-a3b4cba64f6e,c95d085d-2249-b616-7668-88cc9a0c11bd,32533,2019-10-13,2019-10-13 16:40:23+00:00,,,,1,79740,32817,109838007,79740
16,19a5b455-0d9f-a039-8a85-3fd618dd8512,32ee64c2-1585-d7ad-c53f-9ad739c676cf,32533,2014-05-26,2014-05-26 02:24:05+00:00,,,,1,4311765,32817,94260004,4311765
7,fa84705d-feb6-1e28-38e2-c2e48db418a0,a5a02d31-a93c-7b72-7e1c-a9cbfa64874d,32533,1992-03-25,1992-03-25 08:52:48+00:00,,,,1,0,32817,15777000,40316773
23,e79bd8b9-1c41-1d15-a838-fe9f173181fa,408a95f4-02aa-3003-2f09-0241ac3343fb,32533,2009-01-14,2009-01-14 08:52:48+00:00,,,,1,0,32817,15777000,40316773


### cost

In [48]:
def claim_to_cost(data):
    cost_rows = []

    cost = {
        'cost_id': data['id'],
        'cost_event_id': pd.NA,
        'cost_domain_id': 32007,
        'cost_type_concept_id': 5032,
        'currency_concept_id': 44818668, 
        'total_charge': pd.NA,
        'total_cost': data['total']['value'],
        'total_paid': pd.NA,
        'paid_by_payer': pd.NA,
        'paid_by_patient': pd.NA,
        'paid_patient_copay': pd.NA,
        'paid_patient_coinsurance': pd.NA,
        'paid_patient_deductible': pd.NA,
        'paid_by_primary': pd.NA,
        'paid_ingredient_cost': pd.NA,
        'paid_dispensing_fee': pd.NA,
        'payer_plan_period_id': pd.NA,
        'amount_allowed': pd.NA,
        'revenue_code_concept_id': 38003025,
        'revenue_code_source_value': None,
        'drg_concept_id': pd.NA,
        'drg_source_value': None
    }
    cost_rows.append(cost)

    return cost_rows

cost_rows = []

with open('/workspaces/synthea_dw/data/fhir/Claim.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        costs = claim_to_cost(data)
        cost_rows.extend(costs)

cost = pd.DataFrame(cost_rows).drop_duplicates()

cost.sample(5)

Unnamed: 0,cost_id,cost_event_id,cost_domain_id,cost_type_concept_id,currency_concept_id,total_charge,total_cost,total_paid,paid_by_payer,paid_by_patient,paid_patient_copay,paid_patient_coinsurance,paid_patient_deductible,paid_by_primary,paid_ingredient_cost,paid_dispensing_fee,payer_plan_period_id,amount_allowed,revenue_code_concept_id,revenue_code_source_value,drg_concept_id,drg_source_value
632,0beb4b2e-32cd-4d9e-6706-992d9dc2265c,,32007,5032,44818668,,0.91,,,,,,,,,,,,38003025,,,
1448,ce88929e-37cf-74ef-9e51-63e2f7543b4d,,32007,5032,44818668,,234.71,,,,,,,,,,,,38003025,,,
1221,a2edf072-4a97-3f9c-514e-daf13e8f2f3e,,32007,5032,44818668,,988.24,,,,,,,,,,,,38003025,,,
1374,f6235aac-469d-580b-22da-2e15fa192c83,,32007,5032,44818668,,1028.46,,,,,,,,,,,,38003025,,,
993,97a57c64-38a6-854d-5918-aa86012a2de2,,32007,5032,44818668,,129.94,,,,,,,,,,,,38003025,,,
