In [1]:
import pandas as pd
import json
from datetime import datetime

In [2]:
concept = pd.read_csv('/workspaces/synthea_dw/omop/seeds/CONCEPT.csv', delimiter='\t', low_memory=False)

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.expand_frame_repr', False)

In [3]:
def find_concept_id(
        concept, concept_codes=None, 
        concept_names=None, vocabulary_ids=None, 
        domain_ids=None, concept_class_ids=None, 
        invalid_reason=False, standard_concept=None
    ):
    query_components = []

    if concept_codes:
        query_components.append(f"concept_code in @concept_codes")
    if concept_names:
        query_components.append(f"concept_name in @concept_names")
    if vocabulary_ids:
        query_components.append(f"vocabulary_id in @vocabulary_ids")
    if not invalid_reason:
        query_components.append(f"invalid_reason.isnull()")
    if standard_concept:
        query_components.append(f"standard_concept == @standard_concept")
    if domain_ids:
        query_components.append(f"domain_id in @domain_ids")
    if concept_class_ids:
        query_components.append(f"concept_class_id in @concept_class_ids")

    query = " and ".join(query_components)
    
    concept_rows = concept.query(query)['concept_id'] if query else concept['concept_id']
    
    return int(concept_rows.iloc[0]) if not concept_rows.empty else 0


### person

### observation_period

### visit_occurrence

In [4]:
def careTeam_to_visit_occurrence(data):
    visit_occurrences = []

    person_id, provider_id, care_site_id = None, None, None
    for participant in data.get('participant', []):
        for role in participant.get('role', []):
            for coding in role.get('coding', []):
                if coding.get('code') == '116154003':
                    person_id = participant['member']['reference'].split('/')[-1]
                elif coding.get('code') == '223366009': 
                    provider_id = participant['member']['reference'].split('/')[-1]
                elif coding.get('code') == '224891009': 
                    care_site_id = participant['member']['reference'].split('/')[-1]

    if person_id:
        visit_occurrence = {
            'visit_occurrence_id': data['encounter']['reference'].split('/')[-1],
            'person_id': person_id,
            'visit_concept_id': 9201,
            'visit_start_date': datetime.strptime(data['period']['start'].split('T')[0], '%Y-%m-%d').date(),
            'visit_start_datetime': datetime.fromisoformat(data['period']['start']),
            'visit_end_date': datetime.strptime(data['period']['end'].split('T')[0], '%Y-%m-%d').date() if 'end' in data['period'] else None,
            'visit_end_datetime': datetime.fromisoformat(data['period']['end']) if 'end' in data['period'] else None,
            'visit_type_concept_id': 32817,
            'provider_id': provider_id,
            'care_site_id': care_site_id,
            'visit_source_value': 'IP',
            'visit_source_concept_id': 9201,
            'admitted_from_concept_id': pd.NA,
            'admitted_from_source_value': None,
            'discharged_to_concept_id': pd.NA,
            'discharged_to_source_value': None,
            'preceding_visit_occurrence_id': pd.NA
        }
        visit_occurrences.append(visit_occurrence)

    return visit_occurrences

visit_occurrences = []

with open('/workspaces/synthea_dw/data/fhir/CareTeam.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        visits = careTeam_to_visit_occurrence(data)
        visit_occurrences.extend(visits)

visit_occurrence = pd.DataFrame(visit_occurrences).drop_duplicates()

visit_occurrence.head()

Unnamed: 0,visit_occurrence_id,person_id,visit_concept_id,visit_start_date,visit_start_datetime,visit_end_date,visit_end_datetime,visit_type_concept_id,provider_id,care_site_id,visit_source_value,visit_source_concept_id,admitted_from_concept_id,admitted_from_source_value,discharged_to_concept_id,discharged_to_source_value,preceding_visit_occurrence_id
0,7f547fc0-ee75-bac0-0707-04ff4623b828,79d8982d-fef7-7135-181a-0fb6af4a0e63,9201,1967-11-09,1967-11-09 19:10:04+00:00,,NaT,32817,e2a50964-0b8f-3eff-b399-8dbddc6ce7cf,eee00932-ca82-36bc-b7e8-e3fff6dbaeaa,IP,9201,,,,,
1,a6ce1507-9875-3f92-2847-53dda1f2d89a,098d2b36-b839-488e-28d1-db369b3abc6b,9201,1979-05-19,1979-05-19 22:47:10+00:00,,NaT,32817,21006c7d-c22b-395c-aad0-f3ad5e092524,46375470-1e20-3ddc-8dc5-010ba53826b6,IP,9201,,,,,
2,743816b1-6a76-2c34-00ae-18cd5337c924,5498f452-4f9e-ea6c-9587-e4bf0d97a2a0,9201,1986-07-19,1986-07-19 11:19:35+00:00,,NaT,32817,4d929468-faac-341a-b8a8-9bd890811899,84abfbd5-41c1-3df4-9b31-7ca93cb2309a,IP,9201,,,,,
3,bf76c911-3186-547a-fec4-f63389c5e3a2,d23456ac-957d-67ad-1ba4-34c3f8a54744,9201,1980-11-28,1980-11-28 00:14:03+00:00,,NaT,32817,6bfb9b4a-6b4a-3f6b-92e7-6318fcabf4b4,c92cc255-6778-3307-8092-987d667be4ce,IP,9201,,,,,
4,869e0e5d-2aec-841c-44b0-9dbcb6cf92ac,561d73b3-1c56-ed8d-266e-70c9e8712efb,9201,1997-01-08,1997-01-08 13:39:43+00:00,,NaT,32817,ad252b91-a18e-3e29-b067-63bfa08ba4cc,542185c1-943a-30f4-aba0-0efaa92488a4,IP,9201,,,,,


### condition_occurrence

In [5]:
def allergyIntolerance_to_condition_occurrence(line):
    data = json.loads(line)

    if data['code']['coding'][0]['code'] == '419199007':
        return None

    condition_occurrences = []

    for reaction in data.get('reaction', []):
        for manifestation in reaction.get('manifestation', []):
            condition_occurrences.append({
                'condition_occurrence_id': data['id'],
                'person_id': data['patient']['reference'].split('/')[-1],
                'condition_concept_id': find_concept_id(
                    concept, 
                    concept_codes=[manifestation['coding'][0]['code']], 
                    vocabulary_ids=['SNOMED'], 
                    domain_ids=['Condition'], 
                    invalid_reason=False, 
                    standard_concept='S', 
                    concept_class_ids=['Clinical Finding']
                ),
                'condition_start_date': datetime.strptime(data['recordedDate'].split('T')[0], '%Y-%m-%d').date(),
                'condition_start_datetime': datetime.fromisoformat(data['recordedDate']),
                'condition_end_date': None,
                'condition_end_datetime': None,
                'condition_type_concept_id': 32817,
                'condition_status_concept_id': pd.NA,
                'stop_reason': None,
                'provider_id': pd.NA,
                'visit_occurrence_id': pd.NA,
                'visit_detail_id': pd.NA,
                'condition_source_concept_id': find_concept_id(
                    concept, 
                    concept_codes=[manifestation['coding'][0]['code']], 
                    vocabulary_ids=['SNOMED'], 
                    domain_ids=['Condition'], 
                    concept_class_ids=['Clinical Finding']
                ),
                'condition_status_source_value': manifestation['coding'][0]['code']
            })

    return condition_occurrences

condition_occurrences = []

with open('/workspaces/synthea_dw/data/fhir/AllergyIntolerance.ndjson', 'r') as file:
    for line in file:
        conditions = allergyIntolerance_to_condition_occurrence(line)
        if conditions:
            condition_occurrences.extend(conditions)

condition_occurrence = pd.DataFrame(condition_occurrences).drop_duplicates()
condition_occurrence = condition_occurrence[condition_occurrence['condition_source_concept_id'] != 0]

condition_occurrence.head()

Unnamed: 0,condition_occurrence_id,person_id,condition_concept_id,condition_start_date,condition_start_datetime,condition_end_date,condition_end_datetime,condition_type_concept_id,condition_status_concept_id,stop_reason,provider_id,visit_occurrence_id,visit_detail_id,condition_source_concept_id,condition_status_source_value
0,68268edd-ab8e-517e-b225-e1791bb3940f,79d8982d-fef7-7135-181a-0fb6af4a0e63,140214,1967-11-26,1967-11-26 04:21:04+00:00,,,32817,,,,,,140214,271807003
1,449cb165-2eb3-b213-87cc-e2d8a77083cf,098d2b36-b839-488e-28d1-db369b3abc6b,604304,1979-06-04,1979-06-04 17:47:10+00:00,,,32817,,,,,,604304,878820003
2,516ddc8d-0eb3-fe92-1648-a068cae9929c,098d2b36-b839-488e-28d1-db369b3abc6b,196523,1979-06-04,1979-06-04 17:47:10+00:00,,,32817,,,,,,196523,62315008
3,9cf89d4f-6a70-09ab-f7ca-99ac88dee5c2,79d8982d-fef7-7135-181a-0fb6af4a0e63,140214,1967-11-26,1967-11-26 04:21:04+00:00,,,32817,,,,,,140214,271807003
4,39fe9eb5-35e2-a2ff-b7c2-e3465745766a,79d8982d-fef7-7135-181a-0fb6af4a0e63,312437,1967-11-26,1967-11-26 04:21:04+00:00,,,32817,,,,,,312437,267036007


In [6]:
def careTeam_to_condition_occurrence(data, concept):
    condition_occurrences = []

    if 'reasonCode' in data:
        person_id = None
        provider_id = None
        for participant in data.get('participant', []):
            for role in participant.get('role', []):
                for coding in role.get('coding', []):
                    if coding.get('code') == '116154003':
                        person_id = participant['member']['reference'].split('/')[-1]
                    elif coding.get('code') == '223366009':
                        provider_id = participant['member']['reference'].split('/')[-1]

        if person_id:
            for reasonCode in data['reasonCode']:
                for coding in reasonCode.get('coding', []):
                    condition_occurrence = {
                        'condition_occurrence_id': data['id'],
                        'person_id': person_id,
                        'condition_concept_id': find_concept_id(
                            concept, 
                            concept_codes=[coding['code']], 
                            vocabulary_ids=['SNOMED'], 
                            domain_ids=['Condition'],
                            invalid_reason=False, 
                            standard_concept='S'
                        ),
                        'condition_start_date': datetime.strptime(data['period']['start'].split('T')[0], '%Y-%m-%d').date(),
                        'condition_start_datetime': datetime.fromisoformat(data['period']['start']),
                        'condition_end_date': None,
                        'condition_end_datetime': None,
                        'condition_type_concept_id': 32817,
                        'condition_status_concept_id': pd.NA,
                        'stop_reason': None,
                        'provider_id': provider_id,
                        'visit_occurrence_id': data['encounter']['reference'].split('/')[-1],
                        'visit_detail_id': pd.NA,
                        'condition_source_value': coding['code'],
                        'condition_source_concept_id': find_concept_id(
                            concept, 
                            concept_codes=[coding['code']], 
                            vocabulary_ids=['SNOMED'], 
                            domain_ids=['Condition'],
                            invalid_reason=True
                        ),
                        'condition_status_source_value': None
                    }
                    condition_occurrences.append(condition_occurrence)

    return condition_occurrences

condition_occurrences = []

with open('/workspaces/synthea_dw/data/fhir/CareTeam.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        conditions = careTeam_to_condition_occurrence(data, concept)
        if conditions:
            condition_occurrences.extend(conditions)

condition_occurrence = pd.DataFrame(condition_occurrences).drop_duplicates()

condition_occurrence.head()

Unnamed: 0,condition_occurrence_id,person_id,condition_concept_id,condition_start_date,condition_start_datetime,condition_end_date,condition_end_datetime,condition_type_concept_id,condition_status_concept_id,stop_reason,provider_id,visit_occurrence_id,visit_detail_id,condition_source_value,condition_source_concept_id,condition_status_source_value
0,3651bbb8-8edf-df25-cd5e-bd98fc74c62c,5498f452-4f9e-ea6c-9587-e4bf0d97a2a0,320128,1986-07-19,1986-07-19 11:19:35+00:00,,,32817,,,4d929468-faac-341a-b8a8-9bd890811899,743816b1-6a76-2c34-00ae-18cd5337c924,,59621000,320128,
1,bd073957-3567-46dc-8624-85d129014110,d23456ac-957d-67ad-1ba4-34c3f8a54744,0,1980-11-28,1980-11-28 00:14:03+00:00,,,32817,,,6bfb9b4a-6b4a-3f6b-92e7-6318fcabf4b4,bf76c911-3186-547a-fec4-f63389c5e3a2,,15777000,40316773,
2,98b4003d-5a49-0126-84b3-c2fae5b12327,561d73b3-1c56-ed8d-266e-70c9e8712efb,320128,1997-01-08,1997-01-08 13:39:43+00:00,,,32817,,,ad252b91-a18e-3e29-b067-63bfa08ba4cc,869e0e5d-2aec-841c-44b0-9dbcb6cf92ac,,59621000,320128,
3,86969333-0db4-2e41-c86d-21cc1422aa12,1d26a818-351d-22f4-15d7-04cf0f520780,442588,2000-09-24,2000-09-24 00:13:34+00:00,,,32817,,,743dcb37-326b-3109-9e61-b506a0256ca2,67613ad2-832e-8f0f-b763-3746fbc71b32,,78275009,442588,
4,936bf292-663b-0f86-ac71-5dc7c24d1378,fffe0830-f71e-bd50-e90d-fc5f23c55433,0,2008-09-15,2008-09-15 11:42:35+00:00,,,32817,,,989a729c-ecf2-33e9-9c54-6a4c50c03d0c,de8af9b1-e6d1-869c-1823-8d512c91dee8,,15777000,40316773,


In [15]:
def claim_to_condition_occurrence(data, concept):
    condition_occurrences = []

    if any(coding['code'] in ['professional', 'institutional'] for coding in data['type']['coding']):

        for diagnosis in data.get('diagnosis', []):
            condition_ref = diagnosis['diagnosisReference']['reference']
            condition_id = condition_ref.split('/')[-1]

            for item in data.get('item', []):
                if 'productOrService' in item and 'coding' in item['productOrService']:
                    for coding in item['productOrService']['coding']:
                        condition_occurrence = {
                            'condition_occurrence_id': condition_id,
                            'person_id': data['patient']['reference'].split('/')[-1],
                            'condition_concept_id': find_concept_id(
                                concept, 
                                concept_codes=[coding['code']], 
                                vocabulary_ids=['SNOMED'], 
                                domain_ids=['Condition'], 
                                invalid_reason=False, 
                                standard_concept='S'
                            ),
                            'condition_start_date': datetime.strptime(data['billablePeriod']['start'].split('T')[0], '%Y-%m-%d').date(),
                            'condition_start_datetime': datetime.fromisoformat(data['billablePeriod']['start']),
                            'condition_end_date': datetime.strptime(data['billablePeriod']['end'].split('T')[0], '%Y-%m-%d').date(),
                            'condition_end_datetime': datetime.fromisoformat(data['billablePeriod']['end']),
                            'condition_type_concept_id': 32817,
                            'condition_status_concept_id': pd.NA,
                            'stop_reason': None,
                            'provider_id': pd.NA,
                            'visit_occurrence_id': item['encounter'][0]['reference'].split('/')[-1] if 'encounter' in item else None,
                            'visit_detail_id': pd.NA,
                            'condition_source_concept_id': find_concept_id(
                                concept, 
                                concept_codes=[coding['code']], 
                                vocabulary_ids=['SNOMED'], 
                                domain_ids=['Condition']
                            ),
                            'condition_status_source_value': coding['code']
                        }
                        condition_occurrences.append(condition_occurrence)

    return condition_occurrences

condition_occurrences = []

with open('/workspaces/synthea_dw/data/fhir/Claim.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        conditions = claim_to_condition_occurrence(data, concept)
        condition_occurrences.extend(conditions)

condition_occurrence = pd.DataFrame(condition_occurrences).drop_duplicates()
condition_occurrence = condition_occurrence[condition_occurrence['condition_source_concept_id'] != 0]

condition_occurrence.head()

Unnamed: 0,condition_occurrence_id,person_id,condition_concept_id,condition_start_date,condition_start_datetime,condition_end_date,condition_end_datetime,condition_type_concept_id,condition_status_concept_id,stop_reason,provider_id,visit_occurrence_id,visit_detail_id,condition_source_concept_id,condition_status_source_value
14,5aaeda67-3c17-03c5-6ef2-2154bba19afe,5498f452-4f9e-ea6c-9587-e4bf0d97a2a0,315564,1968-04-27,1968-04-27 11:19:35+00:00,1968-04-27,1968-04-27 11:34:35+00:00,32817,,,,,,315564,60234000
16,096ab125-5113-bdd4-bfa0-cdfc3bbcab40,fffe0830-f71e-bd50-e90d-fc5f23c55433,4059650,2003-09-01,2003-09-01 11:42:35+00:00,2003-09-01,2003-09-01 11:57:35+00:00,32817,,,,,,4059650,160968000
33,36db52f1-7078-90f5-829b-c2932ab23c75,1d26a818-351d-22f4-15d7-04cf0f520780,435524,2000-08-28,2000-08-28 15:30:45+00:00,2000-08-28,2000-08-28 16:14:09+00:00,32817,,,,,,435524,39898005
35,9c236262-3e77-555c-6db7-477c30655ecc,d23456ac-957d-67ad-1ba4-34c3f8a54744,4056621,1958-02-15,1958-02-15 21:13:25+00:00,1958-02-15,1958-02-15 23:08:44+00:00,32817,,,,,,4056621,197927001
57,08965786-00cf-4694-b558-2f647c04a9a5,6f96683d-fd58-5ba1-9735-f9d5f11c07a8,4251306,2015-09-02,2015-09-02 12:51:50+00:00,2015-09-02,2015-09-02 13:40:55+00:00,32817,,,,,,4251306,73595000


In [19]:
def condition_to_condition_occurrence(data, concept):
    condition_occurrences = []

    person_id = data['subject']['reference'].split('/')[-1]
    condition_code = data['code']['coding'][0]['code']

    condition_occurrence = {
        'condition_occurrence_id': data['id'],
        'person_id': person_id,
        'condition_concept_id': find_concept_id(
            concept, 
            concept_codes=[condition_code], 
            vocabulary_ids=['SNOMED'], 
            domain_ids=['Condition'], 
            invalid_reason=False, 
            standard_concept='S'
        ),
        'condition_start_date': datetime.strptime(data['onsetDateTime'].split('T')[0], '%Y-%m-%d').date(),
        'condition_start_datetime': datetime.fromisoformat(data['onsetDateTime']),
        'condition_end_date': datetime.strptime(data['abatementDateTime'].split('T')[0], '%Y-%m-%d').date() if 'abatementDateTime' in data else None,
        'condition_end_datetime': datetime.fromisoformat(data['abatementDateTime']) if 'abatementDateTime' in data else None,
        'condition_type_concept_id': 32817,
        'condition_status_concept_id': 37109701 if data['clinicalStatus']['coding'][0]['code'] == 'resolved' else 9181 if data['clinicalStatus']['coding'][0]['code'] == 'active' else pd.NA,
        'stop_reason': None,
        'provider_id': pd.NA,
        'visit_occurrence_id': data['encounter']['reference'].split('/')[-1],
        'visit_detail_id': pd.NA,
        'condition_source_concept_id': find_concept_id(
            concept, 
            concept_codes=[condition_code], 
            vocabulary_ids=['SNOMED'], 
            domain_ids=['Condition']
        ),
        'condition_status_source_value': condition_code
    }
    condition_occurrences.append(condition_occurrence)

    return condition_occurrences

condition_occurrences = []

with open('/workspaces/synthea_dw/data/fhir/Condition.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        conditions = condition_to_condition_occurrence(data, concept)
        condition_occurrences.extend(conditions)

condition_occurrence = pd.DataFrame(condition_occurrences).drop_duplicates()
condition_occurrence = condition_occurrence[condition_occurrence['condition_source_concept_id'] != 0]

condition_occurrence.head()

Unnamed: 0,condition_occurrence_id,person_id,condition_concept_id,condition_start_date,condition_start_datetime,condition_end_date,condition_end_datetime,condition_type_concept_id,condition_status_concept_id,stop_reason,provider_id,visit_occurrence_id,visit_detail_id,condition_source_concept_id,condition_status_source_value
5,096ab125-5113-bdd4-bfa0-cdfc3bbcab40,fffe0830-f71e-bd50-e90d-fc5f23c55433,4059650,2003-09-01,2003-09-01 13:03:13+00:00,,NaT,32817,9181,,,f2fa5139-f026-37d3-3a8f-5897a94c77b2,,4059650,160968000
8,5aaeda67-3c17-03c5-6ef2-2154bba19afe,5498f452-4f9e-ea6c-9587-e4bf0d97a2a0,315564,1968-05-19,1968-05-19 11:19:35+00:00,,NaT,32817,9181,,,c0579091-723a-9e05-69e8-07077ec09527,,315564,60234000
10,36db52f1-7078-90f5-829b-c2932ab23c75,1d26a818-351d-22f4-15d7-04cf0f520780,435524,2000-08-28,2000-08-28 15:30:45+00:00,,NaT,32817,9181,,,8dcc6bdd-311d-6fcf-a664-a8f0bf41d82e,,435524,39898005
11,9c236262-3e77-555c-6db7-477c30655ecc,d23456ac-957d-67ad-1ba4-34c3f8a54744,4056621,1958-07-16,1958-07-16 10:08:44+00:00,,NaT,32817,9181,,,24c42aae-48d2-d9f2-6f93-fbfc1a3b9629,,4056621,197927001
12,f8b697e9-307e-e84b-5733-49c09b10bec6,1d26a818-351d-22f4-15d7-04cf0f520780,442588,2000-09-24,2000-09-24 00:13:34+00:00,,NaT,32817,9181,,,67613ad2-832e-8f0f-b763-3746fbc71b32,,442588,78275009


### drug_exposure

In [12]:
def claim_to_drug_exposure(data, concept):
    drug_exposures = []

    if any(coding['code'] == 'pharmacy' for coding in data['type']['coding']):

        for item in data.get('item', []):
            drug_exposure = {
                'drug_exposure_id': data['prescription']['reference'].split('/')[-1],
                'person_id': data['patient']['reference'].split('/')[-1],
                'drug_concept_id': find_concept_id(
                    concept, 
                    concept_codes=[item['productOrService']['coding'][0]['code']], 
                    vocabulary_ids=['RxNorm'], 
                    domain_ids=['Drug'], 
                    invalid_reason=False, 
                    standard_concept='S'
                ),
                'drug_exposure_start_date': datetime.strptime(data['billablePeriod']['start'].split('T')[0], '%Y-%m-%d').date(),
                'drug_exposure_start_datetime': datetime.fromisoformat(data['billablePeriod']['start']),
                'drug_exposure_end_date': datetime.strptime(data['billablePeriod']['end'].split('T')[0], '%Y-%m-%d').date(),
                'drug_exposure_end_datetime': datetime.fromisoformat(data['billablePeriod']['end']),
                'verbatim_end_date': datetime.strptime(data['billablePeriod']['end'].split('T')[0], '%Y-%m-%d').date(),
                'drug_type_concept_id': 32817,
                'stop_reason': None,
                'refills': 0,
                'quantity': None,
                'days_supply': ((datetime.strptime(data['billablePeriod']['start'].split('T')[0], '%Y-%m-%d').date()) - (datetime.strptime(data['billablePeriod']['start'].split('T')[0], '%Y-%m-%d').date())).days or 1,
                'sig': None,
                'route_concept_id': None,
                'lot_number': None,
                'provider_id': None,
                'visit_occurrence_id': item['encounter'][0]['reference'].split('/')[-1] if 'encounter' in item else None,
                'visit_detail_id': None,
                'drug_source_value': item['productOrService']['coding'][0]['code'],
                'drug_source_concept_id': find_concept_id(
                    concept, 
                    concept_codes=[item['productOrService']['coding'][0]['code']], 
                    vocabulary_ids=['RxNorm'], 
                    domain_ids=['Drug']
                ),
                'route_source_value': None,
                'dose_unit_source_value': None
            }
            drug_exposures.append(drug_exposure)

    return drug_exposures

drug_exposures = []

with open('/workspaces/synthea_dw/data/fhir/Claim.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        exposures = claim_to_drug_exposure(data, concept)
        drug_exposures.extend(exposures)

drug_exposure = pd.DataFrame(drug_exposures).drop_duplicates()

drug_exposure.head()

Unnamed: 0,drug_exposure_id,person_id,drug_concept_id,drug_exposure_start_date,drug_exposure_start_datetime,drug_exposure_end_date,drug_exposure_end_datetime,verbatim_end_date,drug_type_concept_id,stop_reason,refills,quantity,days_supply,sig,route_concept_id,lot_number,provider_id,visit_occurrence_id,visit_detail_id,drug_source_value,drug_source_concept_id,route_source_value,dose_unit_source_value
0,5a371ba8-e1bb-03d3-82e7-eb4a991a6152,098d2b36-b839-488e-28d1-db369b3abc6b,40232448,1979-06-04,1979-06-04 17:47:10+00:00,1979-06-04,1979-06-04 18:21:31+00:00,1979-06-04,32817,,0,,1,,,,,cfa88018-8ee8-a6f8-4af0-3b38ca521762,,1049630,40232448,,
1,1ac507cd-406c-c447-0df4-21cd36afac6f,5498f452-4f9e-ea6c-9587-e4bf0d97a2a0,19077344,1994-09-10,1994-09-10 11:19:35+00:00,1994-09-10,1994-09-10 12:19:55+00:00,1994-09-10,32817,,0,,1,,,,,6be0c41f-20ba-d7b1-166b-c760cedb5b8f,,310325,19077344,,
2,c7066a85-9469-658e-5991-b59effe364a3,3ac697b4-16e8-994a-897b-bac2fc8a2cd7,1127433,2018-04-24,2018-04-24 00:58:37+00:00,2018-04-24,2018-04-24 01:13:37+00:00,2018-04-24,32817,,0,,1,,,,,962d884b-ca60-4e20-7e6b-4e2dde5a0939,,313782,1127433,,
3,36adf560-3eb3-9101-1d91-673c725bcc6e,fffe0830-f71e-bd50-e90d-fc5f23c55433,19077344,2011-09-27,2011-09-27 14:25:11+00:00,2011-09-27,2011-09-27 14:40:11+00:00,2011-09-27,32817,,0,,1,,,,,c8513fab-2b0f-264f-ccc4-cf5f778e64a3,,310325,19077344,,
4,b42de6d5-033e-fd0d-1623-689fff94af6c,fffe0830-f71e-bd50-e90d-fc5f23c55433,964261,2011-09-27,2011-09-27 14:25:11+00:00,2011-09-27,2011-09-27 14:40:11+00:00,2011-09-27,32817,,0,,1,,,,,c8513fab-2b0f-264f-ccc4-cf5f778e64a3,,2001499,964261,,


### procedure_occurrence

In [7]:
def carePlan_to_procedure_occurrence(data, concept):
    procedure_occurrences = []

    for category in data.get('category', []):
        for coding in category.get('coding', []):
            if 'display' in coding:
                procedure = {
                    'procedure_occurrence_id': data['id'],
                    'person_id': data['subject']['reference'].split('/')[-1],
                    'procedure_concept_id': find_concept_id(
                        concept, 
                        concept_codes=[coding['code']], 
                        vocabulary_ids=['SNOMED'], 
                        domain_ids=['Procedure'], 
                        invalid_reason=False, 
                        standard_concept='S', 
                        concept_class_ids=['Procedure']
                    ),
                    'procedure_date': datetime.strptime(data['period']['start'].split('T')[0], '%Y-%m-%d').date(),
                    'procedure_datetime': datetime.fromisoformat(data['period']['start']),
                    'procedure_end_date': None,
                    'procedure_end_datetime': None,
                    'procedure_type_concept_id': 32817,
                    'modifier_concept_id': pd.NA,
                    'quantity': 1,
                    'provider_id': pd.NA,
                    'visit_occurrence_id': data['encounter']['reference'].split('/')[-1],
                    'visit_detail_id': pd.NA,
                    'procedure_source_value': coding['code'],
                    'procedure_source_concept_id': find_concept_id(
                        concept, 
                        concept_codes=[coding['code']], 
                        vocabulary_ids=['SNOMED'], 
                        domain_ids=['Procedure'], 
                        concept_class_ids=['Procedure']
                    ),
                    'modifier_source_value': None
                }
                procedure_occurrences.append(procedure)

    return procedure_occurrences

procedure_occurrences = []

with open('/workspaces/synthea_dw/data/fhir/CarePlan.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        procedures = carePlan_to_procedure_occurrence(data, concept)
        if procedures:
            procedure_occurrences.extend(procedures)

procedure_occurrence = pd.DataFrame(procedure_occurrences).drop_duplicates()
procedure_occurrence = procedure_occurrence[procedure_occurrence['procedure_source_concept_id'] != 0]

procedure_occurrence.head()

Unnamed: 0,procedure_occurrence_id,person_id,procedure_concept_id,procedure_date,procedure_datetime,procedure_end_date,procedure_end_datetime,procedure_type_concept_id,modifier_concept_id,quantity,provider_id,visit_occurrence_id,visit_detail_id,procedure_source_value,procedure_source_concept_id,modifier_source_value
0,b5940334-4e94-2047-75ca-f53f0c771a0c,79d8982d-fef7-7135-181a-0fb6af4a0e63,4293157,1967-11-09,1967-11-09 19:10:04+00:00,,,32817,,1,,7f547fc0-ee75-bac0-0707-04ff4623b828,,384758001,4293157,
1,81b4e1f9-4d65-b2a8-303d-56f4ecc017d5,098d2b36-b839-488e-28d1-db369b3abc6b,4293157,1979-05-19,1979-05-19 22:47:10+00:00,,,32817,,1,,a6ce1507-9875-3f92-2847-53dda1f2d89a,,384758001,4293157,
2,057c0a7a-98b4-684f-2a24-dd2653b72719,5498f452-4f9e-ea6c-9587-e4bf0d97a2a0,40481459,1986-07-19,1986-07-19 11:19:35+00:00,,,32817,,1,,743816b1-6a76-2c34-00ae-18cd5337c924,,443402002,40481459,
4,d95fb87a-062e-97fd-cbea-5d981c13fe39,561d73b3-1c56-ed8d-266e-70c9e8712efb,40481459,1997-01-08,1997-01-08 13:39:43+00:00,,,32817,,1,,869e0e5d-2aec-841c-44b0-9dbcb6cf92ac,,443402002,40481459,
8,f7f75673-8153-d99f-6ba4-e6303055752e,1d26a818-351d-22f4-15d7-04cf0f520780,40481459,2003-11-21,2003-11-21 15:30:45+00:00,,,32817,,1,,fb708140-8873-5618-e24a-71ba1d6d7176,,443402002,40481459,


### device_exposure

### measurement

### observation

In [16]:
def allergyIntolerance_to_observation(data, concept):
    if data['code']['coding'][0]['code'] != '419199007':

        observations = []

        for reaction in data.get('reaction', []):
            observation = {
                'observation_id': data['id'],
                'person_id': data['patient']['reference'].split('/')[-1],
                'observation_concept_id': 4169307,
                'observation_date': datetime.strptime(data['recordedDate'].split('T')[0], '%Y-%m-%d').date(),
                'observation_datetime': datetime.fromisoformat(data['recordedDate']),
                'observation_type_concept_id': 32817,
                'value_as_number': None,
                'value_as_string': None,
                'value_as_concept_id': find_concept_id(
                    concept, 
                    concept_codes=[data['code']['coding'][0]['code']], 
                    vocabulary_ids=['SNOMED'], 
                    domain_ids=['Observation'], 
                    invalid_reason=False, 
                    standard_concept='S', 
                    concept_class_ids=['Substance']
                ),
                'qualifier_concept_id': find_concept_id(
                    concept, 
                    concept_names=[data['criticality'].capitalize()], 
                    vocabulary_ids=['SNOMED'], 
                    domain_ids=['Meas Value'], 
                    invalid_reason=False, 
                    standard_concept='S',
                    concept_class_ids=['Qualifier Value']
                ),
                'unit_concept_id': pd.NA,
                'provider_id': pd.NA,
                'visit_occurrence_id': None,
                'visit_detail_id': None,
                'observation_source_value': None,
                'observation_source_concept_id': 4169307,
                'unit_source_value': None,
                'qualifier_source_value': data['criticality'],
                'value_source_value': data['code']['coding'][0]['code'],
                'observation_event_id': None,
                'obs_event_field_concept_id': None
            }
            observations.append(observation)

        return observations

    return None

observation_rows = []

with open('/workspaces/synthea_dw/data/fhir/AllergyIntolerance.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        rows = allergyIntolerance_to_observation(data, concept)
        if rows:
            observation_rows.extend(rows)

observation = pd.DataFrame(observation_rows).drop_duplicates()
observation = observation[observation['observation_source_concept_id'] != 0]

observation.head()

Unnamed: 0,observation_id,person_id,observation_concept_id,observation_date,observation_datetime,observation_type_concept_id,value_as_number,value_as_string,value_as_concept_id,qualifier_concept_id,unit_concept_id,provider_id,visit_occurrence_id,visit_detail_id,observation_source_value,observation_source_concept_id,unit_source_value,qualifier_source_value,value_source_value,observation_event_id,obs_event_field_concept_id
0,68268edd-ab8e-517e-b225-e1791bb3940f,79d8982d-fef7-7135-181a-0fb6af4a0e63,4169307,1967-11-26,1967-11-26 04:21:04+00:00,32817,,,4122068,4267416,,,,,,4169307,,low,288328004,,
1,449cb165-2eb3-b213-87cc-e2d8a77083cf,098d2b36-b839-488e-28d1-db369b3abc6b,4169307,1979-06-04,1979-06-04 17:47:10+00:00,32817,,,4138133,4267416,,,,,,4169307,,low,264287008,,
2,516ddc8d-0eb3-fe92-1648-a068cae9929c,098d2b36-b839-488e-28d1-db369b3abc6b,4169307,1979-06-04,1979-06-04 17:47:10+00:00,32817,,,4106307,4267416,,,,,,4169307,,low,256355007,,
3,9cf89d4f-6a70-09ab-f7ca-99ac88dee5c2,79d8982d-fef7-7135-181a-0fb6af4a0e63,4169307,1967-11-26,1967-11-26 04:21:04+00:00,32817,,,4138133,4267416,,,,,,4169307,,low,264287008,,
4,39fe9eb5-35e2-a2ff-b7c2-e3465745766a,79d8982d-fef7-7135-181a-0fb6af4a0e63,4169307,1967-11-26,1967-11-26 04:21:04+00:00,32817,,,42539493,4267416,,,,,,4169307,,low,735971005,,


In [9]:
def carePlan_to_observation(data, concept):
    observation_rows = []

    for category in data.get('category', []):
        for coding in category.get('coding', []):
            if 'display' in coding:
                observation = {
                    'observation_id': data['id'],
                    'person_id': data['subject']['reference'].split('/')[-1],
                    'observation_concept_id': find_concept_id(
                        concept, 
                        concept_codes=[coding['code']], 
                        vocabulary_ids=['SNOMED'], 
                        domain_ids=['Observation'], 
                        invalid_reason=False, 
                        standard_concept='S'
                    ),
                    'observation_date': datetime.strptime(data['period']['start'].split('T')[0], '%Y-%m-%d').date(),
                    'observation_datetime': datetime.fromisoformat(data['period']['start']),
                    'observation_type_concept_id': 32817,
                    'value_as_number': None,
                    'value_as_string': None,
                    'value_as_concept_id': pd.NA,
                    'qualifier_concept_id': pd.NA,
                    'unit_concept_id': pd.NA,
                    'provider_id': pd.NA,
                    'visit_occurrence_id': data['encounter']['reference'].split('/')[-1],
                    'visit_detail_id': pd.NA,
                    'observation_source_value': coding['code'],
                    'observation_source_concept_id': find_concept_id(
                        concept, 
                        concept_codes=[coding['code']], 
                        vocabulary_ids=['SNOMED'], 
                        domain_ids=['Observation']
                    ),
                    'qualifier_source_value': None,
                    'value_source_value': None,
                    'observation_event_id': data['id'],
                    'obs_event_field_concept_id': None  
                }
                observation_rows.append(observation)

    return observation_rows

observation_rows = []

with open('/workspaces/synthea_dw/data/fhir/CarePlan.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        observations = carePlan_to_observation(data, concept)
        if observations:
            observation_rows.extend(observations)

observation = pd.DataFrame(observation_rows).drop_duplicates()
observation = observation[observation['observation_source_concept_id'] != 0]

observation.head()


Unnamed: 0,observation_id,person_id,observation_concept_id,observation_date,observation_datetime,observation_type_concept_id,value_as_number,value_as_string,value_as_concept_id,qualifier_concept_id,unit_concept_id,provider_id,visit_occurrence_id,visit_detail_id,observation_source_value,observation_source_concept_id,qualifier_source_value,value_source_value,observation_event_id,obs_event_field_concept_id
13,06025c2f-1402-31aa-4a16-1fc00ac2aa40,561d73b3-1c56-ed8d-266e-70c9e8712efb,4047564,2013-12-04,2013-12-04 13:39:43+00:00,32817,,,,,,,78780d5c-aaff-7bcd-3dae-c9d09aaae3ac,,134435003,4047564,,,06025c2f-1402-31aa-4a16-1fc00ac2aa40,
16,8617ba57-b1b3-d2a6-369b-f3e51c4c47f6,5498f452-4f9e-ea6c-9587-e4bf0d97a2a0,4047564,2001-09-29,2001-09-29 11:19:35+00:00,32817,,,,,,,cb42e515-3f71-d2e4-4f18-0ae602c69dc9,,134435003,4047564,,,8617ba57-b1b3-d2a6-369b-f3e51c4c47f6,
21,cf24fe5e-4d38-ab08-c879-e7b219f2f6aa,fffe0830-f71e-bd50-e90d-fc5f23c55433,4021315,2015-09-23,2015-09-23 13:08:35+00:00,32817,,,,,,,a5cba29c-4d9e-9f16-a985-364a1627c23a,,225358003,4021315,,,cf24fe5e-4d38-ab08-c879-e7b219f2f6aa,
22,0a9efc99-11c2-ffcb-5e86-74f17d302ee9,8d3c566e-e2f0-3f11-eee2-dce3c68c498d,46272846,2021-04-16,2021-04-16 18:00:56+00:00,32817,,,,,,,a33ad2b8-fc4a-355d-bd4c-6501c2dedb3d,,711282006,46272846,,,0a9efc99-11c2-ffcb-5e86-74f17d302ee9,
26,e2a4ae8f-677f-b7cc-90c1-97e896b1e332,1d26a818-351d-22f4-15d7-04cf0f520780,4047564,2016-02-05,2016-02-05 15:30:45+00:00,32817,,,,,,,088054ee-86a5-9143-8d4f-86ec3a97e0cf,,134435003,4047564,,,e2a4ae8f-677f-b7cc-90c1-97e896b1e332,


In [17]:
def claim_to_observation(data, concept):
    observation_rows = []

    if any(coding['code'] in ['professional', 'institutional'] for coding in data['type']['coding']):
        person_id = data['patient']['reference'].split('/')[-1]

        for item in data.get('item', []):
            if 'productOrService' in item and 'coding' in item['productOrService']:
                for coding in item['productOrService']['coding']:
                    observation = {
                        'observation_id': data['id'],
                        'person_id': person_id,
                        'observation_concept_id': find_concept_id(
                            concept, 
                            concept_codes=[coding['code']], 
                            vocabulary_ids=['SNOMED'], 
                            domain_ids=['Observation'], 
                            invalid_reason=False, 
                            standard_concept='S'
                        ),
                        'observation_date': datetime.strptime(data['created'].split('T')[0], '%Y-%m-%d').date(),
                        'observation_datetime': datetime.fromisoformat(data['created']),
                        'observation_type_concept_id': 32817,
                        'value_as_number': pd.NA,
                        'value_as_string': None,
                        'value_as_concept_id': pd.NA,
                        'qualifier_concept_id': pd.NA,
                        'unit_concept_id': None,
                        'provider_id': pd.NA,
                        'visit_occurrence_id': item['encounter'][0]['reference'].split('/')[-1] if 'encounter' in item else None,
                        'visit_detail_id': pd.NA,
                        'observation_source_value': coding['code'],
                        'observation_source_concept_id': find_concept_id(
                            concept, 
                            concept_codes=[coding['code']], 
                            vocabulary_ids=['SNOMED'], 
                            domain_ids=['Observation'], 
                            invalid_reason=False, 
                            standard_concept='S'
                        ),
                        'unit_source_value': None,
                        'qualifier_source_value': None,
                        'value_source_value': None,
                        'observation_event_id': data['id'],
                        'obs_event_field_concept_id': None
                    }
                    observation_rows.append(observation)

    return observation_rows

observation_rows = []

with open('/workspaces/synthea_dw/data/fhir/Claim.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        observations = claim_to_observation(data, concept)
        observation_rows.extend(observations)

observation = pd.DataFrame(observation_rows).drop_duplicates()
observation = observation[observation['observation_source_concept_id'] != 0]

observation.head()

Unnamed: 0,observation_id,person_id,observation_concept_id,observation_date,observation_datetime,observation_type_concept_id,value_as_number,value_as_string,value_as_concept_id,qualifier_concept_id,unit_concept_id,provider_id,visit_occurrence_id,visit_detail_id,observation_source_value,observation_source_concept_id,unit_source_value,qualifier_source_value,value_source_value,observation_event_id,obs_event_field_concept_id
1,171fd2b0-6de8-8ead-d848-2028d8bf0791,d23456ac-957d-67ad-1ba4-34c3f8a54744,4022661,1948-01-16,1948-01-16 01:08:01+00:00,32817,,,,,,,,,105531004,4022661,,,,171fd2b0-6de8-8ead-d848-2028d8bf0791,
2,171fd2b0-6de8-8ead-d848-2028d8bf0791,d23456ac-957d-67ad-1ba4-34c3f8a54744,4072733,1948-01-16,1948-01-16 01:08:01+00:00,32817,,,,,,,,,224295006,4072733,,,,171fd2b0-6de8-8ead-d848-2028d8bf0791,
5,49d6c2b3-cae9-84da-1732-c4dae9127698,8d3c566e-e2f0-3f11-eee2-dce3c68c498d,4199788,2019-10-22,2019-10-22 18:15:56+00:00,32817,,,,,,,,,314529007,4199788,,,,49d6c2b3-cae9-84da-1732-c4dae9127698,
8,c3169eb7-d753-b4d0-7caf-15c9750fe010,561d73b3-1c56-ed8d-266e-70c9e8712efb,4313474,1994-07-12,1994-07-12 13:54:43+00:00,32817,,,,,,,c22f4649-e10b-d081-37dc-c25bd58a8073,,424619006,4313474,,,,c3169eb7-d753-b4d0-7caf-15c9750fe010,
14,00801b3a-ea90-9617-7cf7-92f32a1ea825,098d2b36-b839-488e-28d1-db369b3abc6b,4085798,1979-05-19,1979-05-19 23:02:10+00:00,32817,,,,,,,a6ce1507-9875-3f92-2847-53dda1f2d89a,,185347001,4085798,,,,00801b3a-ea90-9617-7cf7-92f32a1ea825,


### death

### note

In [10]:
def carePlan_to_note(data):
    # Extracting note_title and note_text from 'text' -> 'div'
    div_text = data['text']['div']
    note_title_end_index = div_text.find('<br/>')

    return {
        'note_id': data['id'],
        'person_id': data['subject']['reference'].split('/')[-1],
        'note_date': datetime.strptime(data['period']['start'].split('T')[0], '%Y-%m-%d').date(),
        'note_datetime': datetime.fromisoformat(data['period']['start']),
        'note_type_concept_id': 32817,
        'note_class_concept_id': 706300,
        'note_title': data['text']['div'][len('<div xmlns="http://www.w3.org/1999/xhtml">'):note_title_end_index],
        'note_text': div_text[note_title_end_index + len('<br/>'):],
        'encoding_concept_id': 32678,
        'language_concept_id': 4175745,
        'provider_id': pd.NA,
        'visit_occurrence_id': data['encounter']['reference'].split('/')[-1],
        'visit_detail_id': pd.NA,
        'note_source_value': div_text,
        'note_event_id': data['encounter']['reference'].split('/')[-1],
        'note_event_field_concept_id': pd.NA
    }

note_rows = []

with open('/workspaces/synthea_dw/data/fhir/CarePlan.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        note_row = carePlan_to_note(data)
        note_rows.append(note_row)

note = pd.DataFrame(note_rows).drop_duplicates()

note.head()

Unnamed: 0,note_id,person_id,note_date,note_datetime,note_type_concept_id,note_class_concept_id,note_title,note_text,encoding_concept_id,language_concept_id,provider_id,visit_occurrence_id,visit_detail_id,note_source_value,note_event_id,note_event_field_concept_id
0,b5940334-4e94-2047-75ca-f53f0c771a0c,79d8982d-fef7-7135-181a-0fb6af4a0e63,1967-11-09,1967-11-09 19:10:04+00:00,32817,706300,Care Plan for Self-care interventions (procedu...,Activities: <ul><li>Self-care interventions (p...,32678,4175745,,7f547fc0-ee75-bac0-0707-04ff4623b828,,"<div xmlns=""http://www.w3.org/1999/xhtml"">Care...",7f547fc0-ee75-bac0-0707-04ff4623b828,
1,81b4e1f9-4d65-b2a8-303d-56f4ecc017d5,098d2b36-b839-488e-28d1-db369b3abc6b,1979-05-19,1979-05-19 22:47:10+00:00,32817,706300,Care Plan for Self-care interventions (procedu...,Activities: <ul><li>Self-care interventions (p...,32678,4175745,,a6ce1507-9875-3f92-2847-53dda1f2d89a,,"<div xmlns=""http://www.w3.org/1999/xhtml"">Care...",a6ce1507-9875-3f92-2847-53dda1f2d89a,
2,057c0a7a-98b4-684f-2a24-dd2653b72719,5498f452-4f9e-ea6c-9587-e4bf0d97a2a0,1986-07-19,1986-07-19 11:19:35+00:00,32817,706300,Care Plan for Lifestyle education regarding hy...,Care plan is meant to treat Essential hyperten...,32678,4175745,,743816b1-6a76-2c34-00ae-18cd5337c924,,"<div xmlns=""http://www.w3.org/1999/xhtml"">Care...",743816b1-6a76-2c34-00ae-18cd5337c924,
3,35e1b2d7-c43f-f5fb-f3f9-6bb4538be596,d23456ac-957d-67ad-1ba4-34c3f8a54744,1980-11-28,1980-11-28 00:14:03+00:00,32817,706300,Care Plan for Diabetes self management plan.,Care plan is meant to treat Prediabetes.<br/>A...,32678,4175745,,bf76c911-3186-547a-fec4-f63389c5e3a2,,"<div xmlns=""http://www.w3.org/1999/xhtml"">Care...",bf76c911-3186-547a-fec4-f63389c5e3a2,
4,d95fb87a-062e-97fd-cbea-5d981c13fe39,561d73b3-1c56-ed8d-266e-70c9e8712efb,1997-01-08,1997-01-08 13:39:43+00:00,32817,706300,Care Plan for Lifestyle education regarding hy...,Care plan is meant to treat Essential hyperten...,32678,4175745,,869e0e5d-2aec-841c-44b0-9dbcb6cf92ac,,"<div xmlns=""http://www.w3.org/1999/xhtml"">Care...",869e0e5d-2aec-841c-44b0-9dbcb6cf92ac,


### specimen

### location

### care_site

### provider

### episode

In [11]:
def careTeam_to_episode(data, concept):
    episode_rows = []

    # Check if reasonCode exists
    if 'reasonCode' in data:
        person_id = None
        for participant in data.get('participant', []):
            for role in participant.get('role', []):
                for coding in role.get('coding', []):
                    if coding.get('code') == '116154003':
                        person_id = participant['member']['reference'].split('/')[-1]
                        break
                if person_id:
                    break

        if person_id:
            for reasonCode in data['reasonCode']:
                for coding in reasonCode.get('coding', []):
                    episode = {
                        'episode_id': data['id'],
                        'person_id': person_id,
                        'episode_concept_id': 32533,
                        'episode_start_date': datetime.strptime(data['period']['start'].split('T')[0], '%Y-%m-%d').date(),
                        'episode_start_datetime': datetime.fromisoformat(data['period']['start']),
                        'episode_end_date': None,
                        'episode_end_datetime': None,
                        'episode_parent_id': pd.NA,
                        'episode_number': 1,
                        'episode_object_concept_id': find_concept_id(
                            concept, 
                            concept_codes=[coding['code']], 
                            vocabulary_ids=['SNOMED'], 
                            domain_ids=['Condition'],
                            invalid_reason=False, 
                            standard_concept='S'
                        ),
                        'episode_type_concept_id': 32817,
                        'episode_source_value': coding['code'],
                        'episode_source_concept_id': find_concept_id(
                            concept, 
                            concept_codes=[coding['code']], 
                            vocabulary_ids=['SNOMED'], 
                            domain_ids=['Condition'],
                            invalid_reason=True, 
                        ),
                    }
                    episode_rows.append(episode)

    return episode_rows

episode_rows = []

with open('/workspaces/synthea_dw/data/fhir/CareTeam.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        episodes = careTeam_to_episode(data, concept)
        if episodes:
            episode_rows.extend(episodes)

episode = pd.DataFrame(episode_rows).drop_duplicates()

episode.head()

Unnamed: 0,episode_id,person_id,episode_concept_id,episode_start_date,episode_start_datetime,episode_end_date,episode_end_datetime,episode_parent_id,episode_number,episode_object_concept_id,episode_type_concept_id,episode_source_value,episode_source_concept_id
0,3651bbb8-8edf-df25-cd5e-bd98fc74c62c,5498f452-4f9e-ea6c-9587-e4bf0d97a2a0,32533,1986-07-19,1986-07-19 11:19:35+00:00,,,,1,320128,32817,59621000,320128
1,bd073957-3567-46dc-8624-85d129014110,d23456ac-957d-67ad-1ba4-34c3f8a54744,32533,1980-11-28,1980-11-28 00:14:03+00:00,,,,1,0,32817,15777000,40316773
2,98b4003d-5a49-0126-84b3-c2fae5b12327,561d73b3-1c56-ed8d-266e-70c9e8712efb,32533,1997-01-08,1997-01-08 13:39:43+00:00,,,,1,320128,32817,59621000,320128
3,86969333-0db4-2e41-c86d-21cc1422aa12,1d26a818-351d-22f4-15d7-04cf0f520780,32533,2000-09-24,2000-09-24 00:13:34+00:00,,,,1,442588,32817,78275009,442588
4,936bf292-663b-0f86-ac71-5dc7c24d1378,fffe0830-f71e-bd50-e90d-fc5f23c55433,32533,2008-09-15,2008-09-15 11:42:35+00:00,,,,1,0,32817,15777000,40316773


### cost

In [18]:
def claim_to_cost(data):
    cost_rows = []

    cost = {
        'cost_id': data['id'],
        'cost_event_id': pd.NA,
        'cost_domain_id': 32007,
        'cost_type_concept_id': 5032,
        'currency_concept_id': 44818668, 
        'total_charge': pd.NA,
        'total_cost': data['total']['value'],
        'total_paid': pd.NA,
        'paid_by_payer': pd.NA,
        'paid_by_patient': pd.NA,
        'paid_patient_copay': pd.NA,
        'paid_patient_coinsurance': pd.NA,
        'paid_patient_deductible': pd.NA,
        'paid_by_primary': pd.NA,
        'paid_ingredient_cost': pd.NA,
        'paid_dispensing_fee': pd.NA,
        'payer_plan_period_id': pd.NA,
        'amount_allowed': pd.NA,
        'revenue_code_concept_id': 38003025,
        'revenue_code_source_value': None,
        'drg_concept_id': pd.NA,
        'drg_source_value': None
    }
    cost_rows.append(cost)

    return cost_rows

cost_rows = []

with open('/workspaces/synthea_dw/data/fhir/Claim.ndjson', 'r') as file:
    for line in file:
        data = json.loads(line)
        costs = claim_to_cost(data)
        cost_rows.extend(costs)

cost = pd.DataFrame(cost_rows).drop_duplicates()

cost.head()

Unnamed: 0,cost_id,cost_event_id,cost_domain_id,cost_type_concept_id,currency_concept_id,total_charge,total_cost,total_paid,paid_by_payer,paid_by_patient,paid_patient_copay,paid_patient_coinsurance,paid_patient_deductible,paid_by_primary,paid_ingredient_cost,paid_dispensing_fee,payer_plan_period_id,amount_allowed,revenue_code_concept_id,revenue_code_source_value,drg_concept_id,drg_source_value
0,171fd2b0-6de8-8ead-d848-2028d8bf0791,,32007,5032,44818668,,704.2,,,,,,,,,,,,38003025,,,
1,49d6c2b3-cae9-84da-1732-c4dae9127698,,32007,5032,44818668,,994.95,,,,,,,,,,,,38003025,,,
2,c3169eb7-d753-b4d0-7caf-15c9750fe010,,32007,5032,44818668,,1005.38,,,,,,,,,,,,38003025,,,
3,73916f03-a54a-3197-2c92-fc537ca90033,,32007,5032,44818668,,136.8,,,,,,,,,,,,38003025,,,
4,49d7ecf9-a01a-641a-5caf-2089247dc358,,32007,5032,44818668,,971.38,,,,,,,,,,,,38003025,,,
