In [1]:
import numpy as np
import pandas as pd
import os
from prediction_utils.extraction_utils.database import BQDatabase

In [2]:
# Shared database handle; every query cell below goes through `db`.
# NOTE(review): presumably picks up BigQuery credentials/project from the environment — confirm.
db = BQDatabase()



In [3]:
# Placeholders substituted into the SQL templates below via str.format / str.format_map.
config_dict = dict(
    # MIMIC converted to OMOP (main source for the cohort queries)
    dataset_project='som-nero-phi-nigam-starr',
    dataset='mimic_omop',
    # Destination for derived cohort tables
    rs_dataset_project='som-nero-phi-nigam-starr',
    rs_dataset='temp_dataset',
    # STARR-OMOP, used only for comparison queries
    starr_project='som-rit-phi-starr-prod',
    starr_dataset='starr_omop_cdm5_deid_latest',
    # Raw (non-OMOP) MIMIC tables
    mimic_project='som-nero-phi-nigam-starr',
    mimic_dataset='mimic',
)

In [4]:
# Race and ethnicity in MIMIC-OMOP
# (Seems like ethnicity_concept_id should be populated based on source code)
# One row per (race_concept_id, race_source_value, ethnicity_concept_id) with its person count,
# most frequent first. GROUP BY already yields unique rows, so no DISTINCT is needed.
query = """
    SELECT race_concept_id, race_source_value, ethnicity_concept_id, COUNT(*) as counts
    FROM {dataset_project}.{dataset}.person
    GROUP BY race_concept_id, race_source_value, ethnicity_concept_id
    ORDER BY counts DESC
""".format(**config_dict)
df = db.read_sql_query(query)

Downloading: 100%|██████████| 41/41 [00:01<00:00, 25.50rows/s]


In [5]:
# Preview up to 20 rows of the race / ethnicity counts returned by the previous query.
df.head(20)

Unnamed: 0,race_concept_id,race_source_value,ethnicity_concept_id,counts
0,8527,WHITE,0,32074
1,4218674,UNKNOWN/NOT SPECIFIED,0,4236
2,38003599,BLACK/AFRICAN AMERICAN,0,3585
3,4188159,HISPANIC OR LATINO,0,1350
4,8515,ASIAN,0,1304
5,4087921,OTHER,0,1256
6,4218674,UNABLE TO OBTAIN,0,792
7,4218674,PATIENT DECLINED TO ANSWER,0,498
8,38003579,ASIAN - CHINESE,0,223
9,38003600,BLACK/CAPE VERDEAN,0,159


In [6]:
# Race and Ethnicity in STARR-OMOP
# The grouping includes race_source_value, so it is selected as well (mirrors the MIMIC-OMOP
# query); otherwise each count silently refers to a source value that is not shown.
query = """
    SELECT race_concept_id, race_source_value, ethnicity_concept_id, COUNT(*) as counts
    FROM {starr_project}.{starr_dataset}.person
    GROUP BY race_concept_id, race_source_value, ethnicity_concept_id
    ORDER BY counts DESC
""".format(**config_dict)
df = db.read_sql_query(query)

Downloading: 100%|██████████| 381/381 [00:01<00:00, 277.84rows/s]


In [7]:
# Race in STARR-OMOP
# Person count per race_concept_id, most frequent first.
# (GROUP BY already returns one row per race_concept_id, so DISTINCT is unnecessary.)
query = """
    SELECT race_concept_id, COUNT(*) as counts
    FROM {starr_project}.{starr_dataset}.person
    GROUP BY race_concept_id
    ORDER BY counts DESC
""".format(**config_dict)
df = db.read_sql_query(query)
df

Downloading: 100%|██████████| 6/6 [00:01<00:00,  3.55rows/s]


Unnamed: 0,race_concept_id,counts
0,0,1224212
1,8527,1214760
2,8515,368410
3,8516,97655
4,8557,25908
5,8657,7536


In [8]:
# Condition timing in MIMIC-OMOP
# Returns visit / condition pairs whose condition start or end timestamp differs from the
# start or end of the visit they belong to.
# -> 0 rows implies timing is tied to visit_start / visit_end
# NOTE(review): the recorded 0-row result was produced while the second predicate compared
# visit_end_datetime with itself (never true); re-run to confirm the conclusion still holds.
# Pairs where either compared timestamp is NULL are not returned by `!=`.
query = """
    SELECT * EXCEPT (person_id, provider_id)
    FROM {dataset_project}.{dataset}.visit_occurrence t1
    INNER JOIN {dataset_project}.{dataset}.condition_occurrence as t2
        USING (visit_occurrence_id)
    WHERE (t1.visit_start_datetime != t2.condition_start_datetime)
        OR (t1.visit_end_datetime != t2.condition_end_datetime)
""".format_map(config_dict)
df = db.read_sql_query(query)

Downloading: 0rows [00:00, ?rows/s]


In [9]:
## Visit counts in MIMIC-OMOP: number of visit_occurrence rows per visit_concept_id
query = """
    SELECT visit_concept_id, COUNT(*) as counts
    FROM {dataset_project}.{dataset}.visit_occurrence t1
    GROUP BY visit_concept_id
    ORDER by counts DESC
""".format(**config_dict)
df = db.read_sql_query(query)
df

Downloading: 100%|██████████| 2/2 [00:01<00:00,  1.49rows/s]


Unnamed: 0,visit_concept_id,counts
0,262,43407
1,9201,15569


In [10]:
## Reference point from raw MIMIC: number of admissions per admission_type
query = """
    SELECT admission_type, COUNT(*) as counts
    FROM {mimic_project}.{mimic_dataset}.admissions t1
    GROUP BY admission_type
    ORDER by counts DESC
""".format(**config_dict)
df = db.read_sql_query(query)
df

Downloading: 100%|██████████| 4/4 [00:01<00:00,  2.61rows/s]


Unnamed: 0,admission_type,counts
0,EMERGENCY,42071
1,NEWBORN,7863
2,ELECTIVE,7706
3,URGENT,1336


In [11]:
# visit_detail_assign: row counts per combination of the four boolean flags
# (These counts look incorrect... no emergency or icu? - mapping is incorrect in ETL)
query = """
    SELECT is_first, is_last, is_icu, is_emergency, COUNT(*) as counts
    FROM {dataset_project}.{dataset}.visit_detail_assign
    GROUP BY is_first, is_last, is_icu, is_emergency
    ORDER BY counts DESC
""".format(**config_dict)
df = db.read_sql_query(query)
df

Downloading: 100%|██████████| 4/4 [00:01<00:00,  2.66rows/s]


Unnamed: 0,is_first,is_last,is_icu,is_emergency,counts
0,False,False,False,False,84871
1,True,False,False,False,54644
2,False,True,False,False,54644
3,True,True,False,False,4306


In [12]:
# visit_detail: row counts per (visit_detail_concept_id, visit_type_concept_id), with the
# concept name looked up from the concept table
# (these counts seem ok-ish - some discrepancy with raw MIMIC)
query = """
    SELECT visit_detail_concept_id, visit_type_concept_id, concept_name, count(*) as counts
    FROM {dataset_project}.{dataset}.visit_detail t1
    INNER JOIN {dataset_project}.{dataset}.concept as t2
        on t1.visit_detail_concept_id = t2.concept_id
    GROUP BY visit_detail_concept_id, visit_type_concept_id, concept_name
    ORDER BY counts DESC
""".format(**config_dict)
df = db.read_sql_query(query)
df

Downloading: 100%|██████████| 7/7 [00:01<00:00,  4.53rows/s]


Unnamed: 0,visit_detail_concept_id,visit_type_concept_id,concept_name,counts
0,9201,2000000006,Inpatient Visit,96013
1,32037,2000000006,Intensive Care,71575
2,45763735,45770670,General medical service,39244
3,9203,2000000006,Emergency Room Visit,30877
4,4149152,45770670,Surgical service,25946
5,4237225,45770670,Newborn care service,8152
6,4150859,45770670,Psychiatry service,1


In [13]:
## Compare to raw MIMIC: distinct icustay_id values in the `icustays` and `transfers` tables
# The same query is run against both tables; each printed line names its source table so the
# two results can be told apart. `query` and `df` hold the last (transfers) query afterwards.
for mimic_table in ('icustays', 'transfers'):
    query = """
    SELECT COUNT(DISTINCT icustay_id) as counts
    FROM {mimic_project}.{mimic_dataset}.{mimic_table} t1
""".format(mimic_table=mimic_table, **config_dict)
    df = db.read_sql_query(query)
    # The query returns a single row; print its scalar value rather than a one-element array.
    print('Num unique ICU stays ({}): {}'.format(mimic_table, df['counts'].iloc[0]))

Downloading: 100%|██████████| 1/1 [00:01<00:00,  1.31s/rows]


Num unique ICU stays: [61532]


Downloading: 100%|██████████| 1/1 [00:01<00:00,  1.64s/rows]

Num unique ICU stays: [61532]





### Replicating the MIMIC-Extract Cohort

In [14]:
# Proceed with constructing the cohort based on ICU stays - attempting to match MIMIC-Extract
#
# initial_cohort:     ICU visit_detail rows (visit_detail_concept_id = 32037) joined to their
#                     hospital visit; numbers each person's ICU stays by start time, sets the
#                     index time to 24h after ICU admission and derives ICU / hospital length of
#                     stay (hours) and mortality flags.
#                     NOTE(review): 4216643 is presumably the "patient died" discharge concept — confirm.
# transformed_cohort: adds age at index time (years) and the >3 day / >7 day ICU LOS labels.
# final SELECT:       first ICU stay per person, age >= 15, ICU LOS between 30 and 240 hours.
#
# The comma after `age_in_years` matters: without it BigQuery reads
# `age_in_years los_hospital_hours` as a column alias, which drops age_in_years from the output
# and stores ages under the name los_hospital_hours.
query = """
    WITH initial_cohort AS (
        SELECT t1.*, 
            ROW_NUMBER() OVER (PARTITION BY t1.person_id ORDER BY t1.visit_start_datetime) as row_id,
            DATETIME_ADD(t1.visit_start_datetime, INTERVAL 24 HOUR) as index_datetime,
            DATETIME_DIFF(t1.visit_end_datetime, t1.visit_start_datetime, HOUR) as los_icu_hours,
            DATETIME_DIFF(t2.visit_end_datetime, t2.visit_start_datetime, HOUR) as los_hospital_hours,
            CAST(t1.discharge_to_concept_id = 4216643 AS INT64) AS mortality_icu,
            CAST(t2.discharge_to_concept_id = 4216643 AS INT64) AS mortality_hospital
            FROM {dataset_project}.{dataset}.visit_detail t1
            LEFT JOIN {dataset_project}.{dataset}.visit_occurrence as t2 USING (visit_occurrence_id)
            WHERE visit_detail_concept_id = 32037
    ),
    transformed_cohort AS (
    SELECT t1.*,
        CAST(DATETIME_DIFF(index_datetime, birth_datetime, DAY) AS FLOAT64) / 365.25 as age_in_years,
        CAST(los_icu_hours > 3*24 AS INT64) as los_icu_3days,
        CAST(los_icu_hours > 7*24 AS INT64) as los_icu_7days
    FROM initial_cohort t1
    INNER JOIN {dataset_project}.{dataset}.person as t2 USING (person_id)
    )
    SELECT person_id, visit_occurrence_id, visit_detail_id,
        index_datetime, age_in_years,
        los_hospital_hours, los_icu_hours,
        los_icu_3days, los_icu_7days,
        mortality_hospital, mortality_icu
    FROM transformed_cohort
    WHERE
        row_id = 1
        AND age_in_years >= 15.0
        AND los_icu_hours BETWEEN 30 AND 240
""".format_map(config_dict)
# Materialize the cohort as a table, then read it back into pandas.
db.execute_sql_to_destination_table(
    query,
    '{rs_dataset_project}.{rs_dataset}.mimic_icu_cohort'.format_map(config_dict),
)
icu_cohort = db.read_sql_query(
    "SELECT * FROM {rs_dataset_project}.{rs_dataset}.mimic_icu_cohort".format_map(config_dict)
)

Downloading: 100%|██████████| 23137/23137 [00:01<00:00, 16228.18rows/s]


In [15]:
# Mean of each 0/1 outcome column, i.e. the fraction of ICU-cohort rows with that outcome.
icu_cohort.loc[:, ['los_icu_3days', 'los_icu_7days', 'mortality_icu', 'mortality_hospital']].mean()

los_icu_3days         0.380170
los_icu_7days         0.064702
mortality_icu         0.061201
mortality_hospital    0.105848
dtype: float64

In [16]:
# Hospital-admission cohort (one row per person):
#   cohort0 - every visit_occurrence row, numbered per person by start time, with the index
#             time set to 24h after admission, hospital LOS in hours and a mortality flag
#   cohort1 - adds age at index time (years) and the >7 day LOS label
#   final   - first admission per person, age >= 15, LOS between 30 and 240 hours
query = """
    WITH cohort0 AS (
        SELECT *, 
            ROW_NUMBER() OVER (PARTITION BY person_id ORDER BY visit_start_datetime) as row_id,
            DATETIME_ADD(visit_start_datetime, INTERVAL 24 HOUR) as index_datetime,
            DATETIME_DIFF(visit_end_datetime, visit_start_datetime, HOUR) as los_hospital_hours,
            CAST(discharge_to_concept_id = 4216643 AS INT64) AS mortality_hospital
            FROM {dataset_project}.{dataset}.visit_occurrence
    ),
    cohort1 AS (
        SELECT t1.*,
            CAST(DATETIME_DIFF(index_datetime, birth_datetime, DAY) AS FLOAT64) / 365.25 as age_in_years,
            CAST(los_hospital_hours > 7*24 AS INT64) as los_hospital_7days
        FROM cohort0 t1
        INNER JOIN {dataset_project}.{dataset}.person as t2 USING (person_id)
    )
    SELECT person_id, visit_occurrence_id, index_datetime, 
        los_hospital_hours, los_hospital_7days, mortality_hospital, age_in_years
    FROM cohort1
    WHERE 
        row_id = 1
        AND age_in_years >= 15.0
        AND los_hospital_hours BETWEEN 30 AND 240
""".format(**config_dict)
# Write the cohort to its destination table, then load that table into pandas.
db.execute_sql_to_destination_table(
    query,
    '{rs_dataset_project}.{rs_dataset}.mimic_hospital_cohort'.format(**config_dict),
)
hospital_cohort = db.read_sql_query(
    "SELECT * FROM {rs_dataset_project}.{rs_dataset}.mimic_hospital_cohort".format(**config_dict)
)

Downloading: 100%|██████████| 24614/24614 [00:01<00:00, 19362.92rows/s]


In [17]:
# Mean of each 0/1 outcome column, i.e. the fraction of hospital-cohort rows with that outcome.
hospital_cohort.loc[:, ['mortality_hospital', 'los_hospital_7days']].mean()

mortality_hospital    0.088364
los_hospital_7days    0.276875
dtype: float64

In [26]:
# Sanity check: (rows, columns) of the hospital-admission cohort.
hospital_cohort.shape

(24614, 7)

In [29]:
# Peek at the first rows of the hospital-admission cohort.
hospital_cohort.head()

Unnamed: 0,person_id,visit_occurrence_id,index_datetime,los_hospital_hours,los_hospital_7days,mortality_hospital,age_in_years
0,392784927,11838,2185-05-31 14:23:00,30,0,0,48.451745
1,392801822,47958,2197-03-21 07:15:00,30,0,0,56.563997
2,392807317,44313,2113-10-21 07:02:00,30,0,0,68.930869
3,392784526,11064,2192-03-20 13:42:00,30,0,0,32.648871
4,392802449,38207,2148-05-18 08:01:00,30,0,0,45.234771


In [38]:
## Race concept (id and name) for every person in the hospital cohort
# Looks up each cohort member's race_concept_id in `person` and resolves its name through
# `concept`; no roll-up to broader categories happens here.
query = """
    WITH source_concepts AS (
        SELECT person_id, race_concept_id as concept_id, concept_name as race_concept_name
        FROM {rs_dataset_project}.{rs_dataset}.mimic_hospital_cohort t1
        INNER JOIN {dataset_project}.{dataset}.person as t2 USING (person_id) 
        INNER JOIN {dataset_project}.{dataset}.concept as t3
            ON t2.race_concept_id=t3.concept_id
    )
    SELECT * 
    FROM source_concepts
""".format(**config_dict)
df = db.read_sql_query(query)

Downloading: 100%|██████████| 24614/24614 [00:01<00:00, 17240.56rows/s]


In [39]:
# Show the per-person race concept lookup (the rendered output is elided in the middle).
df

Unnamed: 0,person_id,concept_id,race_concept_name
0,392798498,38003591,Thai
1,392786197,38003591,Thai
2,392815911,38003591,Thai
3,392811541,8557,Native Hawaiian or Other Pacific Islander
4,392811662,8557,Native Hawaiian or Other Pacific Islander
...,...,...,...
24609,392800086,38003614,European
24610,392818408,38003614,European
24611,392812303,38003614,European
24612,392782907,38003614,European


In [107]:
## Assigning new race_eth columns
# Rolls each cohort member's race concept up to a coarse category and attaches it, together
# with the gender concept name, to the hospital cohort:
#   source_concepts   - race_concept_id of every person in the hospital cohort
#   concept_ancestors - every ancestor of that concept from concept_ancestor, falling back to
#                       the concept itself when no ancestor row matches
#   race_eth_rollup   - keeps only ancestors in the target list (currently just 8527) and
#                       resolves their names
#   result            - cohort rows LEFT JOINed to the roll-up; persons without a match get
#                       race_eth_concept_id 0 / name 'Other'
# NOTE(review): if the target list is widened (see the commented-out WHERE clauses), a person
# whose race concept has more than one listed ancestor would produce several rows — confirm
# before switching lists.
# NOTE(review): 86571 in the commented-out lists looks like it may be a typo — verify.
query = """
    WITH source_concepts AS (
        SELECT person_id, race_concept_id as concept_id 
        FROM {rs_dataset_project}.{rs_dataset}.mimic_hospital_cohort
        INNER JOIN {dataset_project}.{dataset}.person USING (person_id)
    ),
    concept_ancestors AS (
        SELECT person_id, COALESCE(ancestor_concept_id, concept_id) as concept_id
        FROM source_concepts t1
        LEFT JOIN {dataset_project}.{dataset}.concept_ancestor as t2
            ON t1.concept_id = t2.descendant_concept_id
    ),
    race_eth_rollup AS (
        SELECT t1.* EXCEPT(concept_id), concept_id as race_eth_concept_id, concept_name as race_eth_concept_name
        FROM concept_ancestors t1
        INNER JOIN {dataset_project}.{dataset}.concept USING (concept_id)
        --WHERE concept_id in (8527, 8515, 8516, 8557, 8657, 86571, 4188159)
        --WHERE concept_id in (8527, 8515, 8516, 86571, 4188159)
        WHERE concept_id in (8527)
    ),
    result AS (
        SELECT 
            t1.*, 
            COALESCE(race_eth_concept_id, 0) as race_eth_concept_id,
            COALESCE(race_eth_concept_name, 'Other') as race_eth_concept_name,
            t3.race_concept_id, t3.race_source_value, t4.concept_name as gender_concept_name
        FROM {rs_dataset_project}.{rs_dataset}.mimic_hospital_cohort t1
        LEFT JOIN race_eth_rollup as t2 USING (person_id)
        LEFT JOIN {dataset_project}.{dataset}.person as t3 USING (person_id)
        INNER JOIN {dataset_project}.{dataset}.concept as t4
            ON t3.gender_concept_id = t4.concept_id
    )
    SELECT * 
    FROM result

""".format_map(config_dict)
df = db.read_sql_query(query)

Downloading: 100%|██████████| 24614/24614 [00:01<00:00, 17046.16rows/s]


In [108]:
# Peek at the cohort with the rolled-up race_eth and gender columns attached.
df.head()

Unnamed: 0,person_id,visit_occurrence_id,index_datetime,los_hospital_hours,los_hospital_7days,mortality_hospital,age_in_years,race_eth_concept_id,race_eth_concept_name,race_concept_id,race_source_value,gender_concept_name
0,392801822,47958,2197-03-21 07:15:00,30,0,0,56.563997,0,Other,4212311,MULTI RACE ETHNICITY,FEMALE
1,392813039,56248,2175-12-03 05:27:00,31,0,0,32.659822,0,Other,4188159,HISPANIC/LATINO - SALVADORAN,FEMALE
2,392801256,42488,2182-07-17 06:39:00,33,0,0,60.197125,0,Other,38003600,BLACK/CAPE VERDEAN,FEMALE
3,392813474,52748,2182-06-03 03:15:00,38,0,0,299.997262,0,Other,4212311,MULTI RACE ETHNICITY,FEMALE
4,392810620,43313,2136-03-25 19:39:00,44,0,0,82.099932,0,Other,4212311,MULTI RACE ETHNICITY,FEMALE


In [111]:
# Number of cohort rows per gender and rolled-up race_eth category.
df.groupby(['gender_concept_name','race_eth_concept_id', 'race_eth_concept_name']).size()

gender_concept_name  race_eth_concept_id  race_eth_concept_name
FEMALE               0                    Other                     3062
                     8527                 White                     7624
MALE                 0                    Other                     3922
                     8527                 White                    10006
dtype: int64

In [20]:
# # Getting MIMIC-specific demographic variables from observation table
# insurance_concepts = [21498981, 45885112, 45883718 ,21498514, 21499435]
# query = """
#     SELECT t1.*, value_as_string as insurance_type
#     FROM {rs_dataset_project}.{rs_dataset}.mimic_hospital_cohort t1
#     INNER JOIN
#         {dataset_project}.{dataset}.observation USING (person_id, visit_occurrence_id)
#     WHERE observation_concept_id = 46235654
# """.format_map(config_dict)
# insurance_df = db.read_sql_query(query)