In [94]:
from datasets.labelers import Labeler
import pandas as pd
import os

In [95]:
# Directory that the cohort parquet files are read from and written to.
# Defaults to the original scratch location; set the COHORT_PATH environment
# variable to run the notebook elsewhere without editing this cell.
cohort_path = os.environ.get(
    'COHORT_PATH',
    '/local-scratch/nigam/projects/jlemmon/transfer_learning/experiments/data/cohort',
)

In [96]:
def read_file(filename, columns=None, **kwargs):
    '''
    Helper function to read parquet and csv files into DataFrame

    The reader is picked from the file extension, compared case-insensitively:
    ".parquet" goes to pd.read_parquet and ".csv" goes to pd.read_csv.

    Parameters
    ----------
    filename : str or path-like
        Path of the file to load. It is printed before loading.
    columns : list, optional
        Subset of columns to load; passed as `columns` to pd.read_parquet
        or as `usecols` to pd.read_csv. None loads every column.
    **kwargs
        Forwarded unchanged to the underlying pandas reader.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If the extension is neither ".parquet" nor ".csv". Previously this
        case fell through and silently returned None.
    '''
    print(filename)
    # Lower-case so that e.g. "data.CSV" or "data.PARQUET" is still recognised.
    load_extension = os.path.splitext(filename)[-1].lower()
    if load_extension == ".parquet":
        return pd.read_parquet(filename, columns=columns, **kwargs)
    if load_extension == ".csv":
        return pd.read_csv(filename, usecols=columns, **kwargs)
    # Fail loudly instead of handing None to the caller.
    raise ValueError(
        f"Unsupported file extension {load_extension!r} for {filename!r}; "
        "expected '.parquet' or '.csv'"
    )

In [97]:
# Load the backed-up cohort split from the cohort directory using pyarrow.
cohort = read_file(
    os.path.join(cohort_path, "cohort_split_no_nb_bkup.parquet"),
    engine='pyarrow',
)

/local-scratch/nigam/projects/jlemmon/transfer_learning/experiments/data/cohort/cohort_split_no_nb_bkup.parquet


In [98]:
# Pull out the prediction_id column and display it.
# In the saved output the index labels run past the reported length, so the
# cohort index is not a contiguous 0..n-1 range.
pi = cohort.loc[:, 'prediction_id']
pi

0         -3134472912893814959
1          4480474938827236338
2         -3560330683692477362
3         -4753983747243999108
4         -7773785417047211617
                  ...         
316192     1804162505245553017
316193     2615013708302764373
316194    -4824395189820815920
316195    -2202492482082563123
316196     3653428970191862623
Name: prediction_id, Length: 274430, dtype: Int64

In [99]:
# Instantiate the project Labeler with no arguments; it is configured in the
# next cell via labeler.configure(...).
labeler = Labeler()



In [100]:

# Point the labeler at the cohort table and name the table it should produce.
# NOTE(review): rs_dataset + target_table_name together match the table that
# is queried with pd.read_gbq later in this notebook
# (jlemmon_explore.lab_test_admissions_rollup_filtered_labeled) -- keep the
# two in sync if either is renamed.
# NOTE(review): `dataset` looks like the source OMOP CDM snapshot -- confirm
# against the Labeler implementation.
labeler.configure(
    rs_dataset="jlemmon_explore",
    cohort_name="tl_admission_rollup_temp",
    target_table_name="lab_test_admissions_rollup_filtered_labeled",
    dataset="starr_omop_cdm5_deid_2022_08_01"
)

In [102]:

# Build the label table, passing four labeler ids to exclude_labeler_ids:
# mortality, los_7, icu_admission and readmission_30 (not only readmission).
# NOTE(review): presumably the listed labelers are skipped and every other
# labeler is applied -- confirm against Labeler.create_label_table.
labeler.create_label_table(exclude_labeler_ids=['mortality','los_7','icu_admission','readmission_30'])

# Read the whole labeled table from BigQuery into a DataFrame.
# The fully qualified table name is hard-coded here; it has to agree with the
# rs_dataset / target_table_name values passed to labeler.configure above.
df = pd.read_gbq(
    "select * from `som-nero-nigam-starr.jlemmon_explore.lab_test_admissions_rollup_filtered_labeled`",
    use_bqstorage_api=True
)

In [103]:
# Left-join the BigQuery label table onto the cohort, keyed on person_id,
# admit_date and discharge_date, then discard the aki / hg / np columns and
# their per-label fold-id columns in the same chain.
merge_df = (
    cohort
    .merge(df, how='left', on=['person_id', 'admit_date', 'discharge_date'])
    .drop(
        columns=[
            # aki_* columns
            'aki_base_creatinine',
            'aki_max_creatinine',
            'aki1_creatinine',
            'aki1_creatinine_time',
            'aki1_label',
            'aki2_creatinine',
            'aki2_creatinine_time',
            'aki2_label',
            # hg_* columns
            'hg_min_glucose',
            'hg_glucose',
            'hg_glucose_time',
            'hg_label',
            # np_* columns
            'np_min_neutrophils',
            'np_500_neutrophils',
            'np_500_neutrophils_time',
            'np_500_label',
            'np_1000_neutrophils',
            'np_1000_neutrophils_time',
            'np_1000_label',
            # fold-id columns belonging to the labels dropped above
            'aki1_label_fold_id',
            'aki2_label_fold_id',
            'hg_label_fold_id',
            'np_500_label_fold_id',
            'np_1000_label_fold_id',
        ]
    )
)

In [104]:
# Persist the merged cohort next to the input file, without the index column.
merge_df.to_parquet(
    os.path.join(cohort_path, "cohort_no_nb.parquet"),
    engine="pyarrow",
    index=False,
)

In [105]:
# Display the merged frame (cohort columns plus the remaining label columns)
# as a visual check after the merge and column drop.
merge_df

Unnamed: 0,person_id,admit_date,discharge_date,admit_date_midnight,discharge_date_midnight,hospital_mortality,death_date,month_mortality,LOS_days,LOS_7,...,anemia_dx_label,anemia_dx_start_datetime,hyperkalemia_dx_label,hyperkalemia_dx_start_datetime,hyponatremia_dx_label,hyponatremia_dx_start_datetime,thrombocytopenia_dx_label,thrombocytopenia_dx_start_datetime,neutropenia_dx_label,neutropenia_dx_start_datetime
0,29936887,2019-12-22 22:44:00,2019-12-26 16:00:00,2019-12-22 23:59:00,2019-12-26 23:59:00,0,NaT,0,4,0,...,0,NaT,0,NaT,0,NaT,0,NaT,0,NaT
1,29936888,2010-07-12 06:34:00,2010-07-14 10:38:00,2010-07-12 23:59:00,2010-07-14 23:59:00,0,NaT,0,2,0,...,0,NaT,0,NaT,0,NaT,0,NaT,0,NaT
2,29936900,2014-10-31 15:08:00,2014-11-03 13:25:00,2014-10-31 23:59:00,2014-11-03 23:59:00,0,NaT,0,3,0,...,0,NaT,0,NaT,0,NaT,0,NaT,0,NaT
3,29936906,2013-10-02 19:55:00,2013-10-04 12:43:00,2013-10-02 23:59:00,2013-10-04 23:59:00,0,NaT,0,2,0,...,0,NaT,0,NaT,0,NaT,0,NaT,0,NaT
4,29936914,2018-06-23 05:20:00,2018-06-25 13:48:00,2018-06-23 23:59:00,2018-06-25 23:59:00,0,NaT,0,2,0,...,0,NaT,0,NaT,0,NaT,0,NaT,0,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
274425,80975389,2022-07-01 05:05:00,2022-07-02 11:13:00,2022-07-01 23:59:00,2022-07-02 23:59:00,0,NaT,0,1,0,...,0,NaT,0,NaT,0,NaT,0,NaT,0,NaT
274426,80975457,2022-05-28 20:32:00,2022-06-01 21:01:00,2022-05-28 23:59:00,2022-06-01 23:59:00,0,NaT,0,4,0,...,0,NaT,0,NaT,0,NaT,0,NaT,0,NaT
274427,81037546,2022-07-25 03:03:00,2022-08-07 18:50:00,2022-07-25 23:59:00,2022-08-07 23:59:00,1,2022-08-07,1,13,1,...,1,2022-07-25 03:03:00,0,NaT,1,2022-07-25 03:03:00,1,2022-07-25 03:03:00,0,NaT
274428,85694881,2022-08-04 17:28:00,2022-08-14 12:10:00,2022-08-04 23:59:00,2022-08-14 23:59:00,0,NaT,0,10,1,...,1,2022-08-04 17:28:00,0,NaT,0,NaT,0,NaT,0,NaT


In [106]:
# Inspect the thrombocytopenia_lab_severe_label column of the raw BigQuery
# label table (df, i.e. before it was merged onto the cohort).
df['thrombocytopenia_lab_severe_label']

0         <NA>
1         <NA>
2         <NA>
3         <NA>
4         <NA>
          ... 
527801    <NA>
527802    <NA>
527803    <NA>
527804    <NA>
527805    <NA>
Name: thrombocytopenia_lab_severe_label, Length: 527806, dtype: Int64