In [1]:
import numpy as np
import pandas as pd
import os
import glob
import torch
import torch.nn.functional as F
import joblib
import itertools
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import warnings
import string
from sklearn.metrics import roc_auc_score, average_precision_score, brier_score_loss, recall_score, precision_score
from prediction_utils.util import df_dict_concat, yaml_read
from matplotlib.ticker import FormatStrFormatter

In [2]:
# Project root; every path defined in this cell is derived from it.
project_dir = '/share/pi/nigam/projects/spfohl/cohorts/admissions/mimic_omop/'

# Names of the two experiment runs (baseline and fair).
experiment_name_baseline, experiment_name_fair = (
    'baseline_tuning_fold_1_10',
    'fair_tuning_fold_1_10',
)

# Label columns of the cohort table; they are melted into (`task`, `labels`) downstream.
tasks = [
    'los_icu_3days',
    'los_icu_7days',
    'mortality_hospital',
    'mortality_icu',
]

# Input files: the cohort table and the feature-matrix row-id map.
cohort_path = os.path.join(project_dir, 'cohort', 'cohort.parquet')
row_id_map_path = os.path.join(
    project_dir,
    'merged_features_binary/features_sparse/features_row_id_map.parquet',
)

# Directory for the merged results of this analysis.
result_path = os.path.join(project_dir, 'experiments', 'merged_results_fold_1_10')
# Create the merged-results directory (and any missing parents) up front;
# exist_ok=True makes re-running this cell a no-op when it already exists.
os.makedirs(result_path, exist_ok=True)

In [3]:
# Cohort columns used as grouping attributes in the summary tables below.
attributes = [
    'gender_concept_name',
    'age_group',
    'race_eth',
]

In [4]:
# Read the row-id map first, then load the cohort and join the map onto it.
# `merge` is called without `on`/`how`, so pandas performs an inner join on
# every column name the two frames have in common.
row_id_map = pd.read_parquet(row_id_map_path)
cohort = pd.read_parquet(cohort_path).merge(row_id_map)

In [5]:
### Cohort table
# Reshape the cohort to long format in two passes:
#   inner melt - stacks the task columns into (`task`, `labels`) pairs while
#                carrying `person_id` and the attribute columns along;
#   outer melt - stacks the attribute columns into (`attribute`, `group`) pairs.
# The result has one row per cohort row x task x attribute.
cohort_df_long = pd.melt(
    pd.melt(
        cohort,
        id_vars=['person_id', *attributes],
        value_vars=tasks,
        var_name='task',
        value_name='labels',
    ),
    id_vars=['person_id', 'task', 'labels'],
    value_vars=attributes,
    var_name='attribute',
    value_name='group',
)

In [6]:
# Number of rows in each (attribute, group).
# Sizes are counted per task with the built-in 'size' aggregation (which, like
# a plain row count, includes rows whose label is missing). Every task
# contributes the same rows to `cohort_df_long`, so after dropping `task` the
# per-task counts collapse to a single row per (attribute, group).
group_size_df = (
    cohort_df_long
    .groupby(['task', 'attribute', 'group'])
    .agg(size=('labels', 'size'))
    .reset_index()
    .drop(columns='task')
    .drop_duplicates()
)

# Summary table indexed by (attribute, group) with columns `size` followed by
# the mean label (prevalence) of every task.
cohort_statistics_df = (
    cohort_df_long
    # Mean label per (task, attribute, group).
    .groupby(['task', 'attribute', 'group'])
    .agg(prevalence=('labels', 'mean'))
    .reset_index()
    # One column per task. Pivoting on both index levels at once avoids a
    # per-attribute groupby().apply(), whose handling of the grouping column
    # is deprecated in recent pandas releases.
    .pivot_table(index=['attribute', 'group'], columns='task', values='prevalence')
    .reset_index()
    # Explicit keys; validate= raises instead of silently duplicating table
    # rows if a group ever ends up with more than one distinct size.
    .merge(group_size_df, on=['attribute', 'group'], validate='one_to_one')
    .set_index(['attribute', 'group'])
    [['size'] + tasks]
)

In [7]:
# Display the summary table: group size and per-task mean label for each (attribute, group).
cohort_statistics_df

Unnamed: 0_level_0,Unnamed: 1_level_0,size,los_icu_3days,los_icu_7days,mortality_hospital,mortality_icu
attribute,group,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
age_group,[15-30),1345,0.274349,0.049071,0.038662,0.023792
age_group,[30-45),2621,0.27356,0.049981,0.054178,0.033193
age_group,[45-55),3865,0.296507,0.050453,0.074256,0.042173
age_group,[55-65),5358,0.307577,0.052445,0.076894,0.045539
age_group,[65-75),5620,0.327758,0.057117,0.096085,0.055694
age_group,[75-91),7361,0.356473,0.05828,0.14047,0.079337
gender_concept_name,FEMALE,11108,0.325981,0.056806,0.101548,0.059327
gender_concept_name,MALE,15062,0.313703,0.052583,0.088899,0.050724
race_eth,Other,7639,0.325173,0.057861,0.105773,0.062443
race_eth,White,18531,0.316335,0.052938,0.089526,0.05105


In [8]:
## Write to LaTeX
# Output directory for the generated table, relative to the working directory.
table_path = './../figures/mimic_omop/'
os.makedirs(table_path, exist_ok=True)

with open(os.path.join(table_path, 'cohort_table.txt'), 'w') as fp:
    # Drop the `attribute` index level so rows are labelled by `group` only,
    # render the frame as a LaTeX tabular and write the resulting string out.
    # '%.3g' formats floating-point cells with three significant digits.
    fp.write(
        cohort_statistics_df
        .droplevel('attribute')
        .to_latex(
            float_format='%.3g',
            index_names=False,
            index=True,
        )
    )