In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression, SGDClassifier
from prediction_utils.pytorch_utils.metrics import CalibrationEvaluator

In [31]:
db='starr_20200523'
# db='optum'
data_path = os.path.join('/share/pi/nigam/projects/spfohl/cohorts/admissions/', db)

# predictions_path = os.path.join(data_path, 'experiments', 'baseline_tuning_fold_1', 'performance', 'LOS_7', '0.yaml', '1', 'output_df.parquet')
predictions_path = os.path.join(data_path, 'experiments', 'baseline_tuning_fold_1_10', 'performance', 'LOS_7', '0.yaml', '1', 'output_df.parquet')
cohort_path = os.path.join(data_path, 'cohort', 'cohort.parquet')
row_id_map_path = os.path.join(data_path, 'merged_features_binary', 'features_sparse', 'features_row_id_map.parquet')

In [32]:
pred_df = pd.read_parquet(predictions_path)
cohort = pd.read_parquet(cohort_path)
row_id_map = pd.read_parquet(row_id_map_path)

In [33]:
output_df_eval = pred_df.merge(
    row_id_map, left_on="row_id", right_on="features_row_id"
).merge(cohort)

eval_attributes = ['race_eth', 'gender_concept_name', 'age_group']

output_df_long = output_df_eval.melt(
    id_vars=set(output_df_eval.columns) - set(eval_attributes),
    value_vars=eval_attributes,
    var_name="attribute",
    value_name="group",
)

In [34]:
output_df_long

Unnamed: 0,pred_probs,month_mortality,hospital_mortality,discharge_date,readmission_30,features_row_id,outputs,LOS_7,age_in_years,prediction_id,admit_date,fold_id,phase,person_id,row_id,LOS_days,labels,attribute,group
0,0.320541,0,0,2011-03-14,0,16,-0.348077,0,64,7385626856771428939,2011-03-09,1,val,30764035,16,5,0,race_eth,Other
1,0.108406,0,0,2009-03-15,0,21,-1.123326,1,37,-4702183990252977972,2009-03-06,1,val,32106718,21,9,1,race_eth,Black or African American
2,0.070200,0,0,2010-03-27,1,24,-1.406118,0,50,-1431298667878583006,2010-03-24,1,val,30227681,24,3,0,race_eth,Other
3,0.435668,0,0,2010-04-13,0,35,-0.075177,1,75,152682555259088853,2010-03-11,1,val,30227109,35,33,1,race_eth,Other
4,0.071058,0,0,2020-04-14,0,37,-1.382698,0,86,-4997482894161615682,2020-04-12,1,val,32374694,37,2,0,race_eth,Other
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113221,0.112350,0,0,2019-10-08,0,198593,-1.068830,0,18,-2195287455090566522,2019-10-04,test,test,32506977,198593,4,0,age_group,[18-30)
113222,0.220773,0,0,2008-12-14,0,198620,-0.630099,0,20,4586937179410381483,2008-12-10,test,test,30091703,198620,4,0,age_group,[18-30)
113223,0.407975,0,0,2020-04-10,0,198626,-0.140749,0,21,-1325349102893666372,2020-04-04,test,test,30225282,198626,6,0,age_group,[18-30)
113224,0.139420,0,0,2014-06-07,0,198636,-0.981189,0,24,-3817511086782223380,2014-06-05,test,test,30225427,198636,2,0,age_group,[18-30)


In [38]:
from pandas.testing import assert_frame_equal

class CalibrationEvaluator:
    def get_calibration_df(
        self,
        df,
        group_vars=["config_filename", "phase", "task", "attribute", "group"],
    ):
        
        group_vars = [var for var in group_vars if var in df.columns]
        model_dict = {}
        calibration_dict = {}
        
        for group, df_grouped in df.groupby(group_vars):
            print(group)
            df_grouped = df_grouped.query("pred_probs > 0")
            log_probs = np.log(df_grouped.pred_probs.values.reshape(-1, 1))
            model_dict[group] = LogisticRegression(solver="lbfgs", penalty="none", max_iter=1000)
            model_dict[group].fit(log_probs, df_grouped.labels.values)
            calibration_dict[group] = df_grouped.assign(
                density_conditional_y1=model_dict[group].predict_proba(log_probs)[
                    :, -1
                ]
            ).reset_index(drop=True)
        calibration_df = pd.concat(calibration_dict).reset_index(drop=True)
        
        return calibration_df, model_dict

    def get_calibration_df_combined(
        self,
        df,
        group_vars=[
            "sensitive_attribute",
            "config_filename",
            "phase",
            "task",
            "attribute",
        ],
    ):
        group_vars = [var for var in group_vars if var in df.columns]
        df = df[group_vars + ['pred_probs', 'labels', 'group']]
            
        calibration_df_group, model_dict_group = self.get_calibration_df(
            df, group_vars=group_vars + ["group"]
        )
        calibration_df_overall, model_dict_overall = self.get_calibration_df(df, group_vars=group_vars)
        
        # Joining on a floating point value is very slow
        sort_columns = list(set(calibration_df_group.columns) - set(['density_conditional_y1']))
        
        calibration_df_group = calibration_df_group.sort_values(sort_columns).reset_index(drop=True)
        calibration_df_overall = calibration_df_overall.sort_values(sort_columns).reset_index(drop=True)
        calibration_df = calibration_df_group
        calibration_df['density_conditional_y1_overall'] = calibration_df_overall['density_conditional_y1']
        
        return calibration_df, model_dict_group, model_dict_overall

    def get_calibration_result(
        self, df, group_vars=["config_filename", "phase", "task", "attribute", "group"],
    ):
        group_vars = [var for var in group_vars if var in df.columns]
        return (
            df.assign(
                brier_diff_signed=lambda x: x.labels - x.pred_probs,
                brier_diff_squared=lambda x: x.brier_diff_signed ** 2,
                calib_diff_signed=lambda x: x.density_conditional_y1 - x.pred_probs,
                calib_diff_squared=lambda x: x.calib_diff_signed ** 2,
                calib_density_diff_signed=lambda x: x.density_conditional_y1
                - x.density_conditional_y1_overall,
                calib_density_diff_squared=lambda x: x.calib_density_diff_signed ** 2,
            )
            .groupby(group_vars)
            .agg(
                brier=("brier_diff_squared", lambda x: x.mean()),
                brier_signed=("brier_diff_signed", lambda x: x.mean()),
                calib_error=("calib_diff_squared", lambda x: x.mean()),
                calib_error_signed=("calib_diff_signed", lambda x: x.mean()),
                calib_group_error=("calib_density_diff_squared", lambda x: x.mean()),
                calib_group_error_signed=(
                    "calib_density_diff_signed",
                    lambda x: x.mean(),
                ),
            )
            .reset_index()
        )

    def filter_by_group_spec(self, df, group_vars, group_values):
        for group_var, group_value in zip(group_vars, group_values):
            df = df.loc[df[group_var] == group_value]
        return df

In [39]:
evaluator = CalibrationEvaluator()
group_vars = ["phase", "task", "sensitive_attribute", "attribute"]
%time calibration_df, model_dict_group, model_dict_overall = evaluator.get_calibration_df_combined(output_df_long, group_vars=group_vars)
%time calibration_result = evaluator.get_calibration_result(calibration_df, group_vars=group_vars + ['group'])

('test', 'age_group', '[18-30)')
('test', 'age_group', '[30-45)')
('test', 'age_group', '[45-55)')
('test', 'age_group', '[55-65)')
('test', 'age_group', '[65-75)')
('test', 'age_group', '[75-91)')
('test', 'gender_concept_name', 'FEMALE')
('test', 'gender_concept_name', 'MALE')
('test', 'race_eth', 'Asian')
('test', 'race_eth', 'Black or African American')
('test', 'race_eth', 'Hispanic or Latino')
('test', 'race_eth', 'Other')
('test', 'race_eth', 'White')
('val', 'age_group', '[18-30)')
('val', 'age_group', '[30-45)')
('val', 'age_group', '[45-55)')
('val', 'age_group', '[55-65)')
('val', 'age_group', '[65-75)')
('val', 'age_group', '[75-91)')
('val', 'gender_concept_name', 'FEMALE')
('val', 'gender_concept_name', 'MALE')
('val', 'race_eth', 'Asian')
('val', 'race_eth', 'Black or African American')
('val', 'race_eth', 'Hispanic or Latino')
('val', 'race_eth', 'Other')
('val', 'race_eth', 'White')
('test', 'age_group')
('test', 'gender_concept_name')
('test', 'race_eth')
('val', 'age

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

CPU times: user 391 ms, sys: 3.95 ms, total: 395 ms
Wall time: 77.5 ms


In [25]:
print({key: (value.coef_, value.intercept_) for key, value in model_dict_group.items()})
{key: (value.coef_, value.intercept_) for key, value in model_dict_overall.items()}

{('test', 'age_group', '[18-30)'): (array([[1.44140276]]), array([0.71915796])), ('test', 'age_group', '[30-45)'): (array([[1.35877299]]), array([0.54623489])), ('test', 'age_group', '[45-55)'): (array([[1.22867753]]), array([0.55727798])), ('test', 'age_group', '[55-65)'): (array([[1.2022206]]), array([0.6705304])), ('test', 'age_group', '[65-75)'): (array([[1.33212857]]), array([0.92575402])), ('test', 'age_group', '[75-91)'): (array([[1.32317904]]), array([1.03085769])), ('test', 'gender_concept_name', 'FEMALE'): (array([[1.45593926]]), array([1.04899774])), ('test', 'gender_concept_name', 'MALE'): (array([[1.41591774]]), array([0.97715419])), ('test', 'race_eth', 'Other'): (array([[1.44209551]]), array([1.02268419])), ('val', 'age_group', '[18-30)'): (array([[1.40630799]]), array([0.62935198])), ('val', 'age_group', '[30-45)'): (array([[1.33788673]]), array([0.50476228])), ('val', 'age_group', '[45-55)'): (array([[1.21099865]]), array([0.52680394])), ('val', 'age_group', '[55-65)')

{('test', 'age_group'): (array([[1.44209551]]), array([1.02268419])),
 ('test', 'gender_concept_name'): (array([[1.44209551]]), array([1.02268419])),
 ('test', 'race_eth'): (array([[1.44209551]]), array([1.02268419])),
 ('val', 'age_group'): (array([[1.43614933]]), array([1.01668956])),
 ('val', 'gender_concept_name'): (array([[1.43614933]]), array([1.01668956])),
 ('val', 'race_eth'): (array([[1.43614933]]), array([1.01668956]))}

In [8]:
calibration_result

Unnamed: 0,phase,attribute,group,brier,brier_signed,calib_error,calib_error_signed,calib_group_error,calib_group_error_signed
0,test,age_group,[18-30),0.05592,-0.036887,0.002421,-0.036887,0.001296,-0.010087
1,test,age_group,[30-45),0.057131,-0.045318,0.003618,-0.045318,0.002104,-0.01711
2,test,age_group,[45-55),0.114982,-0.055336,0.003146,-0.055337,0.000951,-0.023042
3,test,age_group,[55-65),0.146898,-0.030047,0.001417,-0.030047,0.000129,-0.007242
4,test,age_group,[65-75),0.18014,-0.010222,0.002122,-0.010222,0.00021,-0.006084
5,test,age_group,[75-91),0.230606,0.067382,0.007413,0.067382,0.003023,0.048995
6,test,gender_concept_name,FEMALE,0.116256,-0.008749,0.000836,-0.008749,7.5e-05,0.005813
7,test,gender_concept_name,MALE,0.160503,-0.023332,0.002949,-0.023332,0.000346,-0.009666
8,test,race_eth,Other,0.132873,-0.014225,0.001284,-0.014225,0.0,0.0
9,val,age_group,[18-30),0.055205,-0.038325,0.002683,-0.038325,0.001497,-0.011764


In [9]:
# db='starr_20200523'
db='optum'
data_path = os.path.join('/share/pi/nigam/projects/spfohl/cohorts/admissions/', db)

predictions_path = os.path.join(data_path, 'experiments', 'fair_tuning_fold_1', 'performance', 'LOS_7', 'age_group', '0.yaml', '1', 'output_df.parquet')
cohort_path = os.path.join(data_path, 'cohort', 'cohort.parquet')
row_id_map_path = os.path.join(data_path, 'merged_features_binary', 'features_sparse', 'features_row_id_map.parquet')

pred_df = pd.read_parquet(predictions_path)
cohort = pd.read_parquet(cohort_path)
row_id_map = pd.read_parquet(row_id_map_path)
output_df_eval = pred_df.merge(
    row_id_map, left_on="row_id", right_on="features_row_id"
).merge(cohort)

eval_attributes = ['race_eth', 'gender_concept_name', 'age_group']
output_df_long = output_df_eval.melt(
    id_vars=set(output_df_eval.columns) - set(eval_attributes),
    value_vars=eval_attributes,
    var_name="attribute",
    value_name="group",
)

In [10]:
evaluator = CalibrationEvaluator()
group_vars = ["phase", "task", "sensitive_attribute", "attribute"]
%time calibration_df, model_dict_group, model_dict_overall = evaluator.get_calibration_df_combined(output_df_long, group_vars=group_vars)
%time calibration_result = evaluator.get_calibration_result(calibration_df, group_vars=group_vars + ['group'])

('test', 'age_group', '[18-30)')
('test', 'age_group', '[30-45)')
('test', 'age_group', '[45-55)')
('test', 'age_group', '[55-65)')
('test', 'age_group', '[65-75)')
('test', 'age_group', '[75-91)')
('test', 'gender_concept_name', 'FEMALE')
('test', 'gender_concept_name', 'MALE')
('test', 'race_eth', 'Other')
('val', 'age_group', '[18-30)')
('val', 'age_group', '[30-45)')
('val', 'age_group', '[45-55)')
('val', 'age_group', '[55-65)')
('val', 'age_group', '[65-75)')
('val', 'age_group', '[75-91)')
('val', 'gender_concept_name', 'FEMALE')
('val', 'gender_concept_name', 'MALE')
('val', 'race_eth', 'Other')
('test', 'age_group')
('test', 'gender_concept_name')
('test', 'race_eth')
('val', 'age_group')
('val', 'gender_concept_name')
('val', 'race_eth')
  phase            attribute  pred_probs  labels    group  \
0  test  gender_concept_name    0.000057       0   FEMALE   
1  test             race_eth    0.000057       0    Other   
2  test            age_group    0.000057       0  [30-45)  

In [11]:
calibration_result

Unnamed: 0,phase,attribute,group,brier,brier_signed,calib_error,calib_error_signed,calib_group_error,calib_group_error_signed
0,test,age_group,[18-30),0.051202,-0.021056,0.00061,-0.021056,0.000674,-0.017048
1,test,age_group,[30-45),0.051505,-0.022333,0.000783,-0.022333,0.000876,-0.018041
2,test,age_group,[45-55),0.107223,-0.010723,0.00049,-0.010723,0.000582,-0.010868
3,test,age_group,[55-65),0.139341,0.006153,0.000681,0.006153,0.000386,0.00339
4,test,age_group,[65-75),0.168092,0.014358,0.001034,0.014358,0.000139,0.008922
5,test,age_group,[75-91),0.211801,0.031831,0.002785,0.031832,0.000858,0.028183
6,test,gender_concept_name,FEMALE,0.107092,-0.000747,0.000485,-0.000747,3e-06,0.000724
7,test,gender_concept_name,MALE,0.14961,0.002642,0.000516,0.002642,9e-06,-0.001204
8,test,race_eth,Other,0.123059,0.000526,0.000494,0.000526,0.0,0.0
9,val,age_group,[18-30),0.050638,-0.02226,0.000725,-0.02226,0.000846,-0.018569
