In [1]:
import numpy as np
import os
import torch
import pandas as pd
import yaml

from sklearn.externals import joblib

from pytorch_utils.datasets import ArrayDataset
from pytorch_utils.models import FeedforwardNetModel, FixedWidthModel
import pytorch_utils

In [2]:
# Outcome column to model; it also names the per-outcome output folders below.
outcome = 'los'

# Root folder under which every input and output of this notebook lives.
data_path = 'data/'

# Inputs: features (feature set 0), labels, and the baseline grid configs.
features_path = os.path.join(data_path, 'features', str(0))
label_path = os.path.join(data_path, 'labels')
config_path = os.path.join(data_path, 'config', 'grid', 'baseline')

# Outputs: checkpoints and performance tables share the same
# 'scratch/<outcome>' suffix, so build it once and reuse it.
scratch_subdir = os.path.join('scratch', outcome)
checkpoints_path = os.path.join(data_path, 'checkpoints', scratch_subdir)
performance_path = os.path.join(data_path, 'performance', scratch_subdir)

In [3]:
# Make sure both output directories exist before anything is written to them.
for output_dir in (checkpoints_path, performance_path):
    os.makedirs(output_dir, exist_ok=True)

In [4]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [5]:
# Load the serialized feature and label dictionaries from disk.
# NOTE: joblib.load unpickles the file, so only point these paths at trusted data.
features_file = os.path.join(features_path, 'features.pkl')
label_file = os.path.join(label_path, 'label_dict.pkl')
features_dict = joblib.load(features_file)
label_dict = joblib.load(label_file)

In [6]:
# Identifier of this run; it is formatted into the checkpoint and
# performance file names written later in the notebook.
experiment_name = 1

In [7]:
# Feature matrix for every split present in the features file.
data_dict = {split: split_entry['features'] for split, split_entry in features_dict.items()}
# Labels of the selected outcome for every split present in the label file.
outcome_dict = {split: split_labels[outcome] for split, split_labels in label_dict.items()}

In [8]:
# with open(os.path.join(config_path, '{}.yaml'.format(experiment_name)), 'r') as fp:
#     config_dict = yaml.load(fp)
    
# config_dict['num_epochs'] = 3 # For testing

## A more complex network
# config_dict = {
#     'input_dim' : data_dict['train'].shape[1],
#     'lr' : 1e-5,
#     'num_epochs' : 3,
#     'batch_size' : 256,
#     'hidden_dim' : 128,
#     'num_hidden' : 2,
#     'output_dim' : 2,
#     'drop_prob' : 0.75,
#     'normalize' : True,
#     'iters_per_epoch' : 100,
#     'gamma' : 0.99,
#     'resnet' : True,
#     'sparse' : True,
#     'sparse_mode' : 'binary'
# }
# model = FixedWidthModel(config_dict)

# Fix the RNG seeds before the model is built so that weight initialisation
# and any numpy / torch sampling during training are repeatable when the
# notebook is re-run from a fresh kernel.
# NOTE(review): pytorch_utils may draw from other RNG sources as well (e.g.
# Python's `random`) — confirm if bit-exact reproducibility is required.
seed = 0
np.random.seed(seed)
torch.manual_seed(seed)

# Hyper-parameters handed to the project-local FeedforwardNetModel.
config_dict = {
    # Input width is the number of columns of the training feature matrix.
    'input_dim' : data_dict['train'].shape[1],
    'lr' : 1e-3,
    'num_epochs' : 3,
    'batch_size' : 256,
    'hidden_dim_list' : [128, 64],
    'output_dim' : 2,
    'drop_prob' : 0.75,
    'normalize' : True,
    'iters_per_epoch' : 100,
    'gamma' : 0.99,
    'resnet' : True,
    'sparse' : True,
    'sparse_mode' : 'binary'
}
model = FeedforwardNetModel(config_dict)

# Print the top-level submodules of the wrapped network to sanity-check the
# architecture that the config produced.
for child in model.model.children():
    print(child)

HiddenLinearLayer(
  (linear): LinearLayerWrapper(
    (linear): EmbeddingBagLinear(
      in_features=368117, out_features=128, bias=True
      (embed): EmbeddingBag(368117, 128, mode=sum)
    )
  )
  (dropout): Dropout(p=0.75)
  (normalize_layer): LayerNorm(torch.Size([128]), eps=1e-05, elementwise_affine=True)
)
ModuleList(
  (0): HiddenLinearLayer(
    (linear): LinearLayerWrapper(
      (linear): EmbeddingBagLinear(
        in_features=368117, out_features=128, bias=True
        (embed): EmbeddingBag(368117, 128, mode=sum)
      )
    )
    (dropout): Dropout(p=0.75)
    (normalize_layer): LayerNorm(torch.Size([128]), eps=1e-05, elementwise_affine=True)
  )
  (1): ResidualBlock(
    (layer1): HiddenLinearLayer(
      (linear): LinearLayerWrapper(
        (linear): Linear(in_features=128, out_features=128, bias=True)
      )
      (dropout): Dropout(p=0.75)
      (normalize_layer): LayerNorm(torch.Size([128]), eps=1e-05, elementwise_affine=True)
    )
    (layer2): HiddenLinearLaye

In [9]:
%%time
# Fit the model on the per-split features and labels. `result` is later
# passed to model.process_result_dict to tabulate the training metrics.
result = model.train(data_dict, outcome_dict)

Epoch 0/2
----------
Phase: train:
 loss: 0.720706,
 auc: 0.532003, auprc: 0.218294, brier: 0.208352,
Phase: val:
 loss: 0.488385,
 auc: 0.766164, auprc: 0.427121, brier: 0.156146,
Best model updated
Epoch 1/2
----------
Phase: train:
 loss: 0.530080,
 auc: 0.568173, auprc: 0.241843, brier: 0.170036,
Phase: val:
 loss: 0.493818,
 auc: 0.773188, auprc: 0.439105, brier: 0.157480,
Epoch 2/2
----------
Phase: train:
 loss: 0.492061,
 auc: 0.627839, auprc: 0.284985, brier: 0.158318,
Phase: val:
 loss: 0.482564,
 auc: 0.791554, auprc: 0.465549, brier: 0.153159,
Best model updated
Best val performance: 0.482564
CPU times: user 22.2 s, sys: 1.89 s, total: 24.1 s
Wall time: 19.7 s


In [10]:
# Run the model on the validation and test splits. The return value is
# indexed as a sequence further down: element 1 is the part handed to
# model.process_result_dict.
result_eval = model.predict(data_dict, outcome_dict, phases = ['val', 'test'])

In [11]:
# Display the raw prediction result for inspection.
result_eval

({'val': {'outputs': array([[2.842108 , 1.6990509],
          [3.1794379, 2.192236 ],
          [2.6950653, 1.6928692],
          ...,
          [2.770583 , 1.5288856],
          [2.8603177, 2.065533 ],
          [3.3030226, 2.2160127]], dtype=float32),
   'pred_probs': array([[0.7582405 , 0.24175952],
          [0.7285349 , 0.2714651 ],
          [0.73149014, 0.26850986],
          ...,
          [0.7758593 , 0.22414069],
          [0.68885773, 0.3111422 ],
          [0.7478183 , 0.25218177]], dtype=float32),
   'labels': array([0, 1, 0, ..., 0, 0, 0])},
  'test': {'outputs': array([[2.8326886, 1.4966626],
          [3.0463965, 2.0163543],
          [2.771066 , 1.6888778],
          ...,
          [2.8095229, 1.9429191],
          [2.8411334, 2.059024 ],
          [2.669098 , 1.3839407]], dtype=float32),
   'pred_probs': array([[0.79183567, 0.20816433],
          [0.73692405, 0.26307595],
          [0.74690783, 0.25309214],
          ...,
          [0.70403856, 0.29596147],
          

In [12]:
## Save weights
# The checkpoint file is named after the experiment identifier.
checkpoint_file = os.path.join(checkpoints_path, '{}.chk'.format(experiment_name))
model.save_weights(checkpoint_file)

In [13]:
# Convert the training history into a DataFrame.
result_df_training = model.process_result_dict(result)

# Only element 1 of the predict() return value is tabulated here.
eval_result = result_eval[1]
result_df_eval = model.process_result_dict(eval_result)

# Show the training table first, then the evaluation table.
for frame in (result_df_training, result_df_eval):
    print(frame)

   metric  phase  epoch  performance
0     auc  train      0     0.532003
1     auc  train      1     0.568173
2     auc  train      2     0.627839
3   auprc  train      0     0.218294
4   auprc  train      1     0.241843
5   auprc  train      2     0.284985
6   brier  train      0     0.208352
7   brier  train      1     0.170036
8   brier  train      2     0.158318
9    loss  train      0     0.720706
10   loss  train      1     0.530080
11   loss  train      2     0.492061
12    auc    val      0     0.766164
13    auc    val      1     0.773188
14    auc    val      2     0.791554
15  auprc    val      0     0.427121
16  auprc    val      1     0.439105
17  auprc    val      2     0.465549
18  brier    val      0     0.156146
19  brier    val      1     0.157480
20  brier    val      2     0.153159
21   loss    val      0     0.488385
22   loss    val      1     0.493818
23   loss    val      2     0.482564
  metric phase  epoch  performance
0    auc   val      0     0.791554
1  au

In [14]:
## Get performance by group
sensitive_variables = ['race_eth', 'gender', 'age']

# Step 1: slice the features and outcomes of every split by group membership.
data_dict_by_group = {}
outcome_dict_by_group = {}
for sensitive_variable in sensitive_variables:
    data_dict_by_group[sensitive_variable] = {}
    outcome_dict_by_group[sensitive_variable] = {}
    # The set of groups is taken from the training split only.
    groups = np.unique(label_dict['train'][sensitive_variable])
    for group in groups:
        group_data = {}
        group_outcomes = {}
        for split in data_dict.keys():
            # Boolean membership mask of this group within the split.
            in_group = label_dict[split][sensitive_variable] == group
            group_data[split] = data_dict[split][in_group]
            group_outcomes[split] = outcome_dict[split][in_group]
        data_dict_by_group[sensitive_variable][group] = group_data
        outcome_dict_by_group[sensitive_variable][group] = group_outcomes

# Step 2: evaluate the model on the val/test splits of every group and stack
# the per-group tables, keyed first by sensitive variable and then by group.
frames_by_variable = {}
for sensitive_variable in data_dict_by_group.keys():
    frames_by_group = {}
    for group in data_dict_by_group[sensitive_variable].keys():
        group_result = model.predict(
            data_dict_by_group[sensitive_variable][group],
            outcome_dict_by_group[sensitive_variable][group],
            phases = ['val', 'test']
        )
        frames_by_group[group] = model.process_result_dict(group_result[1])
    frames_by_variable[sensitive_variable] = pd.concat(frames_by_group)
result_df_by_group = pd.concat(frames_by_variable)

# Step 3: name the three index levels, then turn the two outer levels into
# ordinary columns so that only the original row index remains as the index.
named_index = result_df_by_group.index.set_names(['sensitive_variable', 'group', 'index'])
result_df_by_group.index = named_index
result_df_by_group = result_df_by_group.reset_index(level = [0, 1])
result_df_by_group.head()

Unnamed: 0_level_0,sensitive_variable,group,metric,phase,epoch,performance
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,age,0,auc,val,0,0.843472
1,age,0,auprc,val,0,0.521279
2,age,0,brier,val,0,0.14006
3,age,0,loss,val,0,0.451907
4,age,0,auc,test,0,0.812588


In [15]:
# Display the per-group performance table built in the previous cell.
result_df_by_group

Unnamed: 0_level_0,sensitive_variable,group,metric,phase,epoch,performance
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,age,0,auc,val,0,0.843472
1,age,0,auprc,val,0,0.521279
2,age,0,brier,val,0,0.140060
3,age,0,loss,val,0,0.451907
4,age,0,auc,test,0,0.812588
5,age,0,auprc,test,0,0.492850
6,age,0,brier,test,0,0.139766
7,age,0,loss,test,0,0.450432
0,age,1,auc,val,0,0.817009
1,age,1,auprc,val,0,0.424521


In [16]:
# Write each performance table as CSV (without the index) into the
# performance folder; file names are prefixed with the experiment identifier.
tables_to_write = [
    ('{}_training', result_df_training),
    ('{}_eval', result_df_eval),
    ('{}_by_group', result_df_by_group),
]
for name_template, frame in tables_to_write:
    target_file = os.path.join(performance_path, name_template.format(experiment_name))
    frame.to_csv(target_file, index = False)