In [1]:
import numpy as np
import os
import torch
import pandas as pd
import yaml

from sklearn.externals import joblib

from pytorch_utils.datasets import ArrayDataset
from pytorch_utils.models import FeedforwardNetModel
import pytorch_utils

In [2]:
# Label column to model; it also names the per-outcome output folders.
outcome = 'los'

# Root folder for every input and output of this notebook.
data_path = 'data/'

# Inputs.
features_path = os.path.join(data_path, 'features', str(0))
label_path = os.path.join(data_path, 'labels')
config_path = os.path.join(data_path, 'config', 'grid', 'baseline')

# Outputs: <root>/<kind>/scratch/<outcome>.
checkpoints_path, performance_path = (
    os.path.join(data_path, kind, 'scratch', outcome)
    for kind in ('checkpoints', 'performance')
)

In [3]:
# Make sure both output folders exist; already-existing folders are not an error.
for output_dir in (checkpoints_path, performance_path):
    os.makedirs(output_dir, exist_ok=True)

In [4]:
# Use the GPU when PyTorch reports one as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [5]:
# NOTE(review): joblib.load unpickles its input, so these paths must only
# ever point at trusted files.
features_file = os.path.join(features_path, 'features.pkl')
label_file = os.path.join(label_path, 'label_dict.pkl')

features_dict = joblib.load(features_file)
label_dict = joblib.load(label_file)

In [6]:
# Identifier of this run; it is formatted into the checkpoint and
# performance file names written further down.
experiment_name = 1

In [7]:
# Per-split model inputs: the 'features' entry of each split's feature record.
data_dict = {split: record['features'] for split, record in features_dict.items()}
# Per-split targets: the column/entry for the selected outcome.
outcome_dict = {split: labels[outcome] for split, labels in label_dict.items()}

In [8]:
# Alternative: read the configuration for this experiment from the grid folder.
# with open(os.path.join(config_path, '{}.yaml'.format(experiment_name)), 'r') as fp:
#     config_dict = yaml.load(fp)

# config_dict['num_epochs'] = 3 # For testing

## A more complex network
# input_dim is taken from the second dimension of the training features.
config_dict = dict(
    input_dim=data_dict['train'].shape[1],
    lr=1e-5,
    num_epochs=30,
    batch_size=256,
    hidden_dim=128,
    num_hidden=1,
    output_dim=2,
    drop_prob=0.75,
    normalize=True,
    iters_per_epoch=100,
    gamma=0.99,
    resnet=True,
    sparse=True,
    sparse_mode='binary',
)

## Logistic Regression
# Same keys as above with no hidden layer, no dropout, no normalization
# and no resnet; fewer epochs.
# config_dict = {
#     'input_dim' : data_dict['train'].shape[1],
#     'lr' : 1e-5,
#     'num_epochs' : 20,
#     'batch_size' : 256,
#     'hidden_dim' : 128,
#     'num_hidden' : 0,
#     'output_dim' : 2,
#     'drop_prob' : 0.0,
#     'normalize' : False,
#     'iters_per_epoch' : 100,
#     'gamma' : 0.99,
#     'resnet' : False,
#     'sparse' : True,
#     'sparse_mode' : 'binary'
# }

In [9]:
# Display the configuration that will be passed to the model.
config_dict

{'input_dim': 368117,
 'lr': 1e-05,
 'num_epochs': 30,
 'batch_size': 256,
 'hidden_dim': 128,
 'num_hidden': 1,
 'output_dim': 2,
 'drop_prob': 0.75,
 'normalize': True,
 'iters_per_epoch': 100,
 'gamma': 0.99,
 'resnet': True,
 'sparse': True,
 'sparse_mode': 'binary'}

In [10]:
# Build the model from the configuration dict defined above.
model = FeedforwardNetModel(config_dict)

In [11]:
# List the direct sub-modules of the wrapped network, one per print call.
for submodule in model.model.children():
    print(submodule)

HiddenLinearLayer(
  (linear): LinearLayerWrapper(
    (linear): EmbeddingBagLinear(
      in_features=368117, out_features=128, bias=True
      (embed): EmbeddingBag(368117, 128, mode=sum)
    )
  )
  (dropout): Dropout(p=0.75)
  (normalize_layer): LayerNorm(torch.Size([128]), eps=1e-05, elementwise_affine=True)
)
ModuleList(
  (0): HiddenLinearLayer(
    (linear): LinearLayerWrapper(
      (linear): EmbeddingBagLinear(
        in_features=368117, out_features=128, bias=True
        (embed): EmbeddingBag(368117, 128, mode=sum)
      )
    )
    (dropout): Dropout(p=0.75)
    (normalize_layer): LayerNorm(torch.Size([128]), eps=1e-05, elementwise_affine=True)
  )
  (1): Linear(in_features=128, out_features=2, bias=True)
)
Linear(in_features=128, out_features=2, bias=True)


In [12]:
%%time
# Train on the per-split features and outcomes; the return value is kept
# so it can be turned into a results table later.
result = model.train(data_dict, outcome_dict)

Epoch 0/29
----------
Phase: train:
 loss: 0.551042,
 auc: 0.629967, auprc: 0.280831, brier: 0.171044,
Phase: val:
 loss: 0.421038,
 auc: 0.776039, auprc: 0.457692, brier: 0.135734,
Best model updated
Epoch 1/29
----------
Phase: train:
 loss: 0.445136,
 auc: 0.737439, auprc: 0.397938, brier: 0.142216,
Phase: val:
 loss: 0.406940,
 auc: 0.796348, auprc: 0.488625, brier: 0.131421,
Best model updated
Epoch 2/29
----------
Phase: train:
 loss: 0.429316,
 auc: 0.774847, auprc: 0.463903, brier: 0.138262,
Phase: val:
 loss: 0.397943,
 auc: 0.805851, auprc: 0.506099, brier: 0.128218,
Best model updated
Epoch 3/29
----------
Phase: train:
 loss: 0.417246,
 auc: 0.782361, auprc: 0.475069, brier: 0.133851,
Phase: val:
 loss: 0.391626,
 auc: 0.814382, auprc: 0.520845, brier: 0.126240,
Best model updated
Epoch 4/29
----------
Phase: train:
 loss: 0.408786,
 auc: 0.793306, auprc: 0.488606, brier: 0.130890,
Phase: val:
 loss: 0.386905,
 auc: 0.820619, auprc: 0.531641, brier: 0.124786,
Best model upd

In [13]:
# Run prediction on the validation and test splits only.
# Later cells index the returned value with [1] before building tables.
result_eval = model.predict(data_dict, outcome_dict, phases = ['val', 'test'])

In [14]:
# Inspect the raw prediction result (per-phase outputs, pred_probs and labels).
result_eval

({'val': {'outputs': array([[ 1.5389009 , -1.9386427 ],
          [ 0.6821247 ,  0.3053855 ],
          [ 1.0206412 , -1.1488929 ],
          ...,
          [ 1.2974274 , -0.67337537],
          [ 0.45683527, -0.06768572],
          [ 1.0755595 , -0.40655932]], dtype=float32),
   'pred_probs': array([[0.970042  , 0.02995798],
          [0.5930864 , 0.40691358],
          [0.8974801 , 0.10251988],
          ...,
          [0.87769735, 0.12230269],
          [0.6282043 , 0.37179565],
          [0.81489235, 0.18510757]], dtype=float32),
   'labels': array([0, 1, 0, ..., 0, 0, 0])},
  'test': {'outputs': array([[ 2.3771539 , -1.0269226 ],
          [ 1.0082144 , -1.6580966 ],
          [ 2.1443512 , -1.3769734 ],
          ...,
          [ 1.1553407 ,  0.21470371],
          [-0.2818885 ,  1.3498124 ],
          [ 2.9866784 , -1.8732203 ]], dtype=float32),
   'pred_probs': array([[0.96783173, 0.03216831],
          [0.9350092 , 0.0649908 ],
          [0.9712885 , 0.02871153],
          ...

In [15]:
## Save weights
# The checkpoint file is named after the experiment identifier.
checkpoint_file = os.path.join(checkpoints_path, '{}.chk'.format(experiment_name))
model.save_weights(checkpoint_file)

NameError: name 'grid_element' is not defined

In [None]:
# Turn the training history and the evaluation result into tables.
result_df_training = model.process_result_dict(result)
# Only element 1 of the predict() result is passed on for tabulation.
result_df_eval = model.process_result_dict(result_eval[1])

for results_table in (result_df_training, result_df_eval):
    print(results_table)

In [None]:
## Get performance by group
sensitive_variables = ['race_eth', 'gender', 'age']

# Step 1: slice features and outcomes per (sensitive variable, group, split).
# The set of groups is enumerated from the training labels only.
data_dict_by_group = {}
outcome_dict_by_group = {}
for sensitive_variable in sensitive_variables:
    features_for_variable = {}
    outcomes_for_variable = {}
    groups = np.unique(label_dict['train'][sensitive_variable])
    for group in groups:
        # One mask per split marking the members of this group.
        membership = {
            split: label_dict[split][sensitive_variable] == group
            for split in data_dict.keys()
        }
        features_for_variable[group] = {
            split: data_dict[split][mask] for split, mask in membership.items()
        }
        outcomes_for_variable[group] = {
            split: outcome_dict[split][mask] for split, mask in membership.items()
        }
    data_dict_by_group[sensitive_variable] = features_for_variable
    outcome_dict_by_group[sensitive_variable] = outcomes_for_variable

# Step 2: evaluate the model on every group and stack the resulting tables,
# keyed first by sensitive variable and then by group.
frames_by_variable = {}
for sensitive_variable, features_by_group in data_dict_by_group.items():
    frames_by_group = {}
    for group, group_features in features_by_group.items():
        group_prediction = model.predict(
            group_features,
            outcome_dict_by_group[sensitive_variable][group],
            phases = ['val', 'test'],
        )
        frames_by_group[group] = model.process_result_dict(group_prediction[1])
    frames_by_variable[sensitive_variable] = pd.concat(frames_by_group)
result_df_by_group = pd.concat(frames_by_variable)

# Step 3: name the index levels, then move the two outer levels into columns.
result_df_by_group.index = result_df_by_group.index.set_names(['sensitive_variable', 'group', 'index'])
result_df_by_group = result_df_by_group.reset_index(level = [0, 1])
result_df_by_group.head()

In [None]:
# Display the complete per-group results table.
result_df_by_group

In [None]:
# Write each results table into the performance folder, without its index.
# File names are the experiment identifier followed by a fixed suffix.
tables_to_write = [
    ('{}_training', result_df_training),
    ('{}_eval', result_df_eval),
    ('{}_by_group', result_df_by_group),
]
for name_template, results_table in tables_to_write:
    target_file = os.path.join(performance_path, name_template.format(experiment_name))
    results_table.to_csv(target_file, index = False)