In [1]:
import os

import numpy as np
import pandas as pd
import torch
import yaml

try:
    # Vendored copy shipped with older scikit-learn releases; kept as the
    # first choice so existing environments behave exactly as before.
    from sklearn.externals import joblib
except ImportError:
    # scikit-learn >= 0.23 removed sklearn.externals.joblib.
    import joblib

import pytorch_utils
from pytorch_utils.datasets import ArrayDataset
from pytorch_utils.models import FeedforwardNetModel

In [2]:
# Name of the label column to predict; also namespaces the output folders.
outcome = 'los'

# Root directory under which every input and output of this notebook lives.
data_path = 'data/'

# Input locations (features, labels, hyper-parameter grid).
features_path, label_path, config_path = (
    os.path.join(data_path, *parts)
    for parts in (
        ('features', str(0)),
        ('labels',),
        ('config', 'grid', 'baseline'),
    )
)

# Output locations, one sub-tree per outcome.
checkpoints_path, performance_path = (
    os.path.join(data_path, kind, 'scratch', outcome)
    for kind in ('checkpoints', 'performance')
)

In [3]:
# Create the output folders up front; exist_ok makes the cell safe to re-run.
for output_dir in (checkpoints_path, performance_path):
    os.makedirs(output_dir, exist_ok=True)

In [4]:
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [5]:
# Locate the serialized feature and label dictionaries.
features_file = os.path.join(features_path, 'features.pkl')
labels_file = os.path.join(label_path, 'label_dict.pkl')

# NOTE: joblib.load unpickles its input, so only point this at trusted files.
features_dict = joblib.load(features_file)
label_dict = joblib.load(labels_file)

In [6]:
# Identifier of the hyper-parameter grid entry being run; it is used further
# down to name the checkpoint file and the performance result files.
grid_element = 1

In [7]:
# Per-split feature data, pulled out of each split's 'features' entry.
data_dict = {
    split: split_payload['features']
    for split, split_payload in features_dict.items()
}

# Per-split labels for the selected outcome.
outcome_dict = {
    split: split_labels[outcome]
    for split, split_labels in label_dict.items()
}

In [8]:
# with open(os.path.join(config_path, '{}.yaml'.format(grid_element)), 'r') as fp:
#     config_dict = yaml.load(fp)
    
# config_dict['num_epochs'] = 3 # For testing

## A more complex network
# config_dict = {
#     'input_dim' : data_dict['train'].shape[1],
#     'lr' : 1e-5,
#     'num_epochs' : 20,
#     'batch_size' : 256,
#     'hidden_dim' : 128,
#     'num_hidden' : 1,
#     'output_dim' : 2,
#     'drop_prob' : 0.5,
#     'normalize' : True,
#     'iters_per_epoch' : 100,
#     'gamma' : 0.99,
#     'resnet' : True,
#     'sparse' : True,
#     'sparse_mode' : 'binary'
# }

## Logistic regression baseline: no hidden layers, no dropout, no
## normalization and no residual connections.
## NOTE(review): hidden_dim is presumably unused when num_hidden is 0 -- confirm.
config_dict = dict(
    input_dim=data_dict['train'].shape[1],  # number of feature columns
    lr=1e-5,
    num_epochs=20,
    batch_size=256,
    hidden_dim=128,
    num_hidden=0,
    output_dim=2,
    drop_prob=0.0,
    normalize=False,
    iters_per_epoch=100,
    gamma=0.99,
    resnet=False,
    sparse=True,
    sparse_mode='binary',
)

In [9]:
# Echo the resolved configuration so the run's settings are recorded in the output.
config_dict

{'input_dim': 368117,
 'lr': 1e-05,
 'num_epochs': 20,
 'batch_size': 256,
 'hidden_dim': 128,
 'num_hidden': 0,
 'output_dim': 2,
 'drop_prob': 0.0,
 'normalize': False,
 'iters_per_epoch': 100,
 'gamma': 0.99,
 'resnet': False,
 'sparse': True,
 'sparse_mode': 'binary'}

In [10]:
# Seed the global RNGs immediately before building the model so that the
# random weight initialisation (and any RNG-driven batching during training)
# is repeatable across "Restart & Run All" executions.
# NOTE(review): assumes pytorch_utils draws from the numpy/torch global RNGs;
# some GPU kernels may still be non-deterministic -- confirm.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

# Build the model wrapper from the hyper-parameter dictionary.
model = FeedforwardNetModel(config_dict)

In [11]:
# Print each direct sub-module of the wrapped network to sanity-check its layout.
submodules = model.model.children()
for submodule in submodules:
    print(submodule)

LinearLayerWrapper(
  (linear): EmbeddingBagLinear(
    in_features=368117, out_features=2, bias=True
    (embed): EmbeddingBag(368117, 2, mode=sum)
  )
)
ModuleList(
  (0): LinearLayerWrapper(
    (linear): EmbeddingBagLinear(
      in_features=368117, out_features=2, bias=True
      (embed): EmbeddingBag(368117, 2, mode=sum)
    )
  )
)


In [12]:
%%time
# Fit the model on the per-split features and labels. The returned value is
# later turned into a table with model.process_result_dict.
result = model.train(data_dict, outcome_dict)

Epoch 0/19
----------
Phase: train:
 loss: 0.525749,
 auc: 0.568305, auprc: 0.242586, brier: 0.170020,
Phase: val:
 loss: 0.513886,
 auc: 0.615560, auprc: 0.311554, brier: 0.165039,
Best model updated
Epoch 1/19
----------
Phase: train:
 loss: 0.505149,
 auc: 0.650494, auprc: 0.347303, brier: 0.161898,
Phase: val:
 loss: 0.497735,
 auc: 0.671225, auprc: 0.371098, brier: 0.158867,
Best model updated
Epoch 2/19
----------
Phase: train:
 loss: 0.487446,
 auc: 0.701484, auprc: 0.405082, brier: 0.155131,
Phase: val:
 loss: 0.485753,
 auc: 0.704719, auprc: 0.399489, brier: 0.154498,
Best model updated
Epoch 3/19
----------
Phase: train:
 loss: 0.483347,
 auc: 0.729702, auprc: 0.440197, brier: 0.154256,
Phase: val:
 loss: 0.476673,
 auc: 0.729497, auprc: 0.417532, brier: 0.151234,
Best model updated
Epoch 4/19
----------
Phase: train:
 loss: 0.472264,
 auc: 0.747419, auprc: 0.447539, brier: 0.150242,
Phase: val:
 loss: 0.469392,
 auc: 0.742043, auprc: 0.425944, brier: 0.148702,
Best model upd

In [13]:
# Run prediction on the validation and test splits only.
# The element at index 1 of the return value is what gets tabulated later via
# model.process_result_dict.
result_eval = model.predict(data_dict, outcome_dict, phases = ['val', 'test'])

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/labs/shahlab/spfohl/miniconda3/envs/py_env/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-13-577a716d75f1>", line 1, in <module>
    result_eval = model.predict(data_dict, outcome_dict, phases = ['val', 'test'])
  File "/home/spfohl/projects/fairness_cf/pytorch_utils/models.py", line 275, in predict
    outputs = self.model(inputs)
  File "/labs/shahlab/spfohl/miniconda3/envs/py_env/lib/python3.6/site-packages/torch/nn/modules/module.py", line 489, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/spfohl/projects/fairness_cf/pytorch_utils/layers.py", line 229, in forward
    return y_pred
  File "/labs/shahlab/spfohl/miniconda3/envs/py_env/lib/python3.6/site-packages/torch/nn/modules/container.py", line 92, in forward
    input = module(input)
  File "/labs/shahlab/spfohl/miniconda3/envs/py_env/lib/python3.6/

KeyboardInterrupt: 

In [None]:
# Display the raw return value of model.predict for inspection.
result_eval

In [None]:
## Persist the model weights, one checkpoint file per grid element
checkpoint_file = os.path.join(checkpoints_path, '{}.chk'.format(grid_element))
model.save_weights(checkpoint_file)

In [None]:
# Tabulate the training output and the evaluation output.
result_df_training = model.process_result_dict(result)
eval_results = result_eval[1]  # index 1 of predict()'s return value holds the results to tabulate
result_df_eval = model.process_result_dict(eval_results)

# Plain-text dump of both tables, training first.
for result_table in (result_df_training, result_df_eval):
    print(result_table)

In [None]:
## Evaluate the model separately on every group of each sensitive attribute
sensitive_variables = ['race_eth', 'gender', 'age']
splits = list(data_dict.keys())

# Step 1: slice features and outcomes per (sensitive variable, group, split).
data_dict_by_group = {}
outcome_dict_by_group = {}
for sensitive_variable in sensitive_variables:
    features_per_group = {}
    outcomes_per_group = {}
    # Group values are enumerated from the training split only.
    for group in np.unique(label_dict['train'][sensitive_variable]):
        # Boolean membership mask of this group within each split.
        in_group = {
            split: label_dict[split][sensitive_variable] == group
            for split in splits
        }
        features_per_group[group] = {
            split: data_dict[split][in_group[split]] for split in splits
        }
        outcomes_per_group[group] = {
            split: outcome_dict[split][in_group[split]] for split in splits
        }
    data_dict_by_group[sensitive_variable] = features_per_group
    outcome_dict_by_group[sensitive_variable] = outcomes_per_group

# Step 2: predict on val/test for each group and stack the resulting tables,
# keyed first by sensitive variable and then by group.
frames_by_variable = {}
for sensitive_variable, features_per_group in data_dict_by_group.items():
    frames_by_group = {}
    for group, group_features in features_per_group.items():
        group_prediction = model.predict(
            group_features,
            outcome_dict_by_group[sensitive_variable][group],
            phases = ['val', 'test'],
        )
        frames_by_group[group] = model.process_result_dict(group_prediction[1])
    frames_by_variable[sensitive_variable] = pd.concat(frames_by_group)
result_df_by_group = pd.concat(frames_by_variable)

# Step 3: name the index levels, then move the two outer levels into columns.
index_level_names = ['sensitive_variable', 'group', 'index']
result_df_by_group.index = result_df_by_group.index.set_names(index_level_names)
result_df_by_group = result_df_by_group.reset_index(level = [0, 1])
result_df_by_group.head()

In [None]:
# Display the full per-group performance table.
result_df_by_group

In [None]:
# Write each result table to the performance folder, prefixing the file name
# with the grid element; the index is not written.
tables_to_export = (
    ('{}_training', result_df_training),
    ('{}_eval', result_df_eval),
    ('{}_by_group', result_df_by_group),
)
for name_template, table in tables_to_export:
    target_file = os.path.join(performance_path, name_template.format(grid_element))
    table.to_csv(target_file, index = False)