In [1]:
import numpy as np
import os
import torch
import pandas as pd
import yaml

from sklearn.externals import joblib

from pytorch_utils.datasets import ArrayDataset
from pytorch_utils.models import FeedforwardNetModel, FixedWidthModel, BottleneckModel
import pytorch_utils

In [2]:
# Label column to predict; it also names the per-outcome output folders below.
outcome = 'los'

# Root folder that every other path in this notebook hangs off.
data_path = 'data/'


def _under_data(*parts):
    """Join `parts` onto `data_path` using the platform path separator."""
    return os.path.join(data_path, *parts)


# Inputs: features live in a sub-folder named after the index 0.
features_path = _under_data('features', str(0))
label_path = _under_data('labels')
config_path = _under_data('config', 'grid', 'baseline')

# Outputs: one 'scratch/<outcome>' folder each for checkpoints and metrics.
checkpoints_path = _under_data('checkpoints', 'scratch', outcome)
performance_path = _under_data('performance', 'scratch', outcome)

In [3]:
# Make sure both output folders exist before anything is written to them;
# exist_ok=True keeps this cell safe to re-run.
for output_dir in (checkpoints_path, performance_path):
    os.makedirs(output_dir, exist_ok=True)

In [4]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [5]:
# NOTE(security): joblib.load unpickles the file, and unpickling can execute
# arbitrary code. Only point these paths at files from a trusted source.
features_file = os.path.join(features_path, 'features.pkl')
labels_file = os.path.join(label_path, 'label_dict.pkl')

# Both objects are indexed by split name further down in the notebook.
features_dict = joblib.load(features_file)
label_dict = joblib.load(labels_file)

In [6]:
# Identifier for this run; it is formatted into the checkpoint file name and
# the performance CSV file names written later in the notebook.
experiment_name = 1

In [7]:
# Model inputs per split: the 'features' entry of each split in features_dict.
data_dict = {
    split: split_payload['features']
    for split, split_payload in features_dict.items()
}

# Targets per split: the entry for the selected `outcome` in each split's labels.
outcome_dict = {
    split: split_labels[outcome]
    for split, split_labels in label_dict.items()
}

In [None]:
# with open(os.path.join(config_path, '{}.yaml'.format(experiment_name)), 'r') as fp:
#     config_dict = yaml.safe_load(fp)
    
# config_dict['num_epochs'] = 3 # For testing

## A more complex network
# config_dict = {
#     'input_dim' : data_dict['train'].shape[1],
#     'lr' : 1e-5,
#     'num_epochs' : 3,
#     'batch_size' : 256,
#     'hidden_dim' : 128,
#     'num_hidden' : 2,
#     'output_dim' : 2,
#     'drop_prob' : 0.75,
#     'normalize' : True,
#     'iters_per_epoch' : 100,
#     'gamma' : 0.99,
#     'resnet' : True,
#     'sparse' : True,
#     'sparse_mode' : 'binary'
# }
# model = FixedWidthModel(config_dict)

# Hyper-parameters for the bottleneck network. How each key is interpreted is
# decided inside pytorch_utils and is not visible from this notebook.
config_dict = dict(
    # Second dimension of the training feature matrix.
    input_dim=data_dict['train'].shape[1],
    lr=1e-2,
    num_epochs=10,
    batch_size=256,
    # hidden_dim=128,
    bottleneck_size=64,
    num_hidden=2,
    output_dim=2,
    drop_prob=0.75,
    normalize=True,
    iters_per_epoch=100,
    gamma=0.99,
    resnet=True,
    sparse=True,
    sparse_mode='binary',
)
model = BottleneckModel(config_dict)

# config_dict = {
#     'input_dim' : data_dict['train'].shape[1],
#     'lr' : 1e-3,
#     'num_epochs' : 3,
#     'batch_size' : 256,
#     'hidden_dim_list' : [128, 64],
#     'output_dim' : 2,
#     'drop_prob' : 0.75,
#     'normalize' : True,
#     'iters_per_epoch' : 100,
#     'gamma' : 0.99,
#     'resnet' : True,
#     'sparse' : True,
#     'sparse_mode' : 'binary'
# }
# model = FeedforwardNetModel(config_dict)
# Print each direct child of the wrapped network (model.model is presumably a
# torch.nn.Module -- TODO confirm) as a quick check of the architecture.
layers = model.model.children()
for layer in layers:
    print(layer)

In [None]:
%%time
# Train on the per-split feature/label dicts. The returned object is turned
# into a table by model.process_result_dict in a later cell.
result = model.train(data_dict, outcome_dict)

In [None]:
# Run prediction restricted to the 'val' and 'test' phases. Only element [1]
# of the returned value is tabulated later via model.process_result_dict.
result_eval = model.predict(data_dict, outcome_dict, phases = ['val', 'test'])

In [None]:
# Display the raw return value of model.predict for inspection.
result_eval

In [None]:
## Save weights
# The checkpoint file is named after the experiment identifier.
checkpoint_file = os.path.join(checkpoints_path, '{}.chk'.format(experiment_name))
model.save_weights(checkpoint_file)

In [None]:
# Tabulate the training history.
result_df_training = model.process_result_dict(result)

# For evaluation, only element [1] of the predict() output is tabulated.
eval_result_dict = result_eval[1]
result_df_eval = model.process_result_dict(eval_result_dict)

# Print both tables, training first.
for result_table in (result_df_training, result_df_eval):
    print(result_table)

In [None]:
## Get performance by group
sensitive_variables = ['race_eth', 'gender', 'age']


def _split_by_group(arrays_by_split, sensitive_variable, group):
    """Restrict every split of `arrays_by_split` to the rows belonging to `group`.

    Rows are selected per split with the boolean mask
    `label_dict[split][sensitive_variable] == group`.
    """
    return {
        split: arrays_by_split[split][label_dict[split][sensitive_variable] == group]
        for split in data_dict.keys()
    }


def _evaluate_group(sensitive_variable, group):
    """Predict on the 'val' and 'test' phases for one group and tabulate the result."""
    prediction = model.predict(
        data_dict_by_group[sensitive_variable][group],
        outcome_dict_by_group[sensitive_variable][group],
        phases = ['val', 'test'],
    )
    # Only element [1] of the predict() output is passed on for tabulation.
    return model.process_result_dict(prediction[1])


# Nested layout: sensitive variable -> group value -> split -> subset.
data_dict_by_group = {}
outcome_dict_by_group = {}
for sensitive_variable in sensitive_variables:
    # The set of group values is taken from the training split only.
    groups = np.unique(label_dict['train'][sensitive_variable])
    data_dict_by_group[sensitive_variable] = {
        group: _split_by_group(data_dict, sensitive_variable, group)
        for group in groups
    }
    outcome_dict_by_group[sensitive_variable] = {
        group: _split_by_group(outcome_dict, sensitive_variable, group)
        for group in groups
    }

# Evaluate group by group, stacking first over groups and then over variables;
# the dict keys become the two outer index levels of the combined table.
frames_by_variable = {}
for sensitive_variable, group_subsets in data_dict_by_group.items():
    frames_by_variable[sensitive_variable] = pd.concat({
        group: _evaluate_group(sensitive_variable, group)
        for group in group_subsets.keys()
    })
result_df_by_group = pd.concat(frames_by_variable)

# Name the index levels, then turn the two outer ones into ordinary columns.
result_df_by_group.index = result_df_by_group.index.set_names(['sensitive_variable', 'group', 'index'])
result_df_by_group = result_df_by_group.reset_index(level = [0, 1])
result_df_by_group.head()

In [None]:
# Display the full per-group results table (the previous cell showed only its head).
result_df_by_group

In [None]:
# Each table is written under performance_path; the experiment identifier is
# substituted into the file name template. The index is not written.
tables_to_export = (
    ('{}_training', result_df_training),
    ('{}_eval', result_df_eval),
    ('{}_by_group', result_df_by_group),
)
for name_template, table in tables_to_export:
    destination = os.path.join(performance_path, name_template.format(experiment_name))
    table.to_csv(destination, index = False)