In [1]:
import numpy as np
import os
import torch
import pandas as pd
import yaml

from sklearn.externals import joblib

from pytorch_utils.cfvae_models import CFVAEModel

In [2]:
# --- Experiment selection -------------------------------------------------
# `outcome` and `sensitive_variable` are used further down as keys into the
# label dictionary, and also determine which directories are read/written.
outcome = 'los'
sensitive_variable = 'gender'
data_path = 'data/'

# --- Input locations ------------------------------------------------------
features_path = os.path.join(
    data_path,
    'features',
    str(0),
    '{}_excluded'.format(sensitive_variable),
)
label_path = os.path.join(data_path, 'labels')
config_path = os.path.join(data_path, 'config', 'grid', 'baseline')

# --- Output locations (one sub-directory per outcome) -----------------------
checkpoints_path, performance_path = (
    os.path.join(data_path, artifact_kind, 'scratch', outcome)
    for artifact_kind in ('checkpoints', 'performance')
)

In [3]:
# Make sure both output directories exist; exist_ok=True keeps this cell
# safe to re-run when they are already present.
for output_dir in (checkpoints_path, performance_path):
    os.makedirs(output_dir, exist_ok=True)

In [4]:
# Seed the global NumPy and PyTorch generators so that Restart & Run All
# gives repeatable training numbers; without this every run differs.
# NOTE(review): CFVAEModel may draw from other random sources as well —
# confirm that seeding these two is sufficient for full reproducibility.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
# Resolve the two serialized inputs first, then deserialize them with joblib.
_features_file = os.path.join(features_path, 'features.pkl')
_labels_file = os.path.join(label_path, 'label_dict.pkl')

features_dict = joblib.load(_features_file)
master_label_dict = joblib.load(_labels_file)

In [6]:
# Build three dictionaries that share the same split keys:
#   data_dict  -> the 'features' entry of each split in features_dict
#   label_dict -> the column/entry selected by `outcome`
#   group_dict -> the column/entry selected by `sensitive_variable`
# NOTE(review): assumes features_dict and master_label_dict are plain
# mappings keyed by split name — confirm against the code that wrote them.
data_dict = {
    split: split_features['features']
    for split, split_features in features_dict.items()
}
label_dict = {
    split: split_labels[outcome]
    for split, split_labels in master_label_dict.items()
}
group_dict = {
    split: split_labels[sensitive_variable]
    for split, split_labels in master_label_dict.items()
}

In [7]:
# Read the '<sensitive_variable>_map.csv' file stored next to the labels.
group_map_file = os.path.join(label_path, '{}_map.csv'.format(sensitive_variable))
group_map = pd.read_csv(group_map_file)

In [8]:
# # with open(os.path.join(config_path, '{}.yaml'.format(grid_element)), 'r') as fp:
# #     config_dict = yaml.load(fp)
    
# CFVAE
# config_dict = {
#     # Standard parameters
#     'input_dim' : data_dict['train'].shape[1],
#     'num_groups' : len(np.unique(group_dict['train'])),
#     'lr' : 1e-3,
#     'lr_final_classifier' : 1e-3,
#     'gamma' : 0.99,
#     'num_epochs' : 1,
#     'iters_per_epoch' : 100,
#     'output_dim' : 2,
#     'batch_size' : 256,
#     'sparse' : True,
#     'sparse_mode' : 'binary',
    
#     # Parameters corresponding to the size of the VAE
#     'group_embed_dim' : 64,
#     'latent_dim' : 64,
#     'num_hidden' : 2,
#     'drop_prob' : 0.0,
#     'resnet' : False,
#     'normalize' : False,
    
#     # Parameters corresponding to the size of classifier
#     'hidden_dim_classifier' : 128,
#     'num_hidden_classifier' : 1,
#     'drop_prob_classifier' : 0.0,
#     'resnet_classifier' : False,
#     'normalize_classifier' : False,

#     # Lambda
#     'lambda_reconstruction' : 1e3,
#     'lambda_mmd' : 1e4,
#     'lambda_kl' : 0.0,
#     'lambda_classification' : 1e1,
#     'lambda_mmd_group' : 1e3
# }


# Active configuration handed to CFVAEModel. The keys and values are the same
# as a dict literal would give; they are written as keyword arguments so each
# section reads as a flat list of settings.
config_dict = dict(
    # Standard parameters
    # input_dim / num_groups are derived from the training split.
    input_dim=data_dict['train'].shape[1],
    num_groups=len(np.unique(group_dict['train'])),
    lr=1e-3,
    lr_final_classifier=1e-3,
    gamma=0.99,
    num_epochs=1,
    iters_per_epoch=100,
    output_dim=2,
    batch_size=256,
    sparse=True,
    sparse_mode='binary',

    # Parameters corresponding to the size of the VAE
    group_embed_dim=4,
    latent_dim=4,
    num_hidden=1,
    drop_prob=0.0,
    resnet=False,
    normalize=False,

    # Parameters corresponding to the size of classifier
    hidden_dim_classifier=32,
    num_hidden_classifier=1,
    drop_prob_classifier=0.0,
    resnet_classifier=False,
    normalize_classifier=False,

    # Lambda (weights on the individual loss terms)
    lambda_reconstruction=1e3,
    lambda_mmd=1e4,
    lambda_kl=0.0,
    lambda_classification=1e1,
    lambda_mmd_group=1e3,
)

In [9]:
if sensitive_variable == 'gender':
    # Keep only rows whose group code is below 2, in every split.
    # The masks are taken from group_dict before it is itself filtered, and
    # the same mask is applied to features, labels and groups so the three
    # stay row-aligned.
    # NOTE(review): config_dict['num_groups'] is computed in an earlier cell
    # from the unfiltered groups — confirm it still matches after this filter.
    keep_mask = {split: codes < 2 for split, codes in group_dict.items()}
    data_dict = {split: rows[keep_mask[split]] for split, rows in data_dict.items()}
    label_dict = {split: labels[keep_mask[split]] for split, labels in label_dict.items()}
    group_dict = {split: codes[keep_mask[split]] for split, codes in group_dict.items()}

In [10]:
# Instantiate the model from the configuration dictionary.
model = CFVAEModel(config_dict)

# Print every direct child of model.model, one per print call, to inspect
# the architecture that was built.
for submodule in model.model.children():
    print(submodule)

VAEEncoder(
  (encoder): FeedforwardNet(
    (layers): ModuleList(
      (0): HiddenLinearLayer(
        (linear): LinearLayerWrapper(
          (linear): EmbeddingBagLinear(
            in_features=368117, out_features=8, bias=True
            (embed): EmbeddingBag(368117, 8, mode=sum)
          )
        )
        (dropout): Dropout(p=0.0)
      )
      (1): Linear(in_features=8, out_features=8, bias=True)
    )
  )
  (reparameterization_layer): ReparameterizationLayer()
)
ConditionalDecoder(
  (decoder): FeedforwardNet(
    (layers): ModuleList(
      (0): HiddenLinearLayer(
        (linear): LinearLayerWrapper(
          (linear): Linear(in_features=8, out_features=16, bias=True)
        )
        (dropout): Dropout(p=0.0)
      )
      (1): Linear(in_features=16, out_features=368115, bias=True)
    )
  )
  (conditional_layer): Embedding(2, 4)
)
ConditionalDecoder(
  (decoder): FeedforwardNet(
    (layers): ModuleList(
      (0): HiddenLinearLayer(
        (linear): LinearLayerWrap

In [11]:
%%time
# Train the model on the feature, label and group dictionaries (timed by the
# %%time cell magic). The return value is kept in `result` so it can be
# turned into a metrics table with model.process_result_dict.
result = model.train(data_dict, label_dict, group_dict)

Epoch 0/0
----------
Phase: train:
 loss: 612.339494, elbo: 2.096533, mmd: 0.005290, reconstruction: 0.540204, kl: 0.964292, classification: 0.592037, mmd_group: 0.013317,
 auc: 0.506715, auprc: 0.209042, brier: 0.199878,
Phase: val:
 loss: 395.837189, elbo: 2.342093, mmd: 0.003425, reconstruction: 0.346317, kl: 1.486957, classification: 0.508819, mmd_group: 0.010183,
 auc: 0.549555, auprc: 0.234686, brier: 0.162843,
Best model updated
Best val performance: 395.837189
CPU times: user 30.9 s, sys: 14.5 s, total: 45.4 s
Wall time: 44.3 s


In [12]:
# Run model.predict restricted to the 'val' and 'test' phases.
# The return value is indexable; element [1] is what gets passed to
# model.process_result_dict when the evaluation table is rendered.
result_eval = model.predict(data_dict, label_dict, group_dict, phases = ['val', 'test'])

In [13]:
# result_final_classifier = model.train(data_dict, label_dict, group_dict)

In [14]:
# Display the training result as a table; the saved output has the columns
# metric / phase / epoch / performance.
model.process_result_dict(result)

Unnamed: 0,metric,phase,epoch,performance
0,auc,train,0,0.506715
1,auprc,train,0,0.209042
2,brier,train,0,0.199878
3,classification,train,0,0.592037
4,elbo,train,0,2.096533
5,kl,train,0,0.964292
6,loss,train,0,612.339494
7,mmd,train,0,0.00529
8,mmd_group,train,0,0.013317
9,reconstruction,train,0,0.540204


In [16]:
# Display the same kind of table for the prediction run, using the second
# element returned by model.predict.
# NOTE(review): presumably result_eval[1] is the per-metric result dict —
# confirm what result_eval[0] holds.
model.process_result_dict(result_eval[1])

Unnamed: 0,metric,phase,epoch,performance
0,auc,val,0,0.556166
1,auprc,val,0,0.237191
2,brier,val,0,0.162322
3,classification,val,0,0.507314
4,elbo,val,0,2.341147
5,kl,val,0,1.486957
6,loss,val,0,398.92854
7,mmd,val,0,0.003626
8,mmd_group,val,0,0.010718
9,reconstruction,val,0,0.346877
