In [1]:
import json
import os
import random

import numpy as np
import pandas as pd
import torch
import yaml
from sklearn.model_selection import ParameterGrid

# scikit-learn 0.23 removed the vendored ``sklearn.externals.joblib``; keep the
# original import where it still exists and fall back to the standalone package.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib

In [2]:
# Fix the seeds of both global random number generators used by this notebook
# (Python's `random` module and NumPy's legacy global RNG).
PY_RANDOM_SEED = 234
NP_RANDOM_SEED = 5432

np.random.seed(NP_RANDOM_SEED)
random.seed(PY_RANDOM_SEED)

In [3]:
# Root of the on-disk data and the two sub-directories read below.
data_path = './data/'
label_path = os.path.join(data_path, 'labels')
features_path = os.path.join(data_path, 'features', str(0))

# Load the pickled feature and label dictionaries with joblib.
features_file = os.path.join(features_path, 'features.pkl')
labels_file = os.path.join(label_path, 'label_dict.pkl')
features_dict = joblib.load(features_file)
master_label_dict = joblib.load(labels_file)

# Keep only the 'features' entry of every split found in the feature dictionary.
data_dict = {
    split: split_payload['features']
    for split, split_payload in features_dict.items()
}

In [4]:
# Number of configurations kept from the shuffled parameter grid
# (the grid list is truncated to its first `grid_size` entries).
grid_size = 100

In [5]:
def yaml_write(x, path):
    """Serialise ``x`` as YAML into the file at ``path``.

    The file is opened in text write mode, so any existing content is
    replaced.
    """
    with open(path, 'w') as sink:
        yaml.dump(x, stream=sink)
        
def yaml_read(path):
    """Parse the YAML file at ``path`` and return the resulting Python object.

    Uses ``yaml.safe_load``: calling ``yaml.load`` without an explicit
    ``Loader`` is deprecated since PyYAML 5.1 and raises ``TypeError`` in
    PyYAML 6. The safe loader handles the plain scalars, lists and dicts
    written by this notebook's configs; documents containing python-specific
    tags (e.g. ``!!python/tuple``) are rejected with a ``yaml.YAMLError``.
    """
    with open(path, 'r') as fp:
        return yaml.safe_load(fp)

In [6]:
# Standard parameters
# ('num_groups' is not part of the grid; it is filled in per sensitive
# variable when the configuration files are written.)
standard_params = {
    'input_dim': [data_dict['train'].shape[1]],
    'lr': [1e-2, 1e-3, 1e-4, 1e-5],
    'lr_final_classifier': [1e-3],
    'gamma': [None, 0.99, 0.95],
    'num_epochs': [30],
    'iters_per_epoch': [100],
    'output_dim': [2],
    'batch_size': [64, 128, 256, 512],
    'sparse': [True],
    'sparse_mode': ['binary'],
}

# Parameters corresponding to the size of the VAE
vae_params = {
    'group_embed_dim': [16, 32, 64, 128],
    'latent_dim': [16, 32, 64, 128],
    'num_hidden': [1, 2, 3],
    'drop_prob': [0.0, 0.25, 0.5, 0.75],
    'resnet': [False],
    'normalize': [False],
}

# Parameters corresponding to the size of classifier
classifier_params = {
    'hidden_dim_classifier': [128, 256, 512],
    'num_hidden_classifier': [1, 2, 3],
    'drop_prob_classifier': [0.0, 0.25, 0.5, 0.75],
    'resnet_classifier': [False],
    'normalize_classifier': [False, True],
}

# Lambda
lambda_params = {
    'lambda_reconstruction': [1e3],
    'lambda_mmd': [1e4],
    'lambda_kl': [0.0],
    'lambda_classification': [1e1],
    'lambda_mmd_group': [1e3],
}

# Single search space: every key maps to the list of values to try.
param_grid = {}
for section in (standard_params, vae_params, classifier_params, lambda_params):
    param_grid.update(section)

# Enumerate every combination, shuffle the list in place with NumPy's global
# RNG, then keep only the first `grid_size` configurations.
the_grid = [combo for combo in ParameterGrid(param_grid)]
np.random.shuffle(the_grid)
del the_grid[grid_size:]


In [7]:
# For every sensitive variable, write the sampled grid to
# <data_path>/config/grid/cfvae/<sensitive_variable>/ as one YAML file per
# configuration plus a `config.csv` summary indexed by configuration id.
for sensitive_variable in ['age', 'race_eth', 'gender']:

    config_path = os.path.join(data_path, 'config', 'grid', 'cfvae', sensitive_variable)
    os.makedirs(config_path, exist_ok = True)

    # Group labels of this sensitive variable for every split; the number of
    # groups is the count of distinct values in the training split.
    group_dict = {split : master_label_dict[split][sensitive_variable] for split in master_label_dict.keys()}
    num_groups = len(np.unique(group_dict['train']))

    # Build per-variable copies carrying this variable's `num_groups` BEFORE
    # anything is written. Previously the CSV was created from the shared
    # dicts ahead of the update, so it lacked `num_groups` for the first
    # variable and recorded the previous variable's value for the others.
    # Copying also leaves the shared `the_grid` entries unmodified.
    variable_configs = [dict(base_config, num_groups=num_groups) for base_config in the_grid]

    grid_df = pd.DataFrame(variable_configs)
    grid_df.to_csv(os.path.join(config_path, 'config.csv'), index_label = 'id')

    # The file name `<i>.yaml` matches the `id` column of config.csv.
    for i, config_dict in enumerate(variable_configs):
        yaml_write(config_dict, os.path.join(config_path, '{}.yaml'.format(i)))