# M3E2

## Author: Raquel Aoki

Date: Spring 2021


In [3]:
# Fetch the companion repositories used below: 'ParKCa/src/' is added to sys.path and
# the simulated-data generators are imported from CompBioAndSimulated_Datasets.
# Re-running this cell when the checkouts already exist only prints git's
# "destination path ... already exists" message.
!git clone https://github.com/raquelaoki/ParKCa.git
!git clone https://github.com/raquelaoki/CompBioAndSimulated_Datasets.git
#!git clone https://github.com/JakeColtman/bartpy.git

fatal: destination path 'ParKCa' already exists and is not an empty directory.
fatal: destination path 'CompBioAndSimulated_Datasets' already exists and is not an empty directory.


In [1]:
import random
import pandas as pd
import numpy as np
import sys
import yaml
import time
from sklearn.model_selection import train_test_split
import torch

sys.path.insert(0, 'src/')
#sys.path.insert(0, 'bartpy/')  # https://github.com/JakeColtman/bartpy
sys.path.insert(0, 'ParKCa/src/')
#from ParKCa.src.train import *
from CompBioAndSimulated_Datasets.simulated_data_multicause import *
import model_m3e2 as m3e2


def main(config_path, seed_models, seed_data):
    """Run one experiment: simulate a dataset, fit the baselines and M3E2, summarize.

    Parameters
    ----------
    config_path : str
        Path to a YAML file whose top-level 'parameters' section holds the settings
        read below ('data', 'id', 'n_sample', 'n_covariates', 'batch_size', 'suffle', ...).
    seed_models : int
        Seed for numpy / random / torch, the train/test splits and the models.
    seed_data : int
        Seed handed to the data simulator.

    Returns
    -------
    output : pandas.DataFrame
        Result of ``organize_output`` with 'seed_data' and 'seed_models' columns added.
    name : str
        File name built as 'output_<params["data"][0]>_<params["id"]>.csv'.

    Raises
    ------
    ValueError
        If params['data'] contains neither 'gwas' nor 'copula'.
    """
    with open(config_path) as f:
        config = yaml.safe_load(f)
    params = config['parameters']

    # Fail early with a clear message; previously this case printed a note and then
    # crashed later with a NameError because `output` was never created.
    if 'gwas' not in params['data'] and 'copula' not in params['data']:
        raise ValueError(
            "Dataset not recognized. Change the parameter data in your config.yaml file to gwas or copula.")

    # Fix numpy seed for reproducibility
    np.random.seed(seed_models)
    # Fix random seed for reproducibility
    random.seed(seed_models)
    # Fix Torch graph-level seed for reproducibility
    torch.manual_seed(seed_models)

    # Default applied once for both dataset branches (the copula branch used to read
    # params['baselines'] without a default and raised KeyError when it was missing).
    params['baselines'] = trykey(params, 'baselines', False)
    # With baselines disabled, 'noise' matches no learner in `baselines`, so only the
    # empty result containers are created.
    baselines_list = params['baselines_list'] if params['baselines'] else ['noise']

    if 'gwas' in params['data']:

        params_b = {'DA': {'k': [15]},
                    'CEVAE': {'num_epochs': 100, 'batch': 200, 'z_dim': 10}}

        params["n_treatments"] = trykey(params, 'n_treatments', 5)
        prop = params["n_treatments"] / (params["n_treatments"] + params['n_covariates'])

        sdata_gwas = gwas_simulated_data(prop_tc=prop,
                                         pca_path='CompBioAndSimulated_Datasets/data/tgp_pca2.txt',
                                         seed=seed_data,
                                         n_units=params['n_sample'],
                                         n_causes=params["n_treatments"] + params['n_covariates'],
                                         true_causes=params["n_treatments"])
        X, y, y01, treatement_columns, treatment_effects, group = sdata_gwas.generate_samples()
        # Train and Test split use the same seed
        baselines_results, exp_time, f1_test = baselines(baselines_list, pd.DataFrame(X), y01, params_b,
                                                         TreatCols=treatement_columns, timeit=True,
                                                         seed=seed_models)

        start_time = time.time()
        X_train, X_test, y_train, y_test = train_test_split(X, y01, test_size=0.33, random_state=seed_models)
        print('... Target - proportion of 1s', np.sum(y01) / len(y01))
        # Split X1, X2 on GWAS: case with no clinical variables, X2 = X
        X1_cols = []
        X2_cols = range(X.shape[1] - len(treatement_columns))

        data_nnl = m3e2.data_nn(X_train.values, X_test.values, y_train, y_test, treatement_columns,
                                treatment_effects[treatement_columns], X1_cols, X2_cols)
        loader_train, loader_val, loader_test, num_features = data_nnl.loader(params['suffle'], params['batch_size'],
                                                                              seed_models)
        params['pos_weights'] = data_nnl.treat_weights
        params['pos_weight_y'] = trykey(params, 'pos_weight_y', 1)
        params['hidden1'] = trykey(params, 'hidden1', 64)
        params['hidden2'] = trykey(params, 'hidden2', 8)
        cate_m3e2, f1_test_ = m3e2.fit_nn(loader_train, loader_val, loader_test, params, treatement_columns,
                                          num_features,
                                          X1_cols, X2_cols)
        print('... CATE')
        baselines_results['M3E2'] = cate_m3e2
        exp_time['M3E2'] = time.time() - start_time
        f1_test['M3E2'] = f1_test_
        output = organize_output(baselines_results.copy(), treatment_effects[treatement_columns], exp_time, f1_test)
    if 'copula' in params['data']:
        params_b = {'DA': {'k': [5]},
                    'CEVAE': {'num_epochs': 100, 'batch': 200, 'z_dim': 5}}

        sdata_copula = copula_simulated_data(seed=seed_data, n=params['n_sample'], s=params['n_covariates'])
        X, y, y01, treatement_columns, treatment_effects = sdata_copula.generate_samples()

        baselines_results, exp_time, f1_test = baselines(baselines_list, pd.DataFrame(X), y01, params_b,
                                                         TreatCols=treatement_columns, timeit=True,
                                                         seed=seed_models)
        # The timer must be named `start_time`: it is the name subtracted below. The old
        # code stored it in `start`, so the elapsed time was measured from a stale or
        # global `start_time` instead (or raised NameError).
        start_time = time.time()
        X_train, X_test, y_train, y_test = train_test_split(X, y01, test_size=0.33, random_state=seed_models)
        X1_cols = []
        X2_cols = range(X.shape[1] - len(treatement_columns))
        # TODO: add other baselines here to run everything on the same train/testing sets

        data_nnl = m3e2.data_nn(X_train, X_test, y_train, y_test, treatement_columns,
                                treatment_effects, X1_cols, X2_cols)
        loader_train, loader_val, loader_test, num_features = data_nnl.loader(params['suffle'], params['batch_size'],
                                                                              seed_models)
        params['pos_weights'] = data_nnl.treat_weights
        params['pos_weight_y'] = trykey(params, 'pos_weight_y', 1)
        params['hidden1'] = trykey(params, 'hidden1', 6)
        params['hidden2'] = trykey(params, 'hidden2', 6)

        cate_m3e2, f1_test_ = m3e2.fit_nn(loader_train, loader_val, loader_test, params, treatement_columns,
                                          num_features,
                                          X1_cols, X2_cols)
        print('... CATE')
        baselines_results['M3E2'] = cate_m3e2
        exp_time['M3E2'] = time.time() - start_time
        f1_test['M3E2'] = f1_test_
        output = organize_output(baselines_results.copy(), treatment_effects[treatement_columns], exp_time, f1_test)

    name = 'output_' + params['data'][0] + '_' + params['id'] + '.csv'
    output['seed_data'] = seed_data
    output['seed_models'] = seed_models

    return output, name


def trykey(params, key, default):
    """Return ``params[key]``, first storing ``default`` under ``key`` if it is absent.

    The mapping is updated in place, so later lookups of ``key`` see the default too.
    """
    return params.setdefault(key, default)


def baselines(BaselinesList, X, y, ParamsList, seed=63, TreatCols=None, id='', timeit=False):
    """Fit the requested baseline learners and collect their per-treatment estimates.

    Parameters
    ----------
    BaselinesList : container of str
        Learners to run; 'DA', 'BART' and 'CEVAE' are recognised, anything else is ignored.
    X : pandas.DataFrame
        Feature matrix; treatment columns are addressed positionally (``iloc``).
    y : array-like
        Outcome passed to the learners (the caller in this notebook passes the 0/1 outcome).
    ParamsList : dict
        Per-learner settings keyed by learner name. Missing sections and missing
        settings are filled in with defaults, and the dict is updated in place.
    seed : int
        ``random_state`` of the train/test split.
    TreatCols : list of int, optional
        Positions of the treatment columns; defaults to every column of ``X``.
    id : str
        Tag inserted into the DA column names when several latent sizes are explored.
    timeit : bool
        If True, also return the timing and F1 dictionaries.

    Returns
    -------
    coef_table : pandas.DataFrame
        Column 'causes' ('T0', 'T1', ...) plus one column per fitted learner.
    times, f1_test : dict
        Only when ``timeit`` is True: elapsed seconds and test F1 keyed by learner name.
    """

    if TreatCols is None:
        TreatCols = list(range(X.shape[1]))

    # Binarized copy of X: non-binary treatment columns are thresholded at their mean.
    X01 = X.copy()
    for col in TreatCols:
        a = X01.iloc[:, col]
        if not ((a == 0) | (a == 1)).all():
            mean_v = np.mean(a)
            X01.iloc[:, col] = [1 if i > mean_v else 0 for i in a]

    X_train, X_test, y_train, y_test, X_train01, X_test01 = train_test_split(X, y, X01,
                                                                             test_size=0.33, random_state=seed)
    coef_table = pd.DataFrame(columns=['causes'])
    coef_table['causes'] = ['T' + str(i) for i in range(len(TreatCols))]
    times, f1_test = {}, {}

    if 'DA' in BaselinesList:
        start_time = time.time()
        from deconfounder import deconfounder_algorithm as DA
        params_da = trykey(ParamsList, 'DA', {})
        # 'k' is iterated below, so the default must be a list (it used to be the bare
        # int 15, which raised TypeError); a scalar supplied by the caller is wrapped.
        latent_sizes = trykey(params_da, 'k', [15])  # if exploring multiple latent sizes
        if np.isscalar(latent_sizes):
            latent_sizes = [latent_sizes]
            params_da['k'] = latent_sizes
        class_weight = trykey(params_da, 'class_weight', {0: 1, 1: 1})
        for k in latent_sizes:
            if len(latent_sizes) > 1:
                coln = 'DA_' + str(id) + str(k)
            else:
                coln = 'DA'
            model_da = DA(X_train, X_test, y_train, y_test, k, print_=False)
            # f1_test['DA'] keeps the score of the last latent size only.
            coef, coef_continuos, roc, f1_test['DA'] = model_da.fit(class_weight=class_weight)
            coef_table[coln] = coef_continuos[TreatCols]
        times['DA'] = time.time() - start_time
        print('\nDone!')

    if 'BART' in BaselinesList:
        start_time = time.time()
        from bart import BART as BART
        # Tolerate a missing 'BART' section instead of raising KeyError.
        params_bart = trykey(ParamsList, 'BART', {})
        model_bart = BART(X_train01, X_test01, y_train, y_test)
        n_trees = trykey(params_bart, 'n_trees', 50)
        n_burn = trykey(params_bart, 'n_burn', 100)
        model_bart.fit(n_trees=n_trees, n_burn=n_burn, print_=False)
        print('...... predictions')
        coef_table['BART'], f1_test['BART'] = model_bart.cate(TreatCols, print_=False)
        times['BART'] = time.time() - start_time
        print('\nDone!')

    if 'CEVAE' in BaselinesList:
        print('\n\n Learner: CEVAE')
        start_time = time.time()
        from cevae import CEVAE as CEVAE
        print('Note: Treatments should be the first columns of X')
        params_cevae = trykey(ParamsList, 'CEVAE', {})
        # `main` supplies this setting as 'num_epochs'; honour it rather than silently
        # falling back to the default because only 'epochs' was read.
        if 'epochs' not in params_cevae and 'num_epochs' in params_cevae:
            params_cevae['epochs'] = params_cevae['num_epochs']
        epochs = trykey(params_cevae, 'epochs', 100)
        batch = trykey(params_cevae, 'batch', 200)
        z_dim = trykey(params_cevae, 'z_dim', 5)

        # Split column positions into continuous vs. binary (all values in {0, 1}).
        confeatures, binfeatures = [], []
        for col in range(X_train01.shape[1]):
            a = X_train01.iloc[:, col]
            if not ((a == 0) | (a == 1)).all():
                confeatures.append(col)
            else:
                binfeatures.append(col)

        print('... length con and bin features', len(confeatures), len(binfeatures))
        model_cevae = CEVAE(X_train01, X_test01, y_train, y_test, TreatCols,
                            binfeats=binfeatures, contfeats=confeatures,
                            epochs=epochs,
                            batch=batch,
                            z_dim=z_dim)
        coef_table['CEVAE'], f1_test['CEVAE'] = model_cevae.fit_all(print_=False)
        times['CEVAE'] = time.time() - start_time
        print('\nDone!')

    if not timeit:
        return coef_table
    else:
        return coef_table, times, f1_test


def organize_output(experiments, true_effect, exp_time=None, f1_scores=None):
    """Summarize per-method treatment-effect estimates against the true effects.

    Parameters
    ----------
    experiments : pandas.DataFrame
        Column 'causes' with the treatment names plus one column of estimates per method.
        The caller's frame is not modified.
    true_effect : array-like
        True effect of each treatment, in the same row order as ``experiments``.
    exp_time : dict, optional
        Elapsed seconds keyed by method name; adds a 'Time(s)' column when given.
    f1_scores : dict, optional
        Test F1 keyed by method name; adds an 'F1_Test' column when given.

    Returns
    -------
    pandas.DataFrame
        One row per method, plus a final 'TrueTreat' row: 'Method', 'MAE' (mean absolute
        error versus the true effect, NaNs ignored), the optional 'F1_Test' / 'Time(s)'
        columns, then one column per treatment holding that method's estimate.
    """
    treatments = experiments['causes']
    # set_index returns a new frame, so adding 'TrueTreat' below does not leak into the
    # caller's DataFrame.
    experiments = experiments.set_index('causes')
    experiments['TrueTreat'] = true_effect
    method_names = experiments.columns

    mae = [np.nanmean(np.abs(experiments[col] - experiments['TrueTreat'])) for col in method_names]
    output = pd.DataFrame({'Method': method_names, 'MAE': mae})

    # The 'TrueTreat' reference row gets 0 for both metrics. Working on copies keeps the
    # caller's dicts untouched, and doing it inside the None checks makes the documented
    # None defaults usable (they used to raise TypeError).
    if f1_scores is not None:
        scores = {**f1_scores, 'TrueTreat': 0}
        output['F1_Test'] = [scores[m] for m in output['Method'].values]
    if exp_time is not None:
        elapsed = {**exp_time, 'TrueTreat': 0}
        output['Time(s)'] = [elapsed[m] for m in output['Method'].values]

    # One row per method, one column per treatment.
    out = pd.DataFrame(experiments.T, columns=treatments)
    out.reset_index(inplace=True, drop=True)

    # `axis` must be passed by keyword: pandas 2 no longer accepts it positionally.
    return pd.concat((output, out), axis=1)


# --- Run configuration -------------------------------------------------------
# colab:    prefix the config path with the Colab content directory.
# notebook: take the settings from `arg`; otherwise read them from the command line
#           as: <config_path> <number of model seeds> <number of data seeds>.
colab = False
notebook = True
arg = {'config_path': 'config1.yaml',
       'seed_models': 5,  # number of model seeds to loop over
       'seed_data': 3,  # number of data seeds to loop over
       }
if colab:
    arg['path'] = '/content/'
    arg['config_path'] = arg['path']+arg['config_path']
else:
    arg['path'] = ''

if __name__ == "__main__":
    start_time = time.time()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Cuda Availble:", torch.cuda.is_available(), " device: ", device)

    if notebook:
        config_path = arg['config_path']
        n_seeds_models, n_seeds_data = arg['seed_models'], arg['seed_data']
        seed_offset = 0  # notebook runs use seeds 0..n-1
    else:
        config_path = sys.argv[1]
        # argv entries are strings; range() needs ints.
        n_seeds_models, n_seeds_data = int(sys.argv[2]), int(sys.argv[3])
        seed_offset = 1  # command-line runs use seeds 1..n

    # Collect every run and concatenate once. This also fixes the command-line branch,
    # which used to restart `output` at the first model seed of every data seed and so
    # kept only the last data seed's results.
    runs = []
    for j in range(n_seeds_data):
        print('Data', j)
        for i in range(n_seeds_models):
            print('Models', i)
            output_, name = main(config_path=config_path,
                                 seed_models=i + seed_offset, seed_data=j + seed_offset)
            runs.append(output_)
    # `axis` is passed by keyword: pandas 2 no longer accepts it positionally.
    output = pd.concat(runs, axis=0, ignore_index=True)

    output.to_csv(name)
    end_time = time.time() - start_time
    end_time_m = end_time / 60
    end_time_h = end_time_m / 60
    print("Time ------ {} min / {} hours ------".format(end_time_m, end_time_h))


Cuda Availble: True  device:  cuda
Data 0
Models 0
GWAS simulated data initialized!
...  5 true causes and  995  confounders
... Treatments:  5  proportions  [0.0058, 0.0034, 0.0036, 0.0153, 0.0143]
... Confounders:  995
... Target (y) : 0.3481
... Sample Size: 10000
 Data Simulation Done!
Instructions for updating:
non-resource variables are not supported in the long term
Running DA
... Done Holdout
... Done PPCA
... Pass Predictive Check: 0.5383335236320693
... Fitting Outcome Model

Done!
... Target - proportion of 1s 0.3481
M3E2: Train Shape  (6700, 995) (6700, 5)
... Model initialization done!
... Training
......  0  
... Train: loss  20.37 0.12 metric  [0.5        0.5        0.5        0.5        0.5        0.49002999] 
... Val: loss  14.76 metric  [0.5        0.5        0.5        0.50272578 0.5        0.45149724]
......  100  
... Train: loss  5.89 0.11 metric  [0.83163725 0.95313229 0.93847373 0.92556585 0.96140612 0.64519874] 
... Val: loss  22.77 metric  [0.54697443 0.552111

  coef_z = np.divide(coef_m, np.sqrt(coef_var / b))



Done!


 Learner: CEVAE
Available devices  1
Current cuda device  0
Note: Treatments should be the first columns of X
... length con and bin features 0 1000

Done!
... Target - proportion of 1s 0.33
M3E2: Train Shape  (67, 995) (67, 5)
... Model initialization done!
... Training
......  0  
... Train: loss  24.31 0.15 metric  [0.5 0.5 0.5 0.5 0.5 0.5] 
... Val: loss  15.74 metric  [0.5 0.5 nan 0.5 nan 0.5]
......  100  
... Train: loss  12.1 0.06 metric  [0.97638889 0.78298611 1.         0.87053571 0.97619048 0.74444444] 
... Val: loss  16.96 metric  [0.44444444 0.25              nan 0.61111111        nan 0.14285714]
... Final Metrics - Target
...... Train :  0.917
...... Val :  0.167
...... Test :  0.548
Outcome Y [ 0.25857115  0.21552475 -0.16355006 -0.06570905  0.0882514   0.7775242
  0.620648    0.5330294   0.8556335 ]
... CATE
Models 2
GWAS simulated data initialized!
...  5 true causes and  995  confounders
... Treatments:  5  proportions  [0.66, 0.48, 0.5, 1.88, 1.29]
... Confo

  coef_z = np.divide(coef_m, np.sqrt(coef_var / b))



Done!


 Learner: CEVAE
Note: Treatments should be the first columns of X
... length con and bin features 0 1000

Done!
... Target - proportion of 1s 0.33
M3E2: Train Shape  (67, 995) (67, 5)
... Model initialization done!
... Training
......  0  
... Train: loss  25.34 0.14 metric  [0.5 0.5 0.5 0.5 0.5 0.5] 
... Val: loss  14.69 metric  [0.5 nan 0.5 0.5 nan 0.5]


  alpha = loss_batch_treats.cpu().detach().numpy() / loss_batch_target.cpu().detach().numpy()


......  100  
... Train: loss  nan nan metric  [0.5 0.5 0.5 0.5 0.5 0.5] 
... Val: loss  nan metric  [0.5 nan 0.5 0.5 nan 0.5]
... Loading Best validation (epoch  68 )
... Final Metrics - Target
...... Train :  0.875
...... Val :  0.556
...... Test :  0.576
Outcome Y [ 0.84688747  0.21950173  0.52564734  0.41396508  0.7236869  -0.37232062
 -0.12663281  0.2849676  -0.37023973]
... CATE
Models 3
GWAS simulated data initialized!
...  5 true causes and  995  confounders
... Treatments:  5  proportions  [0.66, 0.48, 0.5, 1.88, 1.29]
... Confounders:  995
... Target (y) : 0.33
... Sample Size: 100
 Data Simulation Done!
Running DA
... Done Holdout
... Done PPCA
... Pass Predictive Check: 0.5372884732444249
... Fitting Outcome Model

Done!


 Learner: CEVAE
Note: Treatments should be the first columns of X
... length con and bin features 0 1000

Done!
... Target - proportion of 1s 0.33
M3E2: Train Shape  (67, 995) (67, 5)
... Model initialization done!
... Training
......  0  
... Train: loss

  coef_z = np.divide(coef_m, np.sqrt(coef_var / b))



Done!


 Learner: CEVAE
Note: Treatments should be the first columns of X
... length con and bin features 0 1000

Done!
... Target - proportion of 1s 0.33
M3E2: Train Shape  (67, 995) (67, 5)
... Model initialization done!
... Training
......  0  
... Train: loss  23.77 0.14 metric  [0.5        0.5        0.5        0.5        0.5        0.42757937] 
... Val: loss  16.39 metric  [0.5               nan 0.5        0.5        0.5        0.28333333]
......  100  
... Train: loss  11.78 0.06 metric  [0.74305556 0.98888889 0.52916667 0.96666667 0.96875    1.        ] 
... Val: loss  25.98 metric  [0.85              nan 0.30555556 0.5        0.5        0.38333333]


  alpha = loss_batch_treats.cpu().detach().numpy() / loss_batch_target.cpu().detach().numpy()


... Loading Best validation (epoch  103 )
... Final Metrics - Target
...... Train :  1.0
...... Val :  0.661
...... Test :  0.541
Outcome Y [ 0.34406677  0.45124665  0.84259874  0.38176963  0.46933913  0.8891741
 -0.17720579 -0.5124698   0.9996019 ]
... CATE
Data 1
Models 0
GWAS simulated data initialized!
...  5 true causes and  995  confounders
... Treatments:  5  proportions  [1.33, 1.52, 1.49, 1.43, 0.4]
... Confounders:  995
... Target (y) : 0.46
... Sample Size: 100
 Data Simulation Done!
Running DA
... Done Holdout
... Done PPCA
... Pass Predictive Check: 0.5392058807150415
... Fitting Outcome Model


  coef_z = np.divide(coef_m, np.sqrt(coef_var / b))



Done!


 Learner: CEVAE
Note: Treatments should be the first columns of X
... length con and bin features 0 1000

Done!
... Target - proportion of 1s 0.46
M3E2: Train Shape  (67, 995) (67, 5)
... Model initialization done!
... Training
......  0  
... Train: loss  24.72 0.15 metric  [0.5 0.5 0.5 0.5 0.5 0.5] 
... Val: loss  14.49 metric  [nan 0.5 0.5 0.5 0.5 0.5]


  alpha = loss_batch_treats.cpu().detach().numpy() / loss_batch_target.cpu().detach().numpy()


......  100  
... Train: loss  nan nan metric  [0.5 0.5 0.5 0.5 0.5 0.5] 
... Val: loss  nan metric  [nan 0.5 0.5 0.5 0.5 0.5]
... Loading Best validation (epoch  68 )
... Final Metrics - Target
...... Train :  0.917
...... Val :  0.268
...... Test :  0.591
Outcome Y [-0.09780908  0.01528432  0.75309014 -0.16267556 -0.14813279  0.7604158
 -0.32204288  0.93087125  0.8447084 ]
... CATE
Models 1
GWAS simulated data initialized!
...  5 true causes and  995  confounders
... Treatments:  5  proportions  [1.33, 1.52, 1.49, 1.43, 0.4]
... Confounders:  995
... Target (y) : 0.46
... Sample Size: 100
 Data Simulation Done!
Running DA
... Done Holdout
... Done PPCA
... Pass Predictive Check: 0.5391423957231164
... Fitting Outcome Model


  coef_z = np.divide(coef_m, np.sqrt(coef_var / b))



Done!


 Learner: CEVAE
Note: Treatments should be the first columns of X
... length con and bin features 0 1000

Done!
... Target - proportion of 1s 0.46
M3E2: Train Shape  (67, 995) (67, 5)
... Model initialization done!
... Training
......  0  
... Train: loss  24.16 0.14 metric  [0.5 0.5 0.5 0.5 0.5 0.5] 
... Val: loss  14.19 metric  [0.5 nan nan 0.5 0.5 0.5]
......  100  
... Train: loss  14.05 0.07 metric  [0.95486111 0.6        0.87847222 0.9625     0.95833333 0.70535714] 
... Val: loss  27.32 metric  [0.45              nan        nan 0.35       0.5        0.61666667]
... Loading Best validation (epoch  42 )
... Final Metrics - Target
...... Train :  nan
...... Val :  0.267
...... Test :  0.517
Outcome Y [0.7012444  0.60569566 0.24552627 0.41831723 0.35008743 0.74831754
 0.3972966  0.33638245 0.8625252 ]
... CATE
Models 2
GWAS simulated data initialized!
...  5 true causes and  995  confounders
... Treatments:  5  proportions  [1.33, 1.52, 1.49, 1.43, 0.4]
... Confounders:  995

  coef_z = np.divide(coef_m, np.sqrt(coef_var / b))



Done!


 Learner: CEVAE
Note: Treatments should be the first columns of X
... length con and bin features 0 1000

Done!
... Target - proportion of 1s 0.46
M3E2: Train Shape  (67, 995) (67, 5)
... Model initialization done!
... Training
......  0  
... Train: loss  22.92 0.15 metric  [0.5 0.5 0.5 0.5 0.5 0.5] 
... Val: loss  15.3 metric  [0.5 0.5 0.5 0.5 0.5 0.5]
......  100  
... Train: loss  10.77 0.05 metric  [1.         0.75810185 0.92361111 0.88161376 0.98611111 0.5       ] 
... Val: loss  22.9 metric  [0.5  0.5  0.5  0.35 0.5  0.5 ]
... Loading Best validation (epoch  179 )
... Final Metrics - Target
...... Train :  0.562
...... Val :  0.528
...... Test :  0.145
Outcome Y [ 0.9549008   0.1598577   0.39012355  0.22057512  0.6252206   0.15797989
 -0.09547663  0.46539685 -0.463152  ]
... CATE
Models 3
GWAS simulated data initialized!
...  5 true causes and  995  confounders
... Treatments:  5  proportions  [1.33, 1.52, 1.49, 1.43, 0.4]
... Confounders:  995
... Target (y) : 0.46
...

  coef_z = np.divide(coef_m, np.sqrt(coef_var / b))



Done!


 Learner: CEVAE
Note: Treatments should be the first columns of X
... length con and bin features 0 1000

Done!
... Target - proportion of 1s 0.46
M3E2: Train Shape  (67, 995) (67, 5)
... Model initialization done!
... Training
......  0  
... Train: loss  23.61 0.15 metric  [0.5 0.5 0.5 0.5 0.5 0.5] 
... Val: loss  14.19 metric  [0.5 0.5 0.5 nan 0.5 0.5]
......  100  
... Train: loss  13.38 0.04 metric  [0.80694444 0.65456349 0.85462963 1.         0.98611111 0.7718254 ] 
... Val: loss  16.85 metric  [0.5        0.55555556 0.3               nan 0.5        0.51785714]


  alpha = loss_batch_treats.cpu().detach().numpy() / loss_batch_target.cpu().detach().numpy()


... Loading Best validation (epoch  118 )
... Final Metrics - Target
...... Train :  1.0
...... Val :  0.607
...... Test :  0.47
Outcome Y [ 0.5423956   0.6129687   0.4875173   0.09075603  0.06100298 -0.2595773
  0.31979716  0.8272092   0.21450673]
... CATE
Models 4
GWAS simulated data initialized!
...  5 true causes and  995  confounders
... Treatments:  5  proportions  [1.33, 1.52, 1.49, 1.43, 0.4]
... Confounders:  995
... Target (y) : 0.46
... Sample Size: 100
 Data Simulation Done!
Running DA
... Done Holdout
... Done PPCA
... Pass Predictive Check: 0.5383482764381577
... Fitting Outcome Model


  coef_z = np.divide(coef_m, np.sqrt(coef_var / b))



Done!


 Learner: CEVAE
Note: Treatments should be the first columns of X
... length con and bin features 0 1000

Done!
... Target - proportion of 1s 0.46
M3E2: Train Shape  (67, 995) (67, 5)
... Model initialization done!
... Training
......  0  
... Train: loss  23.1 0.15 metric  [0.5        0.5        0.5        0.5        0.5        0.43115079] 
... Val: loss  17.46 metric  [       nan 0.5        0.5        0.5        0.5        0.46428571]
......  100  
... Train: loss  10.02 0.06 metric  [0.97638889 1.         0.5        0.91832011 1.         0.69722222] 
... Val: loss  36.71 metric  [nan 0.5 0.5 0.5 0.5 0.5]


  alpha = loss_batch_treats.cpu().detach().numpy() / loss_batch_target.cpu().detach().numpy()


... Loading Best validation (epoch  137 )
... Final Metrics - Target
...... Train :  1.0
...... Val :  0.467
...... Test :  0.531
Outcome Y [ 0.18443279  0.59209794  1.1865796   0.3596096   0.2738709   0.5351732
  0.08472682 -0.3953815   1.1208711 ]
... CATE
Data 2
Models 0
GWAS simulated data initialized!
...  5 true causes and  995  confounders
... Treatments:  5  proportions  [1.55, 0.6, 1.47, 0.46, 0.49]
... Confounders:  995
... Target (y) : 0.47
... Sample Size: 100
 Data Simulation Done!
Running DA
... Done Holdout
... Done PPCA
... Pass Predictive Check: 0.5348872097512547
... Fitting Outcome Model


  coef_z = np.divide(coef_m, np.sqrt(coef_var / b))



Done!


 Learner: CEVAE
Note: Treatments should be the first columns of X
... length con and bin features 0 1000

Done!
... Target - proportion of 1s 0.47
M3E2: Train Shape  (67, 995) (67, 5)
... Model initialization done!
... Training
......  0  
... Train: loss  24.55 0.13 metric  [0.5 0.5 0.5 0.5 0.5 0.5] 
... Val: loss  14.57 metric  [0.5 0.5 0.5 0.5 0.5 0.5]


  alpha = loss_batch_treats.cpu().detach().numpy() / loss_batch_target.cpu().detach().numpy()


......  100  
... Train: loss  nan nan metric  [0.5 0.5 0.5 0.5 0.5 0.5] 
... Val: loss  nan metric  [0.5 0.5 0.5 0.5 0.5 0.5]
... Loading Best validation (epoch  77 )
... Final Metrics - Target
...... Train :  1.0
...... Val :  0.267
...... Test :  0.483
Outcome Y [-0.10897332  0.03057417  0.71971935 -0.17652188 -0.02893458  0.7476999
 -0.43414894  0.95000386  0.84550816]
... CATE
Models 1
GWAS simulated data initialized!
...  5 true causes and  995  confounders
... Treatments:  5  proportions  [1.55, 0.6, 1.47, 0.46, 0.49]
... Confounders:  995
... Target (y) : 0.47
... Sample Size: 100
 Data Simulation Done!
Running DA
... Done Holdout
... Done PPCA
... Pass Predictive Check: 0.5339187027742541
... Fitting Outcome Model


  coef_z = np.divide(coef_m, np.sqrt(coef_var / b))



Done!


 Learner: CEVAE
Note: Treatments should be the first columns of X
... length con and bin features 0 1000

Done!
... Target - proportion of 1s 0.47
M3E2: Train Shape  (67, 995) (67, 5)
... Model initialization done!
... Training
......  0  
... Train: loss  21.82 0.13 metric  [0.5 0.5 0.5 0.5 0.5 0.5] 
... Val: loss  17.19 metric  [0.5 0.5 0.5 0.5 0.5 0.5]
......  100  
... Train: loss  11.5 0.06 metric  [1.         0.5        0.72222222 0.95767196 1.         0.77      ] 
... Val: loss  30.51 metric  [0.5        0.5        0.5        0.54166667 0.5        0.51666667]
... Loading Best validation (epoch  34 )
... Final Metrics - Target
...... Train :  0.75
...... Val :  0.646
...... Test :  nan
Outcome Y [0.67597353 0.61923516 0.20102645 0.43625847 0.4171147  0.7311788
 0.43616098 0.2828602  0.81574154]
... CATE
Models 2
GWAS simulated data initialized!
...  5 true causes and  995  confounders
... Treatments:  5  proportions  [1.55, 0.6, 1.47, 0.46, 0.49]
... Confounders:  995
..

  coef_z = np.divide(coef_m, np.sqrt(coef_var / b))



Done!


 Learner: CEVAE
Note: Treatments should be the first columns of X
... length con and bin features 0 1000

Done!
... Target - proportion of 1s 0.47
M3E2: Train Shape  (67, 995) (67, 5)
... Model initialization done!
... Training
......  0  
... Train: loss  22.96 0.13 metric  [0.5 0.5 0.5 0.5 0.5 0.5] 
... Val: loss  15.22 metric  [0.5 0.5 0.5 0.5 0.5 0.5]


  alpha = loss_batch_treats.cpu().detach().numpy() / loss_batch_target.cpu().detach().numpy()


......  100  
... Train: loss  nan nan metric  [0.5 0.5 0.5 0.5 0.5 0.5] 
... Val: loss  nan metric  [0.5 0.5 0.5 0.5 0.5 0.5]
... Loading Best validation (epoch  57 )
... Final Metrics - Target
...... Train :  0.75
...... Val :  0.528
...... Test :  0.47
Outcome Y [ 0.79440707  0.30277696  0.7722678   0.3625841   0.9212778   0.12681212
  0.1514689   0.06685656 -0.31714943]
... CATE
Models 3
GWAS simulated data initialized!
...  5 true causes and  995  confounders
... Treatments:  5  proportions  [1.55, 0.6, 1.47, 0.46, 0.49]
... Confounders:  995
... Target (y) : 0.47
... Sample Size: 100
 Data Simulation Done!
Running DA
... Done Holdout
... Done PPCA
... Pass Predictive Check: 0.5343775853510562
... Fitting Outcome Model


  coef_z = np.divide(coef_m, np.sqrt(coef_var / b))



Done!


 Learner: CEVAE
Note: Treatments should be the first columns of X
... length con and bin features 0 1000

Done!
... Target - proportion of 1s 0.47
M3E2: Train Shape  (67, 995) (67, 5)
... Model initialization done!
... Training
......  0  
... Train: loss  23.47 0.14 metric  [0.5 0.5 0.5 0.5 0.5 0.5] 
... Val: loss  14.98 metric  [0.5 0.5 0.5 0.5 nan 0.5]


  alpha = loss_batch_treats.cpu().detach().numpy() / loss_batch_target.cpu().detach().numpy()


......  100  
... Train: loss  nan nan metric  [0.5 0.5 0.5 0.5 0.5 0.5] 
... Val: loss  nan metric  [0.5 0.5 0.5 0.5 nan 0.5]
... Loading Best validation (epoch  43 )
... Final Metrics - Target
...... Train :  0.833
...... Val :  0.7
...... Test :  nan
Outcome Y [0.78763735 0.7776691  0.61727524 0.2918982  0.380773   0.26616123
 0.04834126 0.6533435  0.26228935]
... CATE
Models 4
GWAS simulated data initialized!
...  5 true causes and  995  confounders
... Treatments:  5  proportions  [1.55, 0.6, 1.47, 0.46, 0.49]
... Confounders:  995
... Target (y) : 0.47
... Sample Size: 100
 Data Simulation Done!
Running DA
... Done Holdout
... Done PPCA
... Pass Predictive Check: 0.5349611163201147
... Fitting Outcome Model


  coef_z = np.divide(coef_m, np.sqrt(coef_var / b))



Done!


 Learner: CEVAE
Note: Treatments should be the first columns of X
... length con and bin features 0 1000

Done!
... Target - proportion of 1s 0.47
M3E2: Train Shape  (67, 995) (67, 5)
... Model initialization done!
... Training
......  0  
... Train: loss  23.17 0.13 metric  [0.5        0.5        0.5        0.5        0.5        0.61894841] 
... Val: loss  15.28 metric  [0.5        0.5               nan 0.5        0.5        0.53571429]


  alpha = loss_batch_treats.cpu().detach().numpy() / loss_batch_target.cpu().detach().numpy()


......  100  
... Train: loss  nan nan metric  [0.5 0.5 0.5 0.5 0.5 0.5] 
... Val: loss  nan metric  [0.5 0.5 nan 0.5 0.5 0.5]
... Loading Best validation (epoch  28 )
... Final Metrics - Target
...... Train :  0.31
...... Val :  0.722
...... Test :  0.45
Outcome Y [0.47158942 0.7450585  0.9365874  0.67255044 0.64072895 0.43602413
 0.1825607  0.14941107 0.96457034]
... CATE
Time ------ 18.77427215973536 min / 0.3129045359955894 hours ------


In [None]:
#CEVAE IS KILLING IT

In [2]:
# Display the combined results table: one row per method for every (seed_data, seed_models) run.
output

Unnamed: 0,Method,MAE,F1_Test,Time(s),T0,T1,T2,T3,T4,seed_data,seed_models
0,DA,0.432298,0.008757,302.538863,-0.135996,0.057605,-0.086481,-0.137129,-0.04664,0,0
1,M3E2,0.588978,0.36831,182.667993,-0.463414,0.3834,0.336178,-0.754013,0.115517,0,0
2,TrueTreat,0.0,0.0,0.0,0.441013,0.100039,0.244684,0.560223,0.466889,0,0
3,DA,0.479307,0.0,6.907268,-0.138414,-0.148438,-0.0046,-0.322466,0.030232,0,1
4,CEVAE,0.523504,0.354776,11.40629,-0.142186,-0.376907,0.016633,-0.298752,-0.003456,0,1
5,M3E2,0.342146,0.380952,27.226099,0.258571,0.215525,-0.16355,-0.065709,0.088251,0,1
6,TrueTreat,0.0,0.0,0.0,0.441013,0.100039,0.244684,0.560223,0.466889,0,1
7,DA,0.428296,0.47619,6.888356,-0.053971,-0.143406,-0.006181,-0.146413,0.021341,0,2
8,CEVAE,0.47013,0.446743,11.219191,-0.027217,-0.376016,0.040967,-0.219545,0.044008,0,2
9,M3E2,0.241871,0.514286,27.523681,0.846887,0.219502,0.525647,0.413965,0.723687,0,2


In [3]:
# Average every metric over the model seeds, separately for each data seed and method.
output.groupby(['seed_data','Method']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,MAE,F1_Test,Time(s),T0,T1,T2,T3,T4,seed_models
seed_data,Method,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,CEVAE,0.497913,0.429529,11.145766,-0.021853,-0.363926,0.001769,-0.302155,0.009449,2.5
0,DA,0.445874,0.125561,65.841785,-0.062878,-0.103159,-0.040215,-0.229229,0.018963,2.0
0,M3E2,0.346664,0.335376,58.39727,0.315079,0.370299,0.376289,-0.003244,0.313483,2.0
0,TrueTreat,0.0,0.0,0.0,0.441013,0.100039,0.244684,0.560223,0.466889,2.0
1,CEVAE,0.304806,0.451771,11.07433,-0.046605,-0.022529,0.191743,-0.02457,-0.157119,2.0
1,DA,0.273532,0.302944,6.476975,-0.004463,-0.022075,0.123746,-0.021026,-0.106889,2.0
1,M3E2,0.462689,0.499799,27.795797,0.457033,0.397181,0.612567,0.185316,0.23241,2.0
1,TrueTreat,0.0,0.0,0.0,0.406086,-0.152939,-0.132043,-0.268242,0.216352,2.0
2,CEVAE,0.457083,0.40993,11.235518,-0.234662,-0.017276,0.235767,-0.193391,0.284392,2.0
2,DA,0.400401,0.383586,6.372289,-0.173463,-0.041432,0.083955,-0.130795,0.276253,2.0


In [None]:
import torch

# CUDA sanity check: create a small CPU tensor and, when a GPU is present, copy it there.
# The CUDA-only calls are guarded so the cell also runs on CPU-only machines, where
# torch.cuda.current_device() and Tensor.cuda() would otherwise raise.
a = torch.randn(2, 2)
print('Available devices ', torch.cuda.device_count())
if torch.cuda.is_available():
    print('Current cuda device ', torch.cuda.current_device())
    cuda = torch.device(0)
    b = a.cuda()
else:
    print('CUDA not available; keeping the tensor on the CPU')
    cuda = torch.device('cpu')
    b = a
print(a)
print(b)