# M3E2

## Author: Raquel Aoki

Date: Spring 2021


In [None]:
!git clone https://github.com/raquelaoki/ParKCa.git
!git clone https://github.com/raquelaoki/CompBioAndSimulated_Datasets.git
#!git clone https://github.com/JakeColtman/bartpy.git

In [10]:
import random
import pandas as pd
import numpy as np
import sys
import yaml
import time
from sklearn.model_selection import train_test_split
import torch

sys.path.insert(0, 'src/')
#sys.path.insert(0, 'bartpy/')  # https://github.com/JakeColtman/bartpy
sys.path.insert(0, 'ParKCa/src/')
#from ParKCa.src.train import *
from CompBioAndSimulated_Datasets.simulated_data_multicause import *
import model_m3e2 as m3e2


def main(config_path, seed_models, seed_data):
    """Run the baselines and M3E2 on one simulated dataset and tabulate results.

    The YAML file at ``config_path`` must hold a top-level ``parameters``
    mapping. ``parameters['data']`` selects the simulator: it must contain
    ``'gwas'`` and/or ``'copula'`` (if both are present both run and the
    copula results are the ones returned).

    Parameters
    ----------
    config_path : str
        Path of the YAML configuration file.
    seed_models : int
        Seed for numpy, ``random``, torch, the train/test split, the data
        loaders and the baselines.
    seed_data : int
        Seed handed to the data simulator.

    Returns
    -------
    output : pandas.DataFrame
        Result of ``organize_output`` with two extra columns, ``seed_data``
        and ``seed_models``.
    name : str
        Suggested csv file name, ``output_<data[0]>_<id>.csv``.

    Raises
    ------
    ValueError
        If ``parameters['data']`` contains neither ``'gwas'`` nor ``'copula'``.
    """
    with open(config_path) as f:
        config = yaml.safe_load(f)
    params = config['parameters']

    # Fail fast: previously this case only printed a message and then crashed
    # further down on the undefined `output`.
    if 'gwas' not in params['data'] and 'copula' not in params['data']:
        raise ValueError(
            "Dataset not recognized. "
            "Change the parameter data in your config.yaml file to gwas or copula.")

    # Fix numpy, random and torch seeds for reproducibility
    np.random.seed(seed_models)
    random.seed(seed_models)
    torch.manual_seed(seed_models)

    if 'gwas' in params['data']:
        params_b = {'DA': {'k': [15]},
                    'CEVAE': {'num_epochs': 100, 'batch': 200, 'z_dim': 10}}

        params["n_treatments"] = trykey(params, 'n_treatments', 5)
        n_causes = params["n_treatments"] + params['n_covariates']
        prop = params["n_treatments"] / n_causes

        sdata_gwas = gwas_simulated_data(prop_tc=prop,
                                         pca_path='CompBioAndSimulated_Datasets/data/tgp_pca2.txt',
                                         seed=seed_data,
                                         n_units=params['n_sample'],
                                         n_causes=n_causes,
                                         true_causes=params["n_treatments"])
        X, y, y01, treatement_columns, treatment_effects, group = sdata_gwas.generate_samples()

        # Baselines split train/test with the same seed as M3E2 below.
        # ['noise'] matches no learner, so it yields an empty baseline table.
        params['baselines'] = trykey(params, 'baselines', False)
        baselines_list = params['baselines_list'] if params['baselines'] else ['noise']
        baselines_results, exp_time, f1_test = baselines(baselines_list, pd.DataFrame(X), y01, params_b,
                                                         TreatCols=treatement_columns, timeit=True,
                                                         seed=seed_models)

        start_time = time.time()
        X_train, X_test, y_train, y_test = train_test_split(X, y01, test_size=0.33, random_state=seed_models)
        print('... Target - proportion of 1s', np.sum(y01) / len(y01))
        # Split X1, X2 on GWAS: case with no clinical variables, X2 = X
        X1_cols = []
        X2_cols = range(X.shape[1] - len(treatement_columns))

        data_nnl = m3e2.data_nn(X_train.values, X_test.values, y_train, y_test, treatement_columns,
                                treatment_effects[treatement_columns], X1_cols, X2_cols)
        # NOTE: the config key really is spelled 'suffle'; it must match the YAML file.
        loader_train, loader_val, loader_test, num_features = data_nnl.loader(params['suffle'], params['batch_size'],
                                                                              seed_models)
        params['pos_weights'] = data_nnl.treat_weights
        params['pos_weight_y'] = trykey(params, 'pos_weight_y', 1)
        params['hidden1'] = trykey(params, 'hidden1', 64)
        params['hidden2'] = trykey(params, 'hidden2', 8)
        cate_m3e2, f1_test_ = m3e2.fit_nn(loader_train, loader_val, loader_test, params, treatement_columns,
                                          num_features,
                                          X1_cols, X2_cols)
        print('... CATE')
        baselines_results['M3E2'] = cate_m3e2
        exp_time['M3E2'] = time.time() - start_time
        f1_test['M3E2'] = f1_test_
        output = organize_output(baselines_results.copy(), treatment_effects[treatement_columns], exp_time, f1_test)

    if 'copula' in params['data']:
        params_b = {'DA': {'k': [5]},
                    'CEVAE': {'num_epochs': 100, 'batch': 200, 'z_dim': 5}}

        sdata_copula = copula_simulated_data(seed=seed_data, n=params['n_sample'], s=params['n_covariates'])
        X, y, y01, treatement_columns, treatment_effects = sdata_copula.generate_samples()

        # Same default as the GWAS branch, so a config without 'baselines' works here too.
        params['baselines'] = trykey(params, 'baselines', False)
        baselines_list = params['baselines_list'] if params['baselines'] else ['noise']
        baselines_results, exp_time, f1_test = baselines(baselines_list, pd.DataFrame(X), y01, params_b,
                                                         TreatCols=treatement_columns, timeit=True,
                                                         seed=seed_models)

        # Was assigned to `start` while `start_time` was read below.
        start_time = time.time()
        X_train, X_test, y_train, y_test = train_test_split(X, y01, test_size=0.33, random_state=seed_models)
        X1_cols = []
        X2_cols = range(X.shape[1] - len(treatement_columns))
        # TODO: add other baselines here to run everything on the same train/testing sets

        data_nnl = m3e2.data_nn(X_train, X_test, y_train, y_test, treatement_columns,
                                treatment_effects, X1_cols, X2_cols)
        loader_train, loader_val, loader_test, num_features = data_nnl.loader(params['suffle'], params['batch_size'],
                                                                              seed_models)
        params['pos_weights'] = data_nnl.treat_weights
        params['pos_weight_y'] = trykey(params, 'pos_weight_y', 1)
        params['hidden1'] = trykey(params, 'hidden1', 6)
        params['hidden2'] = trykey(params, 'hidden2', 6)

        cate_m3e2, f1_test_ = m3e2.fit_nn(loader_train, loader_val, loader_test, params, treatement_columns,
                                          num_features,
                                          X1_cols, X2_cols)
        print('... CATE')
        baselines_results['M3E2'] = cate_m3e2
        exp_time['M3E2'] = time.time() - start_time
        f1_test['M3E2'] = f1_test_
        output = organize_output(baselines_results.copy(), treatment_effects[treatement_columns], exp_time, f1_test)

    # f-string so a numeric `id` in the YAML file does not break the name.
    name = f"output_{params['data'][0]}_{params['id']}.csv"
    output['seed_data'] = seed_data
    output['seed_models'] = seed_models

    return output, name


def trykey(params, key, default):
    """Return ``params[key]``, first storing ``default`` there if the key is absent."""
    return params.setdefault(key, default)


def baselines(BaselinesList, X, y, ParamsList, seed=63, TreatCols=None, id='', timeit=False):
    """Fit the requested baseline learners and tabulate their treatment estimates.

    Parameters
    ----------
    BaselinesList : collection of str
        Learners to run. 'DA', 'BART' and 'CEVAE' are recognized; any other
        entry (e.g. 'noise') selects nothing.
    X : pandas.DataFrame
        Treatments and covariates; columns are addressed by position.
    y : array-like
        0/1 outcome, split together with ``X``.
    ParamsList : dict
        Hyper-parameters keyed by learner name. Missing learners/settings are
        filled in place with defaults. For CEVAE, ``num_epochs`` is accepted
        as an alias when ``epochs`` is absent.
    seed : int
        ``random_state`` of the train/test split.
    TreatCols : list of int, optional
        Positions of the treatment columns; defaults to every column of ``X``.
    id : str
        Tag inserted into the DA column names when several ``k`` are explored.
    timeit : bool
        When true, also return run times and test F1 scores.

    Returns
    -------
    coef_table : pandas.DataFrame
        Column ``causes`` ('T0', 'T1', ...) plus one column per fitted learner.
    times, f1_test : dict
        Only when ``timeit`` is true; both are keyed by the column names of
        ``coef_table`` (and always by 'DA' when DA ran).
    """
    if TreatCols is None:
        TreatCols = list(range(X.shape[1]))

    # BART and CEVAE receive a copy in which every non-binary treatment is
    # dichotomized at its mean.
    X01 = X.copy()
    for col in TreatCols:
        a = X01.iloc[:, col]
        if not ((a == 0) | (a == 1)).all():
            # .values avoids index alignment on assignment
            X01.iloc[:, col] = (a > a.mean()).astype(int).values

    X_train, X_test, y_train, y_test, X_train01, X_test01 = train_test_split(X, y, X01,
                                                                             test_size=0.33, random_state=seed)
    coef_table = pd.DataFrame(columns=['causes'])
    coef_table['causes'] = ['T' + str(i) for i in range(len(TreatCols))]
    times, f1_test = {}, {}

    if 'DA' in BaselinesList:
        start_time = time.time()
        from deconfounder import deconfounder_algorithm as DA
        da_params = ParamsList.setdefault('DA', {})
        # `k` is iterated below, so the default must be a list; a bare scalar
        # from the caller is wrapped as well.
        latent_sizes = da_params.setdefault('k', [15])
        if np.isscalar(latent_sizes):
            latent_sizes = [latent_sizes]
            da_params['k'] = latent_sizes
        class_weight = da_params.setdefault('class_weight', {0: 1, 1: 1})
        for k in latent_sizes:  # if exploring multiple latent sizes
            k_start = time.time()
            if len(latent_sizes) > 1:
                coln = 'DA_' + str(id) + str(k)
            else:
                coln = 'DA'
            model_da = DA(X_train, X_test, y_train, y_test, k, print_=False)
            coef, coef_continuos, roc, f1_da = model_da.fit(class_weight=class_weight)
            coef_table[coln] = coef_continuos[TreatCols]
            # Key F1 and time by the column name so every column of coef_table
            # has a matching entry; keep 'DA' (last k) for existing callers.
            f1_test[coln] = f1_da
            f1_test['DA'] = f1_da
            times[coln] = time.time() - k_start
        # Total DA time across all k, as before.
        times['DA'] = time.time() - start_time
        print('\nDone!')

    if 'BART' in BaselinesList:
        start_time = time.time()
        from bart import BART as BART
        model_bart = BART(X_train01, X_test01, y_train, y_test)
        # Callers may omit the 'BART' entry entirely.
        bart_params = ParamsList.setdefault('BART', {})
        n_trees = bart_params.setdefault('n_trees', 50)
        n_burn = bart_params.setdefault('n_burn', 100)
        model_bart.fit(n_trees=n_trees, n_burn=n_burn, print_=False)
        print('...... predictions')
        coef_table['BART'], f1_test['BART'] = model_bart.cate(TreatCols, print_=False)
        times['BART'] = time.time() - start_time
        print('\nDone!')

    if 'CEVAE' in BaselinesList:
        print('\n\n Learner: CEVAE')
        start_time = time.time()
        from cevae import CEVAE as CEVAE
        print('Note: Treatments should be the first columns of X')
        cevae_params = ParamsList.setdefault('CEVAE', {})
        # Honor 'num_epochs' when 'epochs' is not given; it used to be ignored.
        epochs = cevae_params.setdefault('epochs', cevae_params.get('num_epochs', 100))
        batch = cevae_params.setdefault('batch', 200)
        z_dim = cevae_params.setdefault('z_dim', 5)

        # Column positions by type: all-0/1 columns are binary, the rest continuous.
        confeatures, binfeatures = [], []
        for col in range(X_train01.shape[1]):
            a = X_train01.iloc[:, col]
            if not ((a == 0) | (a == 1)).all():
                confeatures.append(col)
            else:
                binfeatures.append(col)

        print('... length con and bin features', len(confeatures), len(binfeatures))
        model_cevae = CEVAE(X_train01, X_test01, y_train, y_test, TreatCols,
                            binfeats=binfeatures, contfeats=confeatures,
                            epochs=epochs,
                            batch=batch,
                            z_dim=z_dim)
        coef_table['CEVAE'], f1_test['CEVAE'] = model_cevae.fit_all(print_=False)
        times['CEVAE'] = time.time() - start_time
        print('\nDone!')

    if not timeit:
        return coef_table
    return coef_table, times, f1_test


def organize_output(experiments, true_effect, exp_time=None, f1_scores=None):
    """Summarize estimated treatment effects, one row per method.

    Parameters
    ----------
    experiments : pandas.DataFrame
        Column ``causes`` with the treatment names plus one column of
        estimates per method. Not modified.
    true_effect : array-like
        True effect of each treatment, stored as the extra method
        ``TrueTreat``.
    exp_time : dict, optional
        Run time per method name; adds the ``Time(s)`` column. Not modified.
    f1_scores : dict, optional
        Test F1 per method name; adds the ``F1_Test`` column. Not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``Method``, ``MAE`` (mean absolute difference to
        ``TrueTreat``, NaNs ignored), optionally ``F1_Test`` and ``Time(s)``,
        followed by one column per treatment holding that method's estimate.
        The ``TrueTreat`` row gets 0 for MAE, F1 and time.

    Raises
    ------
    KeyError
        If a provided ``exp_time`` / ``f1_scores`` lacks one of the methods.
    """
    # set_index returns a new frame, so the caller's table is left untouched.
    experiments = experiments.set_index('causes')
    experiments['TrueTreat'] = true_effect

    method_names = experiments.columns
    mae = [np.nanmean(np.abs(experiments[col] - experiments['TrueTreat']))
           for col in method_names]
    output = pd.DataFrame({'Method': method_names, 'MAE': mae})

    # Add the TrueTreat placeholder on a copy, and only when the dict was given
    # (the defaults used to crash on `None['TrueTreat'] = 0`).
    if f1_scores is not None:
        f1_lookup = dict(f1_scores, TrueTreat=0)
        output['F1_Test'] = [f1_lookup[m] for m in output['Method'].values]
    if exp_time is not None:
        time_lookup = dict(exp_time, TrueTreat=0)
        output['Time(s)'] = [time_lookup[m] for m in output['Method'].values]

    # Methods become rows and treatments columns; drop the method index so the
    # rows line up positionally with `output`.
    cate_by_method = experiments.T.reset_index(drop=True)

    # axis must be passed by keyword in current pandas.
    return pd.concat((output, cate_by_method), axis=1)


# Run configuration.
# notebook=True : settings come from `arg`, seeds start at 0.
# notebook=False: settings come from the command line, seeds start at 1:
#     argv[1] = config path, argv[2] = number of model seeds,
#     argv[3] = number of data seeds.
colab = False
notebook = True
arg = {'config_path': 'config1.yaml',
       'seed_models': 3,
       'seed_data': 2,
       }
if colab:
    arg['path'] = '/content/'
    arg['config_path'] = arg['path']+arg['config_path']
else:
    arg['path'] = ''

if __name__ == "__main__":
    start_time = time.time()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Cuda Availble:", torch.cuda.is_available(), " device: ", device)

    if notebook:
        config_path = arg['config_path']
        n_models, n_data = arg['seed_models'], arg['seed_data']
        seed_offset = 0
    else:
        config_path = sys.argv[1]
        # argv entries are strings; range() needs integers.
        n_models, n_data = int(sys.argv[2]), int(sys.argv[3])
        seed_offset = 1

    # Collect every run and concatenate once. The command-line branch used to
    # restart `output` at each new data seed, dropping earlier results.
    results = []
    for j in range(n_data):
        print('Data', j)
        for i in range(n_models):
            print('Models', i)
            output_, name = main(config_path=config_path,
                                 seed_models=i + seed_offset,
                                 seed_data=j + seed_offset)
            results.append(output_)
    output = pd.concat(results, axis=0, ignore_index=True)

    output.to_csv(name)
    end_time = time.time() - start_time
    end_time_m = end_time / 60
    end_time_h = end_time_m / 60
    print("Time ------ {} min / {} hours ------".format(end_time_m, end_time_h))


Cuda Availble: True  device:  cuda
Data 0
Models 0
GWAS simulated data initialized!
...  5 true causes and  995  confounders
... Treatments:  5  proportions  [0.049, 0.045, 0.065, 0.131, 0.154]
... Confounders:  995
... Target (y) : 0.378
... Sample Size: 1000
 Data Simulation Done!
... Target - proportion of 1s 0.378
M3E2: Train Shape  (670, 995) (670, 5)
... Model initialization done!
... Training
CHECKING LOSSES BEFORE WEIGHT: 6.710838 0.7795027 0.12966529
CHECKING LOSSES AFTER WEIGHT: 6.7108378410339355 7.795026898384094 1.2966528534889221
......  0  
... Train: loss  17.6 0.13 metric  [0.5       0.5       0.5       0.5       0.5       0.4934663] 
... Val: loss  13.87 metric  [0.5        0.5        0.5        0.5        0.5        0.56499203]
CHECKING LOSSES BEFORE WEIGHT: 6.6081305 0.65776396 0.113591835
CHECKING LOSSES AFTER WEIGHT: 6.60813045501709 6.577639579772949 1.1359183490276337
......  15  
... Train: loss  14.88 0.12 metric  [0.5        0.93543789 0.66348042 0.7762324  0

GWAS simulated data initialized!
...  5 true causes and  995  confounders
... Treatments:  5  proportions  [0.049, 0.045, 0.065, 0.131, 0.154]
... Confounders:  995
... Target (y) : 0.378
... Sample Size: 1000
 Data Simulation Done!
... Target - proportion of 1s 0.378
M3E2: Train Shape  (670, 995) (670, 5)
... Model initialization done!
... Training
CHECKING LOSSES BEFORE WEIGHT: 8.230016 0.94015074 0.11017722
CHECKING LOSSES AFTER WEIGHT: 8.230015754699707 9.401507377624512 1.1017721891403198
......  0  
... Train: loss  17.89 0.13 metric  [0.5 0.5 0.5 0.5 0.5 0.5] 
... Val: loss  14.7 metric  [0.5 0.5 0.5 0.5 0.5 0.5]
CHECKING LOSSES BEFORE WEIGHT: 7.3268895 0.72820914 0.11359765
CHECKING LOSSES AFTER WEIGHT: 7.326889514923096 7.282091379165649 1.1359764635562897
......  15  
... Train: loss  15.88 0.12 metric  [0.57879005 0.5        0.72433061 0.46444883 0.5        0.61796276] 
... Val: loss  14.62 metric  [0.47454844 0.5        0.60183566 0.51412067 0.5        0.47680818]
CHECKING 

Models 1
GWAS simulated data initialized!
...  5 true causes and  995  confounders
... Treatments:  5  proportions  [0.149, 0.14, 0.161, 0.126, 0.049]
... Confounders:  995
... Target (y) : 0.464
... Sample Size: 1000
 Data Simulation Done!
... Target - proportion of 1s 0.464
M3E2: Train Shape  (670, 995) (670, 5)
... Model initialization done!
... Training
CHECKING LOSSES BEFORE WEIGHT: 6.4871016 0.78818715 0.12636222
CHECKING LOSSES AFTER WEIGHT: 6.4871015548706055 7.881871461868286 1.2636221945285797
......  0  
... Train: loss  17.86 0.13 metric  [0.5 0.5 0.5 0.5 0.5 0.5] 
... Val: loss  14.85 metric  [0.5 0.5 0.5 0.5 0.5 0.5]
CHECKING LOSSES BEFORE WEIGHT: 7.132955 0.89931506 0.10866449
CHECKING LOSSES AFTER WEIGHT: 7.132955074310303 8.99315059185028 1.0866449028253555
......  15  
... Train: loss  17.29 0.12 metric  [0.5 0.5 0.5 0.5 0.5 0.5] 
... Val: loss  14.41 metric  [0.5 0.5 0.5 0.5 0.5 0.5]
CHECKING LOSSES BEFORE WEIGHT: 5.023471 0.75472915 0.10834636
CHECKING LOSSES AFTER 

In [None]:
#CEVAE IS KILLING IT

In [11]:
output 

Unnamed: 0,Method,MAE,F1_Test,Time(s),T0,T1,T2,T3,T4,seed_data,seed_models
0,M3E2,0.349089,0.434483,48.06861,0.198221,0.173027,-0.224656,-0.366928,0.500065,0,0
1,TrueTreat,0.0,0.0,0.0,0.441013,0.100039,0.244684,0.560223,0.466889,0,0
2,M3E2,0.365423,0.09375,46.823281,-0.041321,-0.232017,-0.040022,0.091658,0.207436,0,1
3,TrueTreat,0.0,0.0,0.0,0.441013,0.100039,0.244684,0.560223,0.466889,0,1
4,M3E2,0.444873,0.343137,47.496845,0.171636,0.088508,0.009028,-0.295582,-0.385104,0,2
5,TrueTreat,0.0,0.0,0.0,0.441013,0.100039,0.244684,0.560223,0.466889,0,2
6,M3E2,0.166265,0.469055,47.194874,0.264372,0.144836,-0.030677,-0.271953,0.503112,1,0
7,TrueTreat,0.0,0.0,0.0,0.406086,-0.152939,-0.132043,-0.268242,0.216352,1,0
8,M3E2,0.394281,0.435115,46.061867,-0.062907,0.158446,0.152092,0.442938,0.02064,1,1
9,TrueTreat,0.0,0.0,0.0,0.406086,-0.152939,-0.132043,-0.268242,0.216352,1,1


In [12]:
output.groupby(['seed_data','Method']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,MAE,F1_Test,Time(s),T0,T1,T2,T3,T4,seed_models
seed_data,Method,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,M3E2,0.386462,0.290457,47.462912,0.109512,0.009839,-0.085217,-0.190284,0.107466,1
0,TrueTreat,0.0,0.0,0.0,0.441013,0.100039,0.244684,0.560223,0.466889,1
1,M3E2,0.290442,0.471241,46.414378,0.169003,0.195585,0.079486,0.058043,0.081212,1
1,TrueTreat,0.0,0.0,0.0,0.406086,-0.152939,-0.132043,-0.268242,0.216352,1


In [None]:
import torch

# Device sanity check: create a small tensor and move it to the first CUDA
# device when one exists. Without CUDA the tensor stays on the CPU instead of
# raising, in line with the cuda/cpu fallback used by the experiment driver.
a = torch.randn(2, 2)
print('Available devices ', torch.cuda.device_count())
if torch.cuda.is_available():
    print('Current cuda device ', torch.cuda.current_device())
    cuda = torch.device(0)
else:
    cuda = torch.device('cpu')
b = a.to(cuda)
print(a)
print(b)