# Imports

In [1]:
import cv2
import matplotlib.pyplot as plt
import os
import pandas as pd
import tensorflow as tf
from tensorflow.io import gfile


#Local Imports 
import config 
import estimators
import data_kaggle as kd
import icetea_feature_extraction as fe
import icetea_data_simulation as ds 
import utils 


# Root data folder on the mounted Google Drive; the folders below are joined onto it.
path_root = '/content/drive/MyDrive/ColabNotebooks/data'

# Currently unused locations (left commented out):
#path_images_png = 'icetea_png/train'
#path_tfrecords = 'icetea_tfr/'

# Sub-folders, relative to path_root.
path_tfrecords_new = 'new_data_small/'  # alternative: 'icetea_newdata/'
path_features = 'icetea_features/'
path_results = 'icetea_kaggle_results/'

# Prefix of images after join (images + simulated t and y).
# Both names are bound to the same prefix string.
prefix_output = 'trainNew'
prefix_trainNew = prefix_output

In [2]:
from google.colab import drive
drive.mount('/content/drive')

# Validate only the folders this notebook actually defines. `path_images_png`
# and `path_tfrecords` are commented out in the configuration cell, so
# referencing them here raised a NameError on a fresh kernel.
relative_paths = [path_tfrecords_new, path_features, path_results]
paths_list = [os.path.join(path_root, path) for path in relative_paths]

# Fail early, naming the missing folder, before any expensive work starts.
for path in paths_list:
  assert os.path.isdir(path), path+': Folder does not exist!'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Causal Inference Model 

Use case

Each row of `list_of_datasets` describes one dataset. `sim_id` is a unique identifier of a dataset, defined by its knob, setting id, and repetition index (a given setting (gamma, beta, alpha, knob) is repeated `b` times).

In [3]:
# Read the table of simulated datasets (one row per dataset) from the features folder.
true_tau_csv = os.path.join(path_root, path_features, 'true_tau.csv')
list_of_datasets = pd.read_csv(true_tau_csv)

# Quick sanity check: table dimensions and the TFRecords folder in use.
print(list_of_datasets.shape, path_tfrecords_new)
list_of_datasets.head()

(300, 9) new_data_small/


Unnamed: 0,setting_id,sim_id,repetition,alpha,tau,setting,gamma,beta,knob
0,ks0,sim_ks0_b0_0.1_0.5_0.5,b0,0.1,-0.226463,0,0.5,0.5,ks
1,ks1,sim_ks1_b0_0.5_0.5_0.5,b0,0.5,-0.242285,1,0.5,0.5,ks
2,ks2,sim_ks2_b0_1_0.5_0.5,b0,1.0,-0.23094,2,0.5,0.5,ks
3,ks3,sim_ks3_b0_2_0.5_0.5,b0,2.0,-0.215248,3,0.5,0.5,ks
4,ks4,sim_ks4_b0_8_0.5_0.5,b0,8.0,-0.235083,4,0.5,0.5,ks


In [4]:
#  IMPORTANT: in param_data and param_method, each key must receive a list [].

# Data-side settings handed to config.MakeParameters.
param_data = {
    'name': ['kagle_retinal'],
    'path_tfrecords': [os.path.join(path_root, path_tfrecords_new)],  # path_tfrecords
    'prefix_train': [prefix_trainNew],
    'image_size': [[256, 256]],
    'batch_size': [8],
}

# Method-side settings handed to config.MakeParameters.
param_method = {
    'name_estimator': ['aipw', 'oahaca'],
    'name_metric': ['mse'],
    'name_base_model': ['resnet50', 'inceptionv3', 'image_regression'],
    'name_prop_score': ['LogisticRegression_NN'],
    'epochs': [2],
    'steps': [2],
    'repetitions': [2],
}

parameters = config.MakeParameters(param_data, param_method)

# Small smoke test: only the first `max_datasets` datasets are run.
# NOTE(review): the original comment said "three datasets", but its `i > 3`
# guard let indices 0-3 through, i.e. four datasets. That count is kept here;
# lower it to 3 if three was the intent.
max_datasets = 4

# Single output location, rewritten after every dataset and once more at the end.
results_path = os.path.join(path_root, path_results, 'experiments_results' + '.csv')

#  Dataframe to keep all results
results_all_datasets = pd.DataFrame()
frames_all_datasets = []

for sim_id in list_of_datasets['sim_id'].head(max_datasets):
  #  Loads the dataset with the appropriate sim_id.
  data = utils.ImageData(seed=sim_id, param_data=parameters.config_data[0])

  #  Data is loaded once and reused by every model configuration in
  #  parameters.config_methods.
  #  The loop variable must NOT be called `config`: that would rebind the
  #  imported `config` module and break config.MakeParameters when this cell
  #  is re-run.
  frames_one_dataset = []
  for method_config in parameters.config_methods:
    # utils.repeat_experiment: (data, setting) x param_method.repetitions
    frames_one_dataset.append(utils.repeat_experiment(data, method_config))
  results_one_dataset = pd.concat(frames_one_dataset)
  results_one_dataset['sim_id'] = sim_id

  #  Combines all datasets processed so far.
  frames_all_datasets.append(results_one_dataset)
  results_all_datasets = pd.concat(frames_all_datasets)
  #  Writes (and overwrites) the output after each dataset, so partial
  #  results survive an interrupted run.
  with gfile.GFile(results_path, 'w') as out:
    out.write(results_all_datasets.to_csv(index=False))

#  Attaches the dataset descriptors from list_of_datasets to every result row
#  (left merge on the columns the two frames share) and writes the final file.
results_all_datasets = pd.merge(results_all_datasets, list_of_datasets, how='left')
with gfile.GFile(results_path, 'w') as out:
  out.write(results_all_datasets.to_csv(index=False))

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5
param_method {'name_estimator': 'aipw', 'name_metric': 'mse', 'name_base_model': 'resnet50', 'name_prop_score': 'LogisticRegression_NN', 'epochs': 2, 'steps': 2, 'repetitions': 2, 'estimator': <function estimator at 0x7f950d8d0830>, 'base_model': <keras.engine.functional.Functional object at 0x7f948c462290>, 'metric': <function mean_squared_error at 0x7f950e7c9560>, 'prop_score': <config._LogisticRegressionNN object at 0x7f948c001a10>}
Epoch 1/2
2/2 - 17s - loss: 7.0191 - mse: 7.0191 - mae: 2.1597 - 17s/epoch - 9s/step
Epoch 2/2
2/2 - 1s - loss: 7.3555 - mse: 7.3555 - mae: 2.4536 - 718ms/epoch - 359ms/step
Epoch 1/2
2/2 - 1s - loss: 4.5753 - mse: 4.5753 - mae: 1.7050 - 536ms/epoch - 268ms/step
E

In [6]:
# Preview the first rows of the results after the merge with list_of_datasets.
results_all_datasets.head()

Unnamed: 0,t_est,mse0,mse1,bias0,bias1,variance,name,seed,method_estimator,method_base_model,...,time,sim_id,setting_id,repetition,alpha,tau,setting,gamma,beta,knob
0,0.9438863,7.355524,9.045016,2.453632,2.469263,0.4354385,kagle_retinal,0,aipw,resnet50,...,90.519919,sim_ks0_b0_0.1_0.5_0.5,ks0,b0,0.1,-0.226463,0,0.5,0.5,ks
1,-0.6949798,0.551264,0.401521,0.586905,0.535468,0.02130879,kagle_retinal,1,aipw,resnet50,...,63.97172,sim_ks0_b0_0.1_0.5_0.5,ks0,b0,0.1,-0.226463,0,0.5,0.5,ks
2,-0.2567938,4.644087,1.038988,1.780988,0.803544,0.1128314,kagle_retinal,0,aipw,inceptionv3,...,71.947107,sim_ks0_b0_0.1_0.5_0.5,ks0,b0,0.1,-0.226463,0,0.5,0.5,ks
3,-0.5399491,0.413047,0.080623,0.446283,0.235398,120.1025,kagle_retinal,1,aipw,inceptionv3,...,64.298203,sim_ks0_b0_0.1_0.5_0.5,ks0,b0,0.1,-0.226463,0,0.5,0.5,ks
4,-3.0736789999999997e+29,52165.203125,35377.816406,206.210907,157.387665,4.537570000000001e+56,kagle_retinal,0,aipw,image_regression,...,63.898355,sim_ks0_b0_0.1_0.5_0.5,ks0,b0,0.1,-0.226463,0,0.5,0.5,ks
