# Spring 2021

## Raquel Aoki

Starting the project


Data simulation sources: 
- https://github.com/JiajingZ/CopulaSensitivity
- https://github.com/raquelaoki/ParKCa

In [1]:
import sys 
sys.path.insert(0,'src/')
sys.path.insert(0,'bartpy/') #https://github.com/JakeColtman/bartpy
from data_simulation import *
#from bartpy.sklearnmodel import SklearnModel as bart
#from bartpy.features.featureimportance import feature_importance
#from bartpy.features.featureselection import SelectNullDistributionThreshold, SelectSplitProportionThreshold
#from bartpy.diagnostics.features import *

from bartpy.sklearnmodel import SklearnModel as bart
from bartpy.features.featureselection import SelectNullDistributionThreshold, SelectSplitProportionThreshold
from bartpy.diagnostics.features import *
from bartpy.features.featureimportance import feature_importance
from sklearn.decomposition import PCA



from sklearn.metrics import confusion_matrix,f1_score,roc_curve,roc_auc_score, accuracy_score

def Find_Optimal_Cutoff(target, predicted):
    """ Find the optimal probability cutoff point for a classification model related to event rate
    Parameters
    ----------
    target : Matrix with dependent or target data, where rows are observations

    predicted : Matrix with predicted data, where rows are observations

    Returns
    -------     
    list type, with optimal cutoff value
    https://stackoverflow.com/questions/28719067/roc-curve-and-cut-off-point-python
    """
    fpr, tpr, threshold = roc_curve(target, predicted)
    i = np.arange(len(tpr)) 
    roc = pd.DataFrame({'tf' : pd.Series(tpr-(1-fpr), index=i), 'threshold' : pd.Series(threshold, index=i)})
    roc_t = roc.iloc[(roc.tf-0).abs().argsort()[:1]]

    return list(roc_t['threshold']) 





In [2]:
#sim_data = copula_simulated_data(s = 10)
#tr, u, y_continuous, y_binary = sim_data.get_data()
#effect_true, effect_obs, true_treat_obs_effect_01,true_treat_obs_effect= sim_data.get_true_coefs()
#sim_data.print_equation()


In [2]:
#change the proportion of true causes
gwas_data = gwas_simulated_data(1000, 100, 8, prop_tc = 0.05)
y, tc, X, col = gwas_data.generate_samples()
X = pd.DataFrame(X).sample(frac=1.0).values
#y = y.astype('float')


In [3]:
T = X[:,col]
X1 = np.delete(X,col,1)

print(X3.shape)


Total:  (1000, 100) 
Covariates:  (1000, 95) 
Treatments:  (1000, 5)


In [28]:

pca = PCA(n_components=5)
X3 = np.concatenate((T,pca.fit_transform(X1)),axis = 1)

model = bart(n_samples=1000, n_burn=10, n_trees=25, store_in_sample_predictions=False, n_jobs=1) # Use default parameters
model.fit(X3, y) # Fit the model
y_ = model.predict(X3)
thhold = Find_Optimal_Cutoff(y,y_)
y_01 = [0 if item < thhold else 1 for item in y_]

#random 
prob = sum(y)/len(y)
y_random = [np.random.binomial(1,prob) for item in y_]

print('\n...Training set: F1 - ',f1_score(y,y_01))
print('...Training set: acc - ',accuracy_score(y,y_01))
print('...... confusion matrix: \n',confusion_matrix(y,y_01))

print('\n...Random set: F1 - ',f1_score(y,y_random))
print('...Random set: acc - ',accuracy_score(y,y_random))
print('...... confusion matrix: \n',confusion_matrix(y,y_random))

#...Training set: F1 -  0.5418502202643172 25
#...... confusion matrix:  [347 232 192 229]

 40%|████      | 4/10 [00:00<00:00, 38.14it/s]

Starting burn


100%|██████████| 10/10 [00:00<00:00, 48.22it/s]
  1%|          | 6/1000 [00:00<00:17, 58.18it/s]

Starting sampling


100%|██████████| 1000/1000 [00:15<00:00, 63.33it/s]
100%|██████████| 10/10 [00:00<00:00, 52.97it/s]
  0%|          | 0/1000 [00:00<?, ?it/s]

Starting burn
Starting sampling


100%|██████████| 1000/1000 [00:15<00:00, 63.29it/s]
100%|██████████| 10/10 [00:00<00:00, 55.14it/s]
  0%|          | 0/1000 [00:00<?, ?it/s]

Starting burn
Starting sampling


100%|██████████| 1000/1000 [00:15<00:00, 62.78it/s]
100%|██████████| 10/10 [00:00<00:00, 50.60it/s]
  0%|          | 0/1000 [00:00<?, ?it/s]

Starting burn
Starting sampling


100%|██████████| 1000/1000 [00:15<00:00, 63.31it/s]



...Training set: F1 -  0.5175438596491229
...Training set: acc -  0.56
...... confusion matrix: 
 [[324 255]
 [185 236]]

...Random set: F1 -  0.39999999999999997
...Random set: acc -  0.502
...... confusion matrix: 
 [[336 243]
 [255 166]]


In [21]:
def cate(y_pred, X, tc, col):
    cate_ = []
    for i in range(X.shape[1]): 
        if len(np.unique(X3[:,i]))==2:
            y0 = np.array(y_)[X[:,i]==0]
            y1 = np.array(y_)[X[:,i]==1]
        else: 
            y0 = np.array(y_)[X[:,i]<=np.mean(X3[:,i])]
            y1 = np.array(y_)[X[:,i]>np.mean(X3[:,i])]
        if i in col: 
            cate_.append(y1.mean() - y0.mean())
            print('True Cause', y1.mean() - y0.mean(), tc[i])
        else: 
            print('Covariate', y1.mean() - y0.mean(), 0)
    
    pehe = sum(pow(tc-cate_,2))/len(tc)
    return cate_, pehe

cate_, pehe = cate(y_, X3, tc[col] , range(len(col)))

1

True Cause -0.024846314499687328 0.022801179154954943
True Cause 0.040963765531613905 0.2728206833180276
True Cause 0.0008479026512180354 -0.4867425772556366
True Cause 0.04142869906065205 -0.3465873830592348
True Cause 0.019657191127847418 -0.5741228935505409
Covariate -0.007815411598412147 0
Covariate 0.0016784553189072016 0
Covariate 0.000679953137107725 0
Covariate 0.011210015232946402 0
Covariate 0.007189559216126407 0


array([0., 1.])

In [None]:
X

In [None]:
y