# Spring 2021

## Raquel Aoki

Starting the project


Source: https://github.com/JiajingZ/CopulaSensitivity

Comments: 
- Section 7.1: the data is in R. 
- Section 7.2 is the GWAS study from ParKCa and the Deconfounder. The generated data differs from the data in the Blei paper. 


In [3]:
#7.2 GWAS simulated study // sparse effects setting
#https://github.com/raquelaoki/ParKCa/blob/master/src/datapreprocessing.py

def sim_genes_TGP(Fs, ps, n_hapmapgenes, n_causes, n_units, S, D, randseed):
    '''
    #Adapted from Deconfounder's authors
    generate the simulated data: a binary SNP matrix plus one latent
    group label per unit
    input:
        - Fs, ps, n_hapmapgenes, D: kept for signature compatibility,
          never read in this function
        - n_causes = integer, number of simulated SNPs (columns of G)
        - n_units = integer, number of simulated units (rows of G)
        - S: PCA output n x 2
        - randseed: value handed to np.random.seed
    output:
        - G: n_units x n_causes array of 0/1 draws
        - lambdas: KMeans (3 clusters) label of each unit
    '''
    # NOTE(review): seeding np.random only makes the npr.* draws below
    # reproducible if npr is numpy.random - confirm against the file's imports.
    np.random.seed(randseed)

    S = expit(S)
    # One row of mixing weights per SNP: two random weights (for the two
    # columns of S) and a constant 0.05 for the column of ones added below.
    Gammamat = np.zeros((n_causes, 3))
    Gammamat[:, 0] = 0.2*npr.uniform(size=n_causes) #0.45
    Gammamat[:, 1] = 0.2*npr.uniform(size=n_causes) #0.45
    Gammamat[:, 2] = 0.05*np.ones(n_causes)

    # Resample n_units rows of S with replacement and append a column of ones.
    rows = npr.choice(S.shape[0], size=n_units, replace=True)
    S = np.column_stack((S[rows], np.ones(n_units)))

    # Per-unit, per-SNP success probability.
    F = S.dot(Gammamat.T)
    #it was 2 instead of 1: goal is make SNPs binary
    G = npr.binomial(1, F)
    #unobserved group
    lambdas = KMeans(n_clusters=3, random_state=123).fit(S).labels_
    # The former `sG = sparse.csr_matrix(G)` was never used and has been
    # dropped; it only cost memory and required `sparse` to be in scope.
    return G, lambdas


def generate_samples(SIMULATIONS,n_units,n_causes):
    '''
    Produce SIMULATIONS simulated SNP datasets and write them to disk.

    Input:
    SIMULATIONS: number of datasets to be produced
    n_units, n_causes: dimensions forwarded to sim_genes_TGP and sim_dataset
    Output (pickle format, written under data_s//):
    snp_simulated_<sim>.txt: one file per simulated dataset
    snp_simulated_y01.txt / snp_simulated_truecauses.txt: the simulated
    outcome and the true causes of every dataset, gathered in a single
    table each with one 'sim_<i>' column per dataset
    Note: There are options to load the data from vcf format and run the pca
    Due to running time, we save the files and load from the pca.txt file
    '''
    #ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/supporting/hd_genotype_chip/
    #tgp_pca2.txt created in https://github.com/raquelaoki/ParKCa/blob/master/src/datapreprocessing.py
    pca_scores = np.loadtxt('data_s//tgp_pca2.txt', delimiter=',')

    outcomes = []
    true_causes = []
    for sim in range(SIMULATIONS):
        # The simulation index doubles as the random seed of each step.
        raw_snps, groups = sim_genes_TGP([], [], 0, n_causes, n_units, pca_scores, 3, sim)
        snps, tc, y01 = sim_dataset(raw_snps, groups, n_causes, n_units, sim)
        labelled = add_colnames(snps, tc)
        # Drop the intermediate matrices before the next iteration allocates new ones.
        del raw_snps, snps

        labelled.to_pickle('data_s//snp_simulated_'+str(sim)+'.txt')
        outcomes.append(y01)
        true_causes.append(tc)

    def _one_column_per_simulation(per_sim_values):
        # Stack the per-simulation results and flip them so each simulation is a column.
        table = pd.DataFrame(np.transpose(np.matrix(per_sim_values)))
        table.columns = ['sim_'+str(idx) for idx in range(SIMULATIONS)]
        return table

    # Both tables are built before anything is written, as in the original flow.
    sim_y = _one_column_per_simulation(outcomes)
    sim_tc = _one_column_per_simulation(true_causes)

    sim_y.to_pickle('data_s//snp_simulated_y01.txt')
    sim_tc.to_pickle('data_s//snp_simulated_truecauses.txt')


Todo: 
- Get old data
- Read the amount of causes used by the deconfounder


7.1 Datasets 

Each dataset has 4 variables where the true effect is known. 


In [34]:
#tested 
import numpy as np 
import pandas as pd
import scipy.stats

#new simulated studies - binary nonlinear
#Reference: 
#https://github.com/JiajingZ/CopulaSensitivity/blob/CopSens/simulation/GaussianT_BinaryY_nonlinearYT/GaussianT_BinaryY_nonlinearYT_RR.R
#adapted from R to python
print('Dataset Creation')

# --- simulation settings ---
k = 4 #number of covariates (columns of the treatment matrix)
s = 1 #number of columns of the latent confounder u
B = [2,0.5, -0.4, 0.2] #per-treatment multiplier applied to the confounder u
gamma = 2.8 #multiplier of u in the outcome; if either B or gamma = 0, there is no confounding
sigma2_t = 1 #variance on treatments
sigma2_y = 1 #variance on outcome

# --- outcome model coefficients ---
tau_l = [3, -1, 1, -0.06] #linear effect
tau_nl = -4 #non linear effect
coef_true = tau_l + [tau_nl] #new list: linear effects followed by the non linear one

def g_yt(t, tau_l, tau_nl):
    '''
    Nonlinear outcome model of the treatments.

    t: n by k matrix (array-like, k >= 3); it is copied, never modified
    tau_l: length-k coefficients of the linear part
    tau_nl: coefficient of the squared first column
    outputs y, one value per row of t
    '''
    # Work on a private copy: assigning into the caller's array would
    # silently change the treatments it keeps using afterwards.
    # subok=True keeps an np.matrix input an np.matrix, so the type of the
    # returned value is the same as before.
    t = np.array(t, subok=True)

    #col 3: negative values are shrunk by 0.7
    #R reference: t[,3] = ifelse(t[,3] > 0, t[,3], 0.7*t[,3])
    col3 = np.asarray(t[:, 2]).ravel()
    t[:, 2] = np.where(col3 > 0, col3, 0.7 * col3)

    y = t.dot(tau_l) + pow(t[:, 0], 2) * tau_nl
    return y #n x 1


n = 80000 #sample size
# Latent confounder, n x s standard-normal draws.
u = np.random.normal(loc = 0, scale = 1 , size = n*s).reshape(n,s) #mu = rep(0, s), Sigma = diag(s)
#print('u.shape',u.shape)

# Treatments: the confounder scaled by B plus independent Gaussian noise.
# np.repeat copies every u value k times, so this line (and the reshape of
# u*gamma below) is only valid for s == 1.
tr = np.repeat(u,k).reshape(n,k) * B
# np.random.normal takes a standard deviation, whereas sigma2_t and sigma2_y
# are variances, hence the square roots.
tr = tr+np.random.normal(loc = 0, scale = np.sqrt(sigma2_t), size = n*k).reshape(n,k)

# g_yt receives a copy so that `tr` is guaranteed to keep the raw treatments,
# whatever g_yt does to its argument.
y_continuous = g_yt(tr.copy(), tau_l, tau_nl)+(u*gamma).reshape(n,)+ np.random.normal(loc = 0, scale = np.sqrt(sigma2_y), size = n)
y_binary = (y_continuous > 0).astype(int).tolist() #very well balanced
print('Outcomes:\n', y_binary[0:5] ,'(binary) or ',y_continuous[0:5],'(continuous)' )

tr = pd.DataFrame(tr, columns = ['t'+str(j+1) for j in range(k)])
print('Features:', tr.shape)
print(tr.head())

# aux1 is the inverse of (B B^T + sigma2_t * I), obtained by solving against the identity.
aux1 = np.linalg.solve(np.array(B).reshape(k,1)*(np.transpose(B).reshape(1,k)) + sigma2_t*np.identity(k),np.identity(k))
# From here on B is a 1 x k np.matrix, so `*` below is a matrix product.
B = np.matrix(B).reshape(1,k)
coef_mu_u_t = B.dot(aux1)

## theoretical values -------------------------------------------------------------
sigma_u_t = np.sqrt(1-B*aux1*np.transpose(B))[0,0]
sigma_ytilde_t = (np.sqrt(pow(gamma,2)*pow(sigma_u_t,2)+sigma2_y))#[0,0]
sigma_ytilde_t_do = np.sqrt( pow(gamma,2) + sigma2_y )

#print('sigma_u_t',sigma_u_t)
#print('sigma_ytilde_t', sigma_ytilde_t)
#print('sigma_ytilde_t_do', sigma_ytilde_t_do)

# true Treatment effect #
# Evaluation points: one unit vector per treatment, then the all-zero baseline.
t_choice = np.identity(k)
t2 = np.matrix(np.zeros(k))
t_grid = np.concatenate([t_choice,t2], axis = 0)
ytilde_mean_do = g_yt(np.array(t_grid),tau_l,tau_nl)
y_mean_do = scipy.stats.norm.cdf(ytilde_mean_do/sigma_ytilde_t_do)
# Ratio of each unit-vector row to the baseline row (index k).
effect_true = y_mean_do[0:k]/y_mean_do[k]
print('\nBinary Nonlinear:')

# true treatment effect bias #
#(k+1) x k , k x 1 , 1 x 1
ytilde_mean_do_bias = np.array((t_grid.dot(np.transpose(coef_mu_u_t)))*gamma)

# true observed treatment effect #
ytilde_mean_obs = ytilde_mean_do.reshape(k+1,1)+ytilde_mean_do_bias
y_mean_obs = scipy.stats.norm.cdf(ytilde_mean_obs/sigma_ytilde_t)
effect_obs = y_mean_obs[0:k]/y_mean_obs[k]


print("True effect", effect_true)
print("True obs effect", effect_obs.reshape(1,k))
# NOTE: despite its label, the next line prints the bias term ytilde_mean_do_bias.
print("True treat. effect", ytilde_mean_do_bias.reshape(1,k+1))
print("True treat. obs effect", ytilde_mean_obs.reshape(1,k+1))


print('\n Continuous Nonlinear')
#new simulated studies -  nonlinear
#Reference: 
#https://github.com/JiajingZ/CopulaSensitivity/blob/CopSens/simulation/GaussianT_nonlinearYT/GaussianT_nonlinearYT_fitobsbart.R
#adapted from R to python

# true treatment effect #
effect_true_c = g_yt(t_choice, tau_l,tau_nl) - g_yt(t2,tau_l,tau_nl)

# true treatment effect bias #
effect_bias_c = (t_choice.dot(np.transpose(coef_mu_u_t)))*gamma

# true observed treatment effect #
effect_obs_c = effect_true_c.reshape(1,k) + effect_bias_c.reshape(1,k)
print("True effect", effect_true_c)
print("True obs effect", effect_obs_c)




Dataset Creation
Outcomes:
 [0, 0, 1, 0, 1] (binary) or  [-45.99554929 -23.86717444   2.11181526  -0.7196207    0.29485913] (continuous)
Features: (80000, 4)
         t1        t2        t3        t4
0 -3.013646  0.408466  2.489331 -0.623558
1 -1.948275  0.054033 -0.723785 -0.559170
2  0.926265 -1.541654  0.288638  0.843599
3  0.209323  0.058708 -0.715242  0.634934
4  0.717897  1.157337  0.730321 -0.432559

Binary Nonlinear:
True effect [0.73661721 0.73661721 1.26338279 0.98389964]
True obs effect [[1.01406203 0.63416235 1.3890912  1.02184143]]
True treat. effect [[ 1.02752294  0.25688073 -0.20550459  0.10275229  0.        ]]
True treat. obs effect [[ 0.02752294 -0.74311927  0.79449541  0.04275229  0.        ]]

 Continuous Nonlinear
True effect [[-1.   -1.    1.   -0.06]]
True obs effect [[ 0.02752294 -0.74311927  0.79449541  0.04275229]]


Todo: 
- why is the true obs effect in the continuous case so much worse?
- make a function that simulate a dataset, and return true effects, and important things

Note: all values here are the same as in the GitHub reference