In [199]:
import sys
sys.path.insert(0, '..')
from mlconfound.stats import confound_test
from mlconfound.simulate import simulate

import numpy as np
import pandas as pd
import warnings
warnings.simplefilter("always")
import seaborn as sns
sns.set_style("whitegrid")
import matplotlib.pyplot as plt

from scipy.stats import linregress

In [200]:
#                     true signal     confounder
y, yhat, c = simulate(0.6,       0.3,     # y
                      0,         0.9,     # c
                      0.4,       0.2,  # yhat
                      n=10,                            # simulated sample size
                      random_state=42
                      )

In [201]:
confound_test(y, yhat, c, num_perms=10000, tolerance=0.01, subsample='auto',
                                return_null_dist=False, random_state=42, n_jobs=-1)

Permuting: 100%|██████████| 10000/10000 [00:03<00:00, 3200.39it/s]


ConfoundTestResults(r2_y_c=0.4421694853225511, r2_yhat_c=0.5052554018621168, r2_y_yhat=0.8734762261079274, p=0.34108527131782945, p_ci=(0.2599444488217224, 0.42970680884909984), subsample=5, found=129)

In [210]:
# generate CPT copies of X when the conditional distribution is Gaussian
# i.e. X | Z=Z_i ~ N(mu[i],sig2[i])
def generate_X_CPT_gaussian(nstep,M,X0,mu,sig2):
    log_lik_mat = - np.power(X0,2)[:,None] * (1/2/sig2)[None,:] + X0[:,None] * (mu/sig2)[None,:]
    Pi_mat = generate_X_CPT(nstep,M,log_lik_mat)
    return X0[Pi_mat]

# generate CPT copies of X, with fixed correlation to C, assuming that the conditional distribution is Gaussian
# i.e. X | Z=Z_i ~ N(mu[i],sig2[i])
def generate_X_CPT_r(nstep,M,X0,mu,sig2):
    log_lik_mat = - np.power(X0,2)[:,None] * (1/2/sig2)[None,:] + X0[:,None] * (mu/sig2)[None,:]
    Pi_mat = generate_X_CPT(nstep,M,log_lik_mat)
    return X0[Pi_mat]

# generate CPT copies of X in general case
# log_lik_mat[i,j] = q(X[i]|Z[j]) where q(x|z) is the conditional density for X|Z
def generate_X_CPT(nstep,M,log_lik_mat,Pi_init=[]):
    n = log_lik_mat.shape[0]
    if len(Pi_init)==0:
        Pi_init = np.arange(n,dtype=int)
    Pi_ = generate_X_CPT_MC(nstep,log_lik_mat,Pi_init)
    Pi_mat = np.zeros((M,n),dtype=int)
    for m in range(M):
        Pi_mat[m] = generate_X_CPT_MC(nstep,log_lik_mat,Pi_)
    return Pi_mat

def generate_X_CPT_MC(nstep,log_lik_mat,Pi):
    n = len(Pi)
    npair = np.floor(n/2).astype(int)
    for istep in range(nstep):
        perm = np.random.choice(n,n,replace=False)
        inds_i = perm[0:npair]
        inds_j = perm[npair:(2*npair)]
        # for each k=1,...,npair, decide whether to swap Pi[inds_i[k]] with Pi[inds_j[k]]
        log_odds = log_lik_mat[Pi[inds_i],inds_j] + log_lik_mat[Pi[inds_j],inds_i] \
            - log_lik_mat[Pi[inds_i],inds_i] - log_lik_mat[Pi[inds_j],inds_j]
        swaps = np.random.binomial(1,1/(1+np.exp(-np.maximum(-500,log_odds))))
        Pi[inds_i], Pi[inds_j] = Pi[inds_i] + swaps*(Pi[inds_j]-Pi[inds_i]), Pi[inds_j] - \
            swaps*(Pi[inds_j]-Pi[inds_i])   
    print(Pi)
    return Pi

In [211]:
fit = linregress(y,c)
chat = fit.intercept + fit.slope * y
fit.stderr, np.mean(np.power(chat - c, 2)/np.sqrt(c))
fit.rvalue

  This is separate from the ipykernel package so we can avoid doing imports until


0.6649582583309654

In [213]:
chat_sigma2 = np.repeat( np.power(fit.stderr/np.sqrt(len(c)), 2), len(y))
cstars = generate_X_CPT_gaussian(10, 10, c, c, chat_sigma2)
np.mean(np.corrcoef(y, cstars)[0,1:])

[4 1 2 9 6 5 0 8 7 3]
[1 4 3 9 6 5 0 8 7 2]
[1 4 3 7 6 5 0 8 9 2]
[7 4 3 9 6 8 0 5 1 2]
[7 4 3 9 6 5 0 8 1 2]
[1 4 3 9 0 5 6 8 7 2]
[7 4 3 9 0 5 6 8 1 2]
[7 4 3 9 0 5 6 8 1 2]
[7 4 3 9 6 8 0 5 1 2]
[7 4 3 9 0 8 6 5 1 2]
[4 7 3 9 0 8 6 5 1 2]


0.6611899684265413

In [183]:
for cstar in cstars:
    print(np.corrcoef(y, cstar)[0,1])

0.43861628992598256
0.4456080322723865
0.46191447245052797
0.45273835396310147
0.4567514424863113
0.4548846586313773
0.4525347955089826
0.4548414963557989
0.4582033703215801
0.4615762882477105


In [197]:
np.cov(y, c.transpose())

array([[0.42228409, 0.27088145],
       [0.27088145, 0.8434937 ]])

In [198]:
y

array([ 5.35359425e-01, -4.17895310e-01,  3.66941320e-01,  1.46616788e-01,
       -2.08115314e+00, -8.94484412e-01,  5.92109511e-01, -6.35936329e-02,
        2.17990375e-02, -3.59067643e-01,  6.61425959e-02,  4.32290990e-01,
        1.59022185e-01,  4.50767859e-01,  3.49514794e-01, -3.91152781e-01,
        3.50260620e-01, -6.50638719e-01,  7.64897364e-01,  2.65272464e-01,
       -5.28763666e-01, -9.49834025e-01,  1.11084755e+00, -1.68959298e-01,
        1.21703453e-01, -2.36152940e-01,  8.75751168e-01,  2.41673341e-01,
        1.11638847e-01,  7.89636166e-01,  1.23199344e+00, -4.95246482e-01,
       -3.43766925e-01, -4.69046261e-01,  1.00735471e-01,  7.72681871e-01,
       -1.55077577e-01, -3.28484857e-01, -1.30599329e+00,  3.06022805e-01,
       -8.60779249e-02,  1.62015982e-01, -3.01310515e-01,  7.50329906e-02,
        1.30383247e-01,  5.97918142e-02,  4.05799928e-01, -2.48727493e-01,
        6.74522203e-01,  2.78898143e-01,  1.32277195e-01,  4.73298716e-01,
       -8.43540227e-01,  