## ELEMENT: $\sigma$ bootstrap estimation.

## Preliminaries

In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import scipy.stats as st
import scipy.special as sp
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm

from pygam import LinearGAM, s
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

from fastkde import fastKDE

In [2]:
def affine(x):
    y = (x - np.min(x))/(np.max(x) - np.min(x))
    return y

In [3]:
def estimator_strata(x, alpha = 0.05):
    
    if(x.shape[0]%2!=0):
        x = x[:-1]
        
    x = np.column_stack((affine(x[:,0]), affine(x[:,1])))
    estim, inf = np.split(x, 2) #split data up into two halves
    
    ## first split used for density estimation
    margin_x = fastKDE.pdf_at_points(var1 = estim[:,0], list_of_points = list(inf[:,0]))
    margin_y = fastKDE.pdf_at_points(var1 = estim[:,1], list_of_points = list(inf[:,1]))
    select = np.logical_and(margin_x > 0, margin_y > 0)
    margin_y = margin_y[select]
    margin_x = margin_x[select]
    
    h_x1 = -np.mean(np.log(margin_x))
    h_y1 = -np.mean(np.log(margin_y))
    covar1 = np.cov(np.log(margin_x), np.log(margin_y))
    delta_var1 = covar1[0,0] + covar1[1,1] - 2*covar1[0,1]
    
    
    ## second split used for density estimation
    margin_x = fastKDE.pdf_at_points(var1 = inf[:,0], list_of_points = list(estim[:,0]))
    margin_y = fastKDE.pdf_at_points(var1 = inf[:,1], list_of_points = list(estim[:,1]))
    select = np.logical_and(margin_x > 0, margin_y > 0)
    margin_y = margin_y[select]
    margin_x = margin_x[select]
    
    h_x2 = -np.mean(np.log(margin_x))
    h_y2 = -np.mean(np.log(margin_y))
    covar2 = np.cov(np.log(margin_x), np.log(margin_y))
    delta_var2 = covar2[0,0] + covar2[1,1] - 2*covar2[0,1]
    
    ## cross fitting
    h_x = (h_x1 + h_x2)/2
    h_y = (h_y1 + h_y2)/2
    delta = (h_x - h_y)
    
    ## variance estimation using monte carlo
    delta_var = (delta_var1 + delta_var2)/2
    delta_sd = np.sqrt(delta_var)
    
    delta_lcb = delta - st.norm.ppf(1 - alpha/2)*delta_sd/np.sqrt(len(select))
    delta_ucb = delta + st.norm.ppf(1 - alpha/2)*delta_sd/np.sqrt(len(select))
    
    return ([delta, delta_lcb, delta_ucb])

In [4]:
def estimator(x, alpha = 0.05):
    
    if(x.shape[0]%2!=0):
        x = x[:-1]
    
    x = np.column_stack((affine(x[:,0]), affine(x[:,1])))
    estim, inf = np.split(x, 2) #split data up into two halves
    
    ## first split used for density estimation
    margin_x = fastKDE.pdf_at_points(var1 = estim[:,0], list_of_points = list(inf[:,0]))
    margin_y = fastKDE.pdf_at_points(var1 = estim[:,1], list_of_points = list(inf[:,1]))
    select = np.logical_and(margin_x > 0, margin_y > 0)
    margin_y = margin_y[select]
    margin_x = margin_x[select]
    
    h_x1 = -np.mean(np.log(margin_x))
    h_y1 = -np.mean(np.log(margin_y))
    covar1 = np.cov(np.log(margin_x), np.log(margin_y))
    delta_var1 = covar1[0,0] + covar1[1,1] - 2*covar1[0,1]
    
    
    ## second split used for density estimation
    margin_x = fastKDE.pdf_at_points(var1 = inf[:,0], list_of_points = list(estim[:,0]))
    margin_y = fastKDE.pdf_at_points(var1 = inf[:,1], list_of_points = list(estim[:,1]))
    select = np.logical_and(margin_x > 0, margin_y > 0)
    margin_y = margin_y[select]
    margin_x = margin_x[select]
    
    h_x2 = -np.mean(np.log(margin_x))
    h_y2 = -np.mean(np.log(margin_y))
    covar2 = np.cov(np.log(margin_x), np.log(margin_y))
    delta_var2 = covar2[0,0] + covar2[1,1] - 2*covar2[0,1]
    
    ## cross fitting
    h_x = (h_x1 + h_x2)/2
    h_y = (h_y1 + h_y2)/2
    delta = (h_x - h_y)
    
    ## variance estimation using monte carlo
    delta_var = (delta_var1 + delta_var2)/2
    delta_sd = np.sqrt(delta_var)
    
    delta_lcb = delta - st.norm.ppf(1 - alpha/2)*delta_sd/np.sqrt(len(select))
    delta_ucb = delta + st.norm.ppf(1 - alpha/2)*delta_sd/np.sqrt(len(select))
    
    return ([delta, delta_sd, x.shape[0]])

In [5]:
def fisher_inf(y_pred):
    y_vals = np.linspace(np.min(y_pred), np.max(y_pred), 500)
    kde = st.gaussian_kde(y_pred)
    pdf_values = kde.pdf(y_vals)
    log_likelihood_values = np.log(pdf_values)
    log_likelihood_prime = np.gradient(log_likelihood_values, y_vals)
    log_likelihood_double_prime = np.gradient(log_likelihood_prime, y_vals)
    fisher_information = -np.nanmean(log_likelihood_double_prime)
    return(fisher_information)

In [6]:
bps = pd.read_csv("/home/soumikp/enar_2023/data/outcome_bp.csv")

## FGF5 

In [7]:
dnam = pd.read_csv("/home/soumikp/enar_2023/data/dat_FGF5.csv").iloc[:,1:22].mean(axis=1)
folio = pd.read_csv("/home/soumikp/enar_2023/data/dat_FGF5.csv").iloc[:,23]
data = pd.concat([folio, dnam], axis=1)
data = pd.merge(data, bps, on = 'foliocc', how='inner').dropna()
data.columns = ['id', 'dnam', 'sex', 'bps', 'bpd']
data_m = data[data['sex'] == 0]
data_f = data[data['sex'] == 1]

### Estimates for BPS vs DNAm for females

In [8]:
temp = data_f
temp['bp'] = temp['bps']

np.random.seed(42)
niter = 250
mse = np.zeros(niter)
bound = np.zeros(niter)

for i in range(niter):
    bs_temp = temp.sample(n = len(temp), replace = True)
    bs_temp['bp'] = affine(bs_temp['bp'])
    bs_temp['dnam'] = affine(bs_temp['dnam'])
    
    # Create and fit the GAM model with cubic splines
    gam = LinearGAM(s(0, spline_order = 1)).fit(bs_temp['bp'], bs_temp['dnam'])

    # Make predictions on the test set
    y_pred = affine(gam.predict(bs_temp['bp'])) # this is my proxy for uncontaminated Y

    # Evaluate the model 
    mse[i] = np.std(gam.deviance_residuals(bs_temp['bp'], bs_temp['dnam']))
    
    # Fisher information
    fisher_information = fisher_inf(y_pred)
    
    # Bound of error variance
    bound[i] = (np.exp(2*estimator(np.column_stack((bs_temp['bp'], bs_temp['dnam'])))[0]) - 1)/fisher_information 
    
print(np.round(np.percentile(bound-mse, [2.5, 50, 97.5]), 3))
print(np.round(np.percentile(bound-mse, [5]), 3))

[-0.139 -0.002  0.317]
[-0.124]


### Estimates for BPS vs DNAm for males

In [9]:
temp = data_m
temp['bp'] = temp['bps']

np.random.seed(42)
niter = 250
mse = np.zeros(niter)
bound = np.zeros(niter)

for i in range(niter):
    bs_temp = temp.sample(n = len(temp), replace = True)
    bs_temp['bp'] = affine(bs_temp['bp'])
    bs_temp['dnam'] = affine(bs_temp['dnam'])
    
    # Create and fit the GAM model with cubic splines
    gam = LinearGAM(s(0, spline_order = 1)).fit(bs_temp['bp'], bs_temp['dnam'])

    # Make predictions on the test set
    y_pred = affine(gam.predict(bs_temp['bp'])) # this is my proxy for uncontaminated Y

    # Evaluate the model 
    mse[i] = np.std(gam.deviance_residuals(bs_temp['bp'], bs_temp['dnam']))
    
    # Fisher information
    fisher_information = fisher_inf(y_pred)
    
    # Bound of error variance
    bound[i] = (np.exp(2*estimator(np.column_stack((bs_temp['bp'], bs_temp['dnam'])))[0]) - 1)/fisher_information 
    
print(np.round(np.percentile(bound-mse, [2.5, 50, 97.5]), 4))
print(np.round(np.percentile(bound-mse, [5]), 3))

[-0.1689 -0.0907 -0.0065]
[-0.155]


### Estimates for BPS vs DNAm for combined 

In [10]:
temp = data
temp['bp'] = temp['bps']

np.random.seed(42)
niter = 250
mse = np.zeros(niter)
bound = np.zeros(niter)

for i in range(niter):
    bs_temp = temp.sample(n = len(temp), replace = True)
    bs_temp['bp'] = affine(bs_temp['bp'])
    bs_temp['dnam'] = affine(bs_temp['dnam'])
    
    # Create and fit the GAM model with cubic splines
    gam = LinearGAM(s(0, spline_order = 1)).fit(bs_temp['bp'], bs_temp['dnam'])

    # Make predictions on the test set
    y_pred = affine(gam.predict(bs_temp['bp'])) # this is my proxy for uncontaminated Y

    # Evaluate the model 
    mse[i] = np.std(gam.deviance_residuals(bs_temp['bp'], bs_temp['dnam']))
    
    # Fisher information
    fisher_information = fisher_inf(y_pred)
    
    # Bound of error variance
    bound[i] = (np.exp(2*estimator(np.column_stack((bs_temp['bp'], bs_temp['dnam'])))[0]) - 1)/fisher_information 
    
print(np.round(np.percentile(bound-mse, [2.5, 50, 97.5]), 3))
print(np.round(np.percentile(bound-mse, [5]), 3))

[-0.145 -0.045  0.106]
[-0.124]


### Estimates for BPD vs DNAm for females

In [11]:
temp = data_f
temp['bp'] = temp['bpd']

np.random.seed(42)
niter = 250
mse = np.zeros(niter)
bound = np.zeros(niter)

for i in range(niter):
    bs_temp = temp.sample(n = len(temp), replace = True)
    bs_temp['bp'] = affine(bs_temp['bp'])
    bs_temp['dnam'] = affine(bs_temp['dnam'])
    
    # Create and fit the GAM model with cubic splines
    gam = LinearGAM(s(0, spline_order = 1)).fit(bs_temp['bp'], bs_temp['dnam'])

    # Make predictions on the test set
    y_pred = affine(gam.predict(bs_temp['bp'])) # this is my proxy for uncontaminated Y

    # Evaluate the model 
    mse[i] = np.std(gam.deviance_residuals(bs_temp['bp'], bs_temp['dnam']))
    
    # Fisher information
    fisher_information = fisher_inf(y_pred)
    
    # Bound of error variance
    bound[i] = (np.exp(2*estimator(np.column_stack((bs_temp['bp'], bs_temp['dnam'])))[0]) - 1)/fisher_information 
    
print(np.round(np.percentile(bound-mse, [2.5, 50, 97.5]), 3))
print(np.round(np.percentile(bound-mse, [5]), 3))

[-0.155 -0.009  0.848]
[-0.138]


### Estimates for BPD vs DNAm for males

In [12]:
temp = data_m
temp['bp'] = temp['bpd']

np.random.seed(42)
niter = 250
mse = np.zeros(niter)
bound = np.zeros(niter)

for i in range(niter):
    bs_temp = temp.sample(n = len(temp), replace = True)
    bs_temp['bp'] = affine(bs_temp['bp'])
    bs_temp['dnam'] = affine(bs_temp['dnam'])
    
    # Create and fit the GAM model with cubic splines
    gam = LinearGAM(s(0, spline_order = 1)).fit(bs_temp['bp'], bs_temp['dnam'])

    # Make predictions on the test set
    y_pred = affine(gam.predict(bs_temp['bp'])) # this is my proxy for uncontaminated Y

    # Evaluate the model 
    mse[i] = np.std(gam.deviance_residuals(bs_temp['bp'], bs_temp['dnam']))
    
    # Fisher information
    fisher_information = fisher_inf(y_pred)
    
    # Bound of error variance
    bound[i] = (np.exp(2*estimator(np.column_stack((bs_temp['bp'], bs_temp['dnam'])))[0]) - 1)/fisher_information 
    
print(np.round(np.percentile(bound-mse, [2.5, 50, 97.5]), 3))
print(np.round(np.percentile(bound-mse, [5]), 3))

[-0.186 -0.09   0.108]
[-0.161]


### Estimates for BPD vs DNAm for combined

In [13]:
temp = data
temp['bp'] = temp['bpd']

np.random.seed(42)
niter = 250
mse = np.zeros(niter)
bound = np.zeros(niter)

for i in range(niter):
    bs_temp = temp.sample(n = len(temp), replace = True)
    bs_temp['bp'] = affine(bs_temp['bp'])
    bs_temp['dnam'] = affine(bs_temp['dnam'])
    
    # Create and fit the GAM model with cubic splines
    gam = LinearGAM(s(0, spline_order = 1)).fit(bs_temp['bp'], bs_temp['dnam'])

    # Make predictions on the test set
    y_pred = affine(gam.predict(bs_temp['bp'])) # this is my proxy for uncontaminated Y

    # Evaluate the model 
    mse[i] = np.std(gam.deviance_residuals(bs_temp['bp'], bs_temp['dnam']))
    
    # Fisher information
    fisher_information = fisher_inf(y_pred)
    
    # Bound of error variance
    bound[i] = (np.exp(2*estimator(np.column_stack((bs_temp['bp'], bs_temp['dnam'])))[0]) - 1)/fisher_information 
    
print(np.round(np.percentile(bound-mse, [2.5, 50, 97.5]), 3))
print(np.round(np.percentile(bound-mse, [5]), 3))

[-0.16  -0.041  0.207]
[-0.134]


## HSD11B2

In [14]:
dnam = pd.read_csv("/home/soumikp/enar_2023/data/dat_HSD11B2.csv").iloc[:,0:21].mean(axis=1)
folio = pd.read_csv("/home/soumikp/enar_2023/data/dat_HSD11B2.csv").iloc[:,22]
data = pd.concat([folio, dnam], axis=1)
data = pd.merge(data, bps, on = 'foliocc', how='inner').dropna()
data.columns = ['id', 'dnam', 'sex', 'bps', 'bpd']
data_m = data[data['sex'] == 0]
data_f = data[data['sex'] == 1]

### Estimates for BPS vs DNAm for females

In [15]:
temp = data_f
temp['bp'] = temp['bps']

np.random.seed(42)
niter = 250
mse = np.zeros(niter)
bound = np.zeros(niter)

for i in range(niter):
    bs_temp = temp.sample(n = len(temp), replace = True)
    bs_temp['bp'] = affine(bs_temp['bp'])
    bs_temp['dnam'] = affine(bs_temp['dnam'])
    
    # Create and fit the GAM model with cubic splines
    gam = LinearGAM(s(0, spline_order = 1)).fit(bs_temp['bp'], bs_temp['dnam'])

    # Make predictions on the test set
    y_pred = affine(gam.predict(bs_temp['bp'])) # this is my proxy for uncontaminated Y

    # Evaluate the model 
    mse[i] = np.std(gam.deviance_residuals(bs_temp['bp'], bs_temp['dnam']))
    
    # Fisher information
    fisher_information = fisher_inf(y_pred)
    
    # Bound of error variance
    bound[i] = (np.exp(2*estimator(np.column_stack((bs_temp['bp'], bs_temp['dnam'])))[0]) - 1)/fisher_information 
    
print(np.round(np.percentile(bound-mse, [2.5, 50, 97.5]), 3))
print(np.round(np.percentile(bound-mse, [5]), 3))

[-0.105  0.021  0.528]
[-0.072]


### Estimates for BPS vs DNAm for males

In [16]:
temp = data_m
temp['bp'] = temp['bps']

np.random.seed(42)
niter = 250
mse = np.zeros(niter)
bound = np.zeros(niter)

for i in range(niter):
    bs_temp = temp.sample(n = len(temp), replace = True)
    bs_temp['bp'] = affine(bs_temp['bp'])
    bs_temp['dnam'] = affine(bs_temp['dnam'])
    
    # Create and fit the GAM model with cubic splines
    gam = LinearGAM(s(0, spline_order = 1)).fit(bs_temp['bp'], bs_temp['dnam'])

    # Make predictions on the test set
    y_pred = affine(gam.predict(bs_temp['bp'])) # this is my proxy for uncontaminated Y

    # Evaluate the model 
    mse[i] = np.std(gam.deviance_residuals(bs_temp['bp'], bs_temp['dnam']))
    
    # Fisher information
    fisher_information = fisher_inf(y_pred)
    
    # Bound of error variance
    bound[i] = (np.exp(2*estimator(np.column_stack((bs_temp['bp'], bs_temp['dnam'])))[0]) - 1)/fisher_information 
    
print(np.round(np.percentile(bound-mse, [2.5, 50, 97.5]), 3))
print(np.round(np.percentile(bound-mse, [5]), 3))

[-2.611 -0.016  1.671]
[-0.792]


### Estimates for BPS vs DNAm for combined

In [17]:
temp = data
temp['bp'] = temp['bps']

np.random.seed(42)
niter = 250
mse = np.zeros(niter)
bound = np.zeros(niter)

for i in range(niter):
    bs_temp = temp.sample(n = len(temp), replace = True)
    bs_temp['bp'] = affine(bs_temp['bp'])
    bs_temp['dnam'] = affine(bs_temp['dnam'])
    
    # Create and fit the GAM model with cubic splines
    gam = LinearGAM(s(0, spline_order = 1)).fit(bs_temp['bp'], bs_temp['dnam'])

    # Make predictions on the test set
    y_pred = affine(gam.predict(bs_temp['bp'])) # this is my proxy for uncontaminated Y

    # Evaluate the model 
    mse[i] = np.std(gam.deviance_residuals(bs_temp['bp'], bs_temp['dnam']))
    
    # Fisher information
    fisher_information = fisher_inf(y_pred)
    
    # Bound of error variance
    bound[i] = (np.exp(2*estimator(np.column_stack((bs_temp['bp'], bs_temp['dnam'])))[0]) - 1)/fisher_information 
    
print(np.round(np.percentile(bound-mse, [2.5, 50, 97.5]), 3))
print(np.round(np.percentile(bound-mse, [5]), 3))

[-3.4    0.057  3.498]
[-2.383]


### Estimates of BPD vs DNAm for females

In [21]:
temp = data_f
temp['bp'] = temp['bpd']

np.random.seed(42)
niter = 250
mse = np.zeros(niter)
bound = np.zeros(niter)

for i in range(niter):
    bs_temp = temp.sample(n = len(temp), replace = True)
    bs_temp['bp'] = affine(bs_temp['bp'])
    bs_temp['dnam'] = affine(bs_temp['dnam'])
    
    # Create and fit the GAM model with cubic splines
    gam = LinearGAM(s(0, spline_order = 1)).fit(bs_temp['bp'], bs_temp['dnam'])

    # Make predictions on the test set
    y_pred = affine(gam.predict(bs_temp['bp'])) # this is my proxy for uncontaminated Y

    # Evaluate the model 
    mse[i] = np.std(gam.deviance_residuals(bs_temp['bp'], bs_temp['dnam']))
    
    # Fisher information
    fisher_information = fisher_inf(y_pred)
    
    # Bound of error variance
    bound[i] = (np.exp(2*estimator(np.column_stack((bs_temp['bp'], bs_temp['dnam'])))[0]) - 1)/fisher_information 
    
print(np.round(np.percentile(bound-mse, [2.5, 50, 97.5]), 3))
print(np.round(np.percentile(bound-mse, [5]), 3))

[-0.101  0.017  0.691]
[-0.08]


### Estimates of BPD vs DNAm for males

In [22]:
temp = data_m
temp['bp'] = temp['bpd']

np.random.seed(42)
niter = 250
mse = np.zeros(niter)
bound = np.zeros(niter)

for i in range(niter):
    bs_temp = temp.sample(n = len(temp), replace = True)
    bs_temp['bp'] = affine(bs_temp['bp'])
    bs_temp['dnam'] = affine(bs_temp['dnam'])
    
    # Create and fit the GAM model with cubic splines
    gam = LinearGAM(s(0, spline_order = 1)).fit(bs_temp['bp'], bs_temp['dnam'])

    # Make predictions on the test set
    y_pred = affine(gam.predict(bs_temp['bp'])) # this is my proxy for uncontaminated Y

    # Evaluate the model 
    mse[i] = np.std(gam.deviance_residuals(bs_temp['bp'], bs_temp['dnam']))
    
    # Fisher information
    fisher_information = fisher_inf(y_pred)
    
    # Bound of error variance
    bound[i] = (np.exp(2*estimator(np.column_stack((bs_temp['bp'], bs_temp['dnam'])))[0]) - 1)/fisher_information 
    
print(np.round(np.percentile(bound-mse, [2.5, 50, 97.5]), 3))
print(np.round(np.percentile(bound-mse, [5]), 3))

[-0.162 -0.056  0.277]
[-0.148]


### Estimates for BPS vs DNAm for combined

In [23]:
temp = data
temp['bp'] = temp['bpd']

np.random.seed(42)
niter = 250
mse = np.zeros(niter)
bound = np.zeros(niter)

for i in range(niter):
    bs_temp = temp.sample(n = len(temp), replace = True)
    bs_temp['bp'] = affine(bs_temp['bp'])
    bs_temp['dnam'] = affine(bs_temp['dnam'])
    
    # Create and fit the GAM model with cubic splines
    gam = LinearGAM(s(0, spline_order = 1)).fit(bs_temp['bp'], bs_temp['dnam'])

    # Make predictions on the test set
    y_pred = affine(gam.predict(bs_temp['bp'])) # this is my proxy for uncontaminated Y

    # Evaluate the model 
    mse[i] = np.std(gam.deviance_residuals(bs_temp['bp'], bs_temp['dnam']))
    
    # Fisher information
    fisher_information = fisher_inf(y_pred)
    
    # Bound of error variance
    bound[i] = (np.exp(2*estimator(np.column_stack((bs_temp['bp'], bs_temp['dnam'])))[0]) - 1)/fisher_information 
    
print(np.round(np.percentile(bound-mse, [2.5, 50, 97.5]), 3))
print(np.round(np.percentile(bound-mse, [5]), 3))

[-3.4    0.057  3.498]
[-2.383]


## ARHGAP42

In [26]:
dnam = pd.read_csv("/home/soumikp/enar_2023/data/dat_ARHGAP42.csv").iloc[:,1:44].mean(axis=1)
folio = pd.read_csv("/home/soumikp/enar_2023/data/dat_ARHGAP42.csv").iloc[:,45]
data = pd.concat([folio, dnam], axis=1)
data = pd.merge(data, bps, on = 'foliocc', how='inner').dropna()
data.columns = ['id', 'dnam', 'sex', 'bps', 'bpd']
data_m = data[data['sex'] == 0]
data_f = data[data['sex'] == 1]

### Estimates of BPS vs DNAm for females

In [27]:
temp = data_f
temp['bp'] = temp['bps']

np.random.seed(42)
niter = 250
mse = np.zeros(niter)
bound = np.zeros(niter)

for i in range(niter):
    bs_temp = temp.sample(n = len(temp), replace = True)
    bs_temp['bp'] = affine(bs_temp['bp'])
    bs_temp['dnam'] = affine(bs_temp['dnam'])
    
    # Create and fit the GAM model with cubic splines
    gam = LinearGAM(s(0, spline_order = 1)).fit(bs_temp['bp'], bs_temp['dnam'])

    # Make predictions on the test set
    y_pred = affine(gam.predict(bs_temp['bp'])) # this is my proxy for uncontaminated Y

    # Evaluate the model 
    mse[i] = np.std(gam.deviance_residuals(bs_temp['bp'], bs_temp['dnam']))
    
    # Fisher information
    fisher_information = fisher_inf(y_pred)
    
    # Bound of error variance
    bound[i] = (np.exp(2*estimator(np.column_stack((bs_temp['bp'], bs_temp['dnam'])))[0]) - 1)/fisher_information 
    
print(np.round(np.percentile(bound-mse, [2.5, 50, 97.5]), 3))
print(np.round(np.percentile(bound-mse, [5]), 3))

[-0.827 -0.108  0.262]
[-0.281]


### Estimates of BPS vs DNAm for males

In [28]:
temp = data_m
temp['bp'] = temp['bps']

np.random.seed(42)
niter = 250
mse = np.zeros(niter)
bound = np.zeros(niter)

for i in range(niter):
    bs_temp = temp.sample(n = len(temp), replace = True)
    bs_temp['bp'] = affine(bs_temp['bp'])
    bs_temp['dnam'] = affine(bs_temp['dnam'])
    
    # Create and fit the GAM model with cubic splines
    gam = LinearGAM(s(0, spline_order = 1)).fit(bs_temp['bp'], bs_temp['dnam'])

    # Make predictions on the test set
    y_pred = affine(gam.predict(bs_temp['bp'])) # this is my proxy for uncontaminated Y

    # Evaluate the model 
    mse[i] = np.std(gam.deviance_residuals(bs_temp['bp'], bs_temp['dnam']))
    
    # Fisher information
    fisher_information = fisher_inf(y_pred)
    
    # Bound of error variance
    bound[i] = (np.exp(2*estimator(np.column_stack((bs_temp['bp'], bs_temp['dnam'])))[0]) - 1)/fisher_information 
    
print(np.round(np.percentile(bound-mse, [2.5, 50, 97.5]), 3))
print(np.round(np.percentile(bound-mse, [5]), 3))

[-0.191 -0.157 -0.101]
[-0.184]


### Estimates of BPS vs DNAm for combined

In [29]:
temp = data
temp['bp'] = temp['bps']

np.random.seed(42)
niter = 250
mse = np.zeros(niter)
bound = np.zeros(niter)

for i in range(niter):
    bs_temp = temp.sample(n = len(temp), replace = True)
    bs_temp['bp'] = affine(bs_temp['bp'])
    bs_temp['dnam'] = affine(bs_temp['dnam'])
    
    # Create and fit the GAM model with cubic splines
    gam = LinearGAM(s(0, spline_order = 1)).fit(bs_temp['bp'], bs_temp['dnam'])

    # Make predictions on the test set
    y_pred = affine(gam.predict(bs_temp['bp'])) # this is my proxy for uncontaminated Y

    # Evaluate the model 
    mse[i] = np.std(gam.deviance_residuals(bs_temp['bp'], bs_temp['dnam']))
    
    # Fisher information
    fisher_information = fisher_inf(y_pred)
    
    # Bound of error variance
    bound[i] = (np.exp(2*estimator(np.column_stack((bs_temp['bp'], bs_temp['dnam'])))[0]) - 1)/fisher_information 
    
print(np.round(np.percentile(bound-mse, [2.5, 50, 97.5]), 3))
print(np.round(np.percentile(bound-mse, [5]), 3))

[-0.175 -0.116 -0.008]
[-0.166]


### Estimates of BPD vs DNAm for females

In [31]:
temp = data_f
temp['bp'] = temp['bpd']

np.random.seed(42)
niter = 250
mse = np.zeros(niter)
bound = np.zeros(niter)

for i in range(niter):
    bs_temp = temp.sample(n = len(temp), replace = True)
    bs_temp['bp'] = affine(bs_temp['bp'])
    bs_temp['dnam'] = affine(bs_temp['dnam'])
    
    # Create and fit the GAM model with cubic splines
    gam = LinearGAM(s(0, spline_order = 1)).fit(bs_temp['bp'], bs_temp['dnam'])

    # Make predictions on the test set
    y_pred = affine(gam.predict(bs_temp['bp'])) # this is my proxy for uncontaminated Y

    # Evaluate the model 
    mse[i] = np.std(gam.deviance_residuals(bs_temp['bp'], bs_temp['dnam']))
    
    # Fisher information
    fisher_information = fisher_inf(y_pred)
    
    # Bound of error variance
    bound[i] = (np.exp(2*estimator(np.column_stack((bs_temp['bp'], bs_temp['dnam'])))[0]) - 1)/fisher_information 
    
print(np.round(np.percentile(bound-mse, [2.5, 50, 97.5]), 3))
print(np.round(np.percentile(bound-mse, [5]), 3))

[-0.896 -0.113  0.523]
[-0.433]


### Estimates of BPD vs DNAm for males

In [32]:
temp = data_m
temp['bp'] = temp['bpd']

np.random.seed(42)
niter = 250
mse = np.zeros(niter)
bound = np.zeros(niter)

for i in range(niter):
    bs_temp = temp.sample(n = len(temp), replace = True)
    bs_temp['bp'] = affine(bs_temp['bp'])
    bs_temp['dnam'] = affine(bs_temp['dnam'])
    
    # Create and fit the GAM model with cubic splines
    gam = LinearGAM(s(0, spline_order = 1)).fit(bs_temp['bp'], bs_temp['dnam'])

    # Make predictions on the test set
    y_pred = affine(gam.predict(bs_temp['bp'])) # this is my proxy for uncontaminated Y

    # Evaluate the model 
    mse[i] = np.std(gam.deviance_residuals(bs_temp['bp'], bs_temp['dnam']))
    
    # Fisher information
    fisher_information = fisher_inf(y_pred)
    
    # Bound of error variance
    bound[i] = (np.exp(2*estimator(np.column_stack((bs_temp['bp'], bs_temp['dnam'])))[0]) - 1)/fisher_information 
    
print(np.round(np.percentile(bound-mse, [2.5, 50, 97.5]), 3))
print(np.round(np.percentile(bound-mse, [5]), 3))

[-0.226 -0.167 -0.026]
[-0.207]


### Estimates of BPD vs DNAm for combined

In [33]:
temp = data
temp['bp'] = temp['bpd']

np.random.seed(42)
niter = 250
mse = np.zeros(niter)
bound = np.zeros(niter)

for i in range(niter):
    bs_temp = temp.sample(n = len(temp), replace = True)
    bs_temp['bp'] = affine(bs_temp['bp'])
    bs_temp['dnam'] = affine(bs_temp['dnam'])
    
    # Create and fit the GAM model with cubic splines
    gam = LinearGAM(s(0, spline_order = 1)).fit(bs_temp['bp'], bs_temp['dnam'])

    # Make predictions on the test set
    y_pred = affine(gam.predict(bs_temp['bp'])) # this is my proxy for uncontaminated Y

    # Evaluate the model 
    mse[i] = np.std(gam.deviance_residuals(bs_temp['bp'], bs_temp['dnam']))
    
    # Fisher information
    fisher_information = fisher_inf(y_pred)
    
    # Bound of error variance
    bound[i] = (np.exp(2*estimator(np.column_stack((bs_temp['bp'], bs_temp['dnam'])))[0]) - 1)/fisher_information 
    
print(np.round(np.percentile(bound-mse, [2.5, 50, 97.5]), 3))
print(np.round(np.percentile(bound-mse, [5]), 3))

[-0.566 -0.122  0.186]
[-0.252]


## ATP2B1

In [35]:
dnam = pd.read_csv("/home/soumikp/enar_2023/data/dat_ATP2B1.csv").iloc[:,1:22].mean(axis=1)
folio = pd.read_csv("/home/soumikp/enar_2023/data/dat_ATP2B1.csv").iloc[:,23]
data = pd.concat([folio, dnam], axis=1)
data = pd.merge(data, bps, on = 'foliocc', how='inner').dropna()
data.columns = ['id', 'dnam', 'sex', 'bps', 'bpd']
data_m = data[data['sex'] == 0]
data_f = data[data['sex'] == 1]

### Estimates of BPS vs DNAm for females

In [36]:
temp = data_f
temp['bp'] = temp['bps']

np.random.seed(42)
niter = 250
mse = np.zeros(niter)
bound = np.zeros(niter)

for i in range(niter):
    bs_temp = temp.sample(n = len(temp), replace = True)
    bs_temp['bp'] = affine(bs_temp['bp'])
    bs_temp['dnam'] = affine(bs_temp['dnam'])
    
    # Create and fit the GAM model with cubic splines
    gam = LinearGAM(s(0, spline_order = 1)).fit(bs_temp['bp'], bs_temp['dnam'])

    # Make predictions on the test set
    y_pred = affine(gam.predict(bs_temp['bp'])) # this is my proxy for uncontaminated Y

    # Evaluate the model 
    mse[i] = np.std(gam.deviance_residuals(bs_temp['bp'], bs_temp['dnam']))
    
    # Fisher information
    fisher_information = fisher_inf(y_pred)
    
    # Bound of error variance
    bound[i] = (np.exp(2*estimator(np.column_stack((bs_temp['bp'], bs_temp['dnam'])))[0]) - 1)/fisher_information 
    
print(np.round(np.percentile(bound-mse, [2.5, 50, 97.5]), 3))
print(np.round(np.percentile(bound-mse, [5]), 3))

[-0.284 -0.106  0.163]
[-0.221]


### Estimates of BPS vs DNAm for males

In [37]:
temp = data_m
temp['bp'] = temp['bps']

np.random.seed(42)
niter = 250
mse = np.zeros(niter)
bound = np.zeros(niter)

for i in range(niter):
    bs_temp = temp.sample(n = len(temp), replace = True)
    bs_temp['bp'] = affine(bs_temp['bp'])
    bs_temp['dnam'] = affine(bs_temp['dnam'])
    
    # Create and fit the GAM model with cubic splines
    gam = LinearGAM(s(0, spline_order = 1)).fit(bs_temp['bp'], bs_temp['dnam'])

    # Make predictions on the test set
    y_pred = affine(gam.predict(bs_temp['bp'])) # this is my proxy for uncontaminated Y

    # Evaluate the model 
    mse[i] = np.std(gam.deviance_residuals(bs_temp['bp'], bs_temp['dnam']))
    
    # Fisher information
    fisher_information = fisher_inf(y_pred)
    
    # Bound of error variance
    bound[i] = (np.exp(2*estimator(np.column_stack((bs_temp['bp'], bs_temp['dnam'])))[0]) - 1)/fisher_information 
    
print(np.round(np.percentile(bound-mse, [2.5, 50, 97.5]), 3))
print(np.round(np.percentile(bound-mse, [5]), 3))

[-0.184 -0.103  0.16 ]
[-0.163]


### Estimates of BPS vs DNAm for combined

In [38]:
temp = data
temp['bp'] = temp['bps']

np.random.seed(42)
niter = 250
mse = np.zeros(niter)
bound = np.zeros(niter)

for i in range(niter):
    bs_temp = temp.sample(n = len(temp), replace = True)
    bs_temp['bp'] = affine(bs_temp['bp'])
    bs_temp['dnam'] = affine(bs_temp['dnam'])
    
    # Create and fit the GAM model with cubic splines
    gam = LinearGAM(s(0, spline_order = 1)).fit(bs_temp['bp'], bs_temp['dnam'])

    # Make predictions on the test set
    y_pred = affine(gam.predict(bs_temp['bp'])) # this is my proxy for uncontaminated Y

    # Evaluate the model 
    mse[i] = np.std(gam.deviance_residuals(bs_temp['bp'], bs_temp['dnam']))
    
    # Fisher information
    fisher_information = fisher_inf(y_pred)
    
    # Bound of error variance
    bound[i] = (np.exp(2*estimator(np.column_stack((bs_temp['bp'], bs_temp['dnam'])))[0]) - 1)/fisher_information 
    
print(np.round(np.percentile(bound-mse, [2.5, 50, 97.5]), 3))
print(np.round(np.percentile(bound-mse, [5]), 3))

[-1.156 -0.091  0.121]
[-0.49]


### Estimates of BPD vs DNAm for females

In [39]:
temp = data_f
temp['bp'] = temp['bpd']

np.random.seed(42)
niter = 250
mse = np.zeros(niter)
bound = np.zeros(niter)

for i in range(niter):
    bs_temp = temp.sample(n = len(temp), replace = True)
    bs_temp['bp'] = affine(bs_temp['bp'])
    bs_temp['dnam'] = affine(bs_temp['dnam'])
    
    # Create and fit the GAM model with cubic splines
    gam = LinearGAM(s(0, spline_order = 1)).fit(bs_temp['bp'], bs_temp['dnam'])

    # Make predictions on the test set
    y_pred = affine(gam.predict(bs_temp['bp'])) # this is my proxy for uncontaminated Y

    # Evaluate the model 
    mse[i] = np.std(gam.deviance_residuals(bs_temp['bp'], bs_temp['dnam']))
    
    # Fisher information
    fisher_information = fisher_inf(y_pred)
    
    # Bound of error variance
    bound[i] = (np.exp(2*estimator(np.column_stack((bs_temp['bp'], bs_temp['dnam'])))[0]) - 1)/fisher_information 
    
print(np.round(np.percentile(bound-mse, [2.5, 50, 97.5]), 3))
print(np.round(np.percentile(bound-mse, [5]), 3))

[-0.893 -0.11   0.104]
[-0.228]


### Estimates of BPD vs DNAm for males

In [40]:
temp = data_m
temp['bp'] = temp['bpd']

np.random.seed(42)
niter = 250
mse = np.zeros(niter)
bound = np.zeros(niter)

for i in range(niter):
    bs_temp = temp.sample(n = len(temp), replace = True)
    bs_temp['bp'] = affine(bs_temp['bp'])
    bs_temp['dnam'] = affine(bs_temp['dnam'])
    
    # Create and fit the GAM model with cubic splines
    gam = LinearGAM(s(0, spline_order = 1)).fit(bs_temp['bp'], bs_temp['dnam'])

    # Make predictions on the test set
    y_pred = affine(gam.predict(bs_temp['bp'])) # this is my proxy for uncontaminated Y

    # Evaluate the model 
    mse[i] = np.std(gam.deviance_residuals(bs_temp['bp'], bs_temp['dnam']))
    
    # Fisher information
    fisher_information = fisher_inf(y_pred)
    
    # Bound of error variance
    bound[i] = (np.exp(2*estimator(np.column_stack((bs_temp['bp'], bs_temp['dnam'])))[0]) - 1)/fisher_information 
    
print(np.round(np.percentile(bound-mse, [2.5, 50, 97.5]), 3))
print(np.round(np.percentile(bound-mse, [5]), 3))

[-1.063 -0.105  0.352]
[-0.178]


### Estimates of BPD vs DNAm for combined

In [41]:
temp = data
temp['bp'] = temp['bpd']

np.random.seed(42)
niter = 250
mse = np.zeros(niter)
bound = np.zeros(niter)

for i in range(niter):
    bs_temp = temp.sample(n = len(temp), replace = True)
    bs_temp['bp'] = affine(bs_temp['bp'])
    bs_temp['dnam'] = affine(bs_temp['dnam'])
    
    # Create and fit the GAM model with cubic splines
    gam = LinearGAM(s(0, spline_order = 1)).fit(bs_temp['bp'], bs_temp['dnam'])

    # Make predictions on the test set
    y_pred = affine(gam.predict(bs_temp['bp'])) # this is my proxy for uncontaminated Y

    # Evaluate the model 
    mse[i] = np.std(gam.deviance_residuals(bs_temp['bp'], bs_temp['dnam']))
    
    # Fisher information
    fisher_information = fisher_inf(y_pred)
    
    # Bound of error variance
    bound[i] = (np.exp(2*estimator(np.column_stack((bs_temp['bp'], bs_temp['dnam'])))[0]) - 1)/fisher_information 
    
print(np.round(np.percentile(bound-mse, [2.5, 50, 97.5]), 3))
print(np.round(np.percentile(bound-mse, [5]), 3))

[-2.279 -0.103  0.261]
[-1.483]


## KCNK3

In [43]:
dnam = pd.read_csv("/home/soumikp/enar_2023/data/dat_KCNK3.csv").iloc[:,1:36].mean(axis=1)
folio = pd.read_csv("/home/soumikp/enar_2023/data/dat_KCNK3.csv").iloc[:,37]
data = pd.concat([folio, dnam], axis=1)
data = pd.merge(data, bps, on = 'foliocc', how='inner').dropna()
data.columns = ['id', 'dnam', 'sex', 'bps', 'bpd']
data_m = data[data['sex'] == 0]
data_f = data[data['sex'] == 1]

### Estimates of BPS vs DNAm for females

In [44]:
temp = data_f
temp['bp'] = temp['bps']

np.random.seed(42)
niter = 250
mse = np.zeros(niter)
bound = np.zeros(niter)

for i in range(niter):
    bs_temp = temp.sample(n = len(temp), replace = True)
    bs_temp['bp'] = affine(bs_temp['bp'])
    bs_temp['dnam'] = affine(bs_temp['dnam'])
    
    # Create and fit the GAM model with cubic splines
    gam = LinearGAM(s(0, spline_order = 1)).fit(bs_temp['bp'], bs_temp['dnam'])

    # Make predictions on the test set
    y_pred = affine(gam.predict(bs_temp['bp'])) # this is my proxy for uncontaminated Y

    # Evaluate the model 
    mse[i] = np.std(gam.deviance_residuals(bs_temp['bp'], bs_temp['dnam']))
    
    # Fisher information
    fisher_information = fisher_inf(y_pred)
    
    # Bound of error variance
    bound[i] = (np.exp(2*estimator(np.column_stack((bs_temp['bp'], bs_temp['dnam'])))[0]) - 1)/fisher_information 
    
print(np.round(np.percentile(bound-mse, [2.5, 50, 97.5]), 3))
print(np.round(np.percentile(bound-mse, [5]), 3))

[-0.195 -0.075  0.094]
[-0.183]


### Estimates of BPS vs DNAm for males

In [45]:
temp = data_f
temp['bp'] = temp['bps']

np.random.seed(42)
niter = 250
mse = np.zeros(niter)
bound = np.zeros(niter)

for i in range(niter):
    bs_temp = temp.sample(n = len(temp), replace = True)
    bs_temp['bp'] = affine(bs_temp['bp'])
    bs_temp['dnam'] = affine(bs_temp['dnam'])
    
    # Create and fit the GAM model with cubic splines
    gam = LinearGAM(s(0, spline_order = 1)).fit(bs_temp['bp'], bs_temp['dnam'])

    # Make predictions on the test set
    y_pred = affine(gam.predict(bs_temp['bp'])) # this is my proxy for uncontaminated Y

    # Evaluate the model 
    mse[i] = np.std(gam.deviance_residuals(bs_temp['bp'], bs_temp['dnam']))
    
    # Fisher information
    fisher_information = fisher_inf(y_pred)
    
    # Bound of error variance
    bound[i] = (np.exp(2*estimator(np.column_stack((bs_temp['bp'], bs_temp['dnam'])))[0]) - 1)/fisher_information 
    
print(np.round(np.percentile(bound-mse, [2.5, 50, 97.5]), 3))
print(np.round(np.percentile(bound-mse, [5]), 3))

[-0.195 -0.075  0.094]
[-0.183]


### Estimates of BPS vs DNAm for combined

In [46]:
temp = data
temp['bp'] = temp['bps']

np.random.seed(42)
niter = 250
mse = np.zeros(niter)
bound = np.zeros(niter)

for i in range(niter):
    bs_temp = temp.sample(n = len(temp), replace = True)
    bs_temp['bp'] = affine(bs_temp['bp'])
    bs_temp['dnam'] = affine(bs_temp['dnam'])
    
    # Create and fit the GAM model with cubic splines
    gam = LinearGAM(s(0, spline_order = 1)).fit(bs_temp['bp'], bs_temp['dnam'])

    # Make predictions on the test set
    y_pred = affine(gam.predict(bs_temp['bp'])) # this is my proxy for uncontaminated Y

    # Evaluate the model 
    mse[i] = np.std(gam.deviance_residuals(bs_temp['bp'], bs_temp['dnam']))
    
    # Fisher information
    fisher_information = fisher_inf(y_pred)
    
    # Bound of error variance
    bound[i] = (np.exp(2*estimator(np.column_stack((bs_temp['bp'], bs_temp['dnam'])))[0]) - 1)/fisher_information 
    
print(np.round(np.percentile(bound-mse, [2.5, 50, 97.5]), 3))
print(np.round(np.percentile(bound-mse, [5]), 3))

[-0.161 -0.044  0.261]
[-0.155]


### Estimates of BPD vs DNAm for females

In [47]:
temp = data_f
temp['bp'] = temp['bpd']

np.random.seed(42)
niter = 250
mse = np.zeros(niter)
bound = np.zeros(niter)

for i in range(niter):
    bs_temp = temp.sample(n = len(temp), replace = True)
    bs_temp['bp'] = affine(bs_temp['bp'])
    bs_temp['dnam'] = affine(bs_temp['dnam'])
    
    # Create and fit the GAM model with cubic splines
    gam = LinearGAM(s(0, spline_order = 1)).fit(bs_temp['bp'], bs_temp['dnam'])

    # Make predictions on the test set
    y_pred = affine(gam.predict(bs_temp['bp'])) # this is my proxy for uncontaminated Y

    # Evaluate the model 
    mse[i] = np.std(gam.deviance_residuals(bs_temp['bp'], bs_temp['dnam']))
    
    # Fisher information
    fisher_information = fisher_inf(y_pred)
    
    # Bound of error variance
    bound[i] = (np.exp(2*estimator(np.column_stack((bs_temp['bp'], bs_temp['dnam'])))[0]) - 1)/fisher_information 
    
print(np.round(np.percentile(bound-mse, [2.5, 50, 97.5]), 3))
print(np.round(np.percentile(bound-mse, [5]), 3))

[-0.353 -0.075  0.924]
[-0.198]


### Estimates of BPD vs DNAm for males

In [48]:
temp = data_m
temp['bp'] = temp['bpd']

np.random.seed(42)
niter = 250
mse = np.zeros(niter)
bound = np.zeros(niter)

for i in range(niter):
    bs_temp = temp.sample(n = len(temp), replace = True)
    bs_temp['bp'] = affine(bs_temp['bp'])
    bs_temp['dnam'] = affine(bs_temp['dnam'])
    
    # Create and fit the GAM model with cubic splines
    gam = LinearGAM(s(0, spline_order = 1)).fit(bs_temp['bp'], bs_temp['dnam'])

    # Make predictions on the test set
    y_pred = affine(gam.predict(bs_temp['bp'])) # this is my proxy for uncontaminated Y

    # Evaluate the model 
    mse[i] = np.std(gam.deviance_residuals(bs_temp['bp'], bs_temp['dnam']))
    
    # Fisher information
    fisher_information = fisher_inf(y_pred)
    
    # Bound of error variance
    bound[i] = (np.exp(2*estimator(np.column_stack((bs_temp['bp'], bs_temp['dnam'])))[0]) - 1)/fisher_information 
    
print(np.round(np.percentile(bound-mse, [2.5, 50, 97.5]), 3))
print(np.round(np.percentile(bound-mse, [5]), 3))

[-0.186 -0.127 -0.004]
[-0.177]


### Estimates of BPD vs DNAm for combined

In [49]:
temp = data
temp['bp'] = temp['bpd']

np.random.seed(42)
niter = 250
mse = np.zeros(niter)
bound = np.zeros(niter)

for i in range(niter):
    bs_temp = temp.sample(n = len(temp), replace = True)
    bs_temp['bp'] = affine(bs_temp['bp'])
    bs_temp['dnam'] = affine(bs_temp['dnam'])
    
    # Create and fit the GAM model with cubic splines
    gam = LinearGAM(s(0, spline_order = 1)).fit(bs_temp['bp'], bs_temp['dnam'])

    # Make predictions on the test set
    y_pred = affine(gam.predict(bs_temp['bp'])) # this is my proxy for uncontaminated Y

    # Evaluate the model 
    mse[i] = np.std(gam.deviance_residuals(bs_temp['bp'], bs_temp['dnam']))
    
    # Fisher information
    fisher_information = fisher_inf(y_pred)
    
    # Bound of error variance
    bound[i] = (np.exp(2*estimator(np.column_stack((bs_temp['bp'], bs_temp['dnam'])))[0]) - 1)/fisher_information 
    
print(np.round(np.percentile(bound-mse, [2.5, 50, 97.5]), 3))
print(np.round(np.percentile(bound-mse, [5]), 3))

[-1.287 -0.073  0.625]
[-0.176]


## PRDM8

In [51]:
dnam = pd.read_csv("/home/soumikp/enar_2023/data/dat_PRDM8.csv").iloc[:, 1:52].mean(axis=1)
folio = pd.read_csv("/home/soumikp/enar_2023/data/dat_PRDM8.csv").iloc[:,53]
data = pd.concat([folio, dnam], axis=1)
data = pd.merge(data, bps, on = 'foliocc', how='inner').dropna()
data.columns = ['id', 'dnam', 'sex', 'bps', 'bpd']
data_m = data[data['sex'] == 0]
data_f = data[data['sex'] == 1]

### Estimates of BPS vs DNAm for females

In [52]:
temp = data_f
temp['bp'] = temp['bps']

np.random.seed(42)
niter = 250
mse = np.zeros(niter)
bound = np.zeros(niter)

for i in range(niter):
    bs_temp = temp.sample(n = len(temp), replace = True)
    bs_temp['bp'] = affine(bs_temp['bp'])
    bs_temp['dnam'] = affine(bs_temp['dnam'])
    
    # Create and fit the GAM model with cubic splines
    gam = LinearGAM(s(0, spline_order = 1)).fit(bs_temp['bp'], bs_temp['dnam'])

    # Make predictions on the test set
    y_pred = affine(gam.predict(bs_temp['bp'])) # this is my proxy for uncontaminated Y

    # Evaluate the model 
    mse[i] = np.std(gam.deviance_residuals(bs_temp['bp'], bs_temp['dnam']))
    
    # Fisher information
    fisher_information = fisher_inf(y_pred)
    
    # Bound of error variance
    bound[i] = (np.exp(2*estimator(np.column_stack((bs_temp['bp'], bs_temp['dnam'])))[0]) - 1)/fisher_information 
    
print(np.round(np.percentile(bound-mse, [2.5, 50, 97.5]), 3))
print(np.round(np.percentile(bound-mse, [5]), 3))

[-0.75  -0.176  0.305]
[-0.347]


### Estimates of BPS vs DNAm for males

In [53]:
temp = data_m
temp['bp'] = temp['bps']

np.random.seed(42)
niter = 250
mse = np.zeros(niter)
bound = np.zeros(niter)

for i in range(niter):
    bs_temp = temp.sample(n = len(temp), replace = True)
    bs_temp['bp'] = affine(bs_temp['bp'])
    bs_temp['dnam'] = affine(bs_temp['dnam'])
    
    # Create and fit the GAM model with cubic splines
    gam = LinearGAM(s(0, spline_order = 1)).fit(bs_temp['bp'], bs_temp['dnam'])

    # Make predictions on the test set
    y_pred = affine(gam.predict(bs_temp['bp'])) # this is my proxy for uncontaminated Y

    # Evaluate the model 
    mse[i] = np.std(gam.deviance_residuals(bs_temp['bp'], bs_temp['dnam']))
    
    # Fisher information
    fisher_information = fisher_inf(y_pred)
    
    # Bound of error variance
    bound[i] = (np.exp(2*estimator(np.column_stack((bs_temp['bp'], bs_temp['dnam'])))[0]) - 1)/fisher_information 
    
print(np.round(np.percentile(bound-mse, [2.5, 50, 97.5]), 3))
print(np.round(np.percentile(bound-mse, [5]), 3))

[-0.214 -0.168 -0.091]
[-0.2]


### Estimates of BPS vs DNAm for combined

In [54]:
temp = data
temp['bp'] = temp['bps']

np.random.seed(42)
niter = 250
mse = np.zeros(niter)
bound = np.zeros(niter)

for i in range(niter):
    bs_temp = temp.sample(n = len(temp), replace = True)
    bs_temp['bp'] = affine(bs_temp['bp'])
    bs_temp['dnam'] = affine(bs_temp['dnam'])
    
    # Create and fit the GAM model with cubic splines
    gam = LinearGAM(s(0, spline_order = 1)).fit(bs_temp['bp'], bs_temp['dnam'])

    # Make predictions on the test set
    y_pred = affine(gam.predict(bs_temp['bp'])) # this is my proxy for uncontaminated Y

    # Evaluate the model 
    mse[i] = np.std(gam.deviance_residuals(bs_temp['bp'], bs_temp['dnam']))
    
    # Fisher information
    fisher_information = fisher_inf(y_pred)
    
    # Bound of error variance
    bound[i] = (np.exp(2*estimator(np.column_stack((bs_temp['bp'], bs_temp['dnam'])))[0]) - 1)/fisher_information 
    
print(np.round(np.percentile(bound-mse, [2.5, 50, 97.5]), 3))
print(np.round(np.percentile(bound-mse, [5]), 3))

[-0.253 -0.173 -0.118]
[-0.22]


### Estimates of BPD vs DNAm for females

In [55]:
temp = data_f
temp['bp'] = temp['bpd']

np.random.seed(42)
niter = 250
mse = np.zeros(niter)
bound = np.zeros(niter)

for i in range(niter):
    bs_temp = temp.sample(n = len(temp), replace = True)
    bs_temp['bp'] = affine(bs_temp['bp'])
    bs_temp['dnam'] = affine(bs_temp['dnam'])
    
    # Create and fit the GAM model with cubic splines
    gam = LinearGAM(s(0, spline_order = 1)).fit(bs_temp['bp'], bs_temp['dnam'])

    # Make predictions on the test set
    y_pred = affine(gam.predict(bs_temp['bp'])) # this is my proxy for uncontaminated Y

    # Evaluate the model 
    mse[i] = np.std(gam.deviance_residuals(bs_temp['bp'], bs_temp['dnam']))
    
    # Fisher information
    fisher_information = fisher_inf(y_pred)
    
    # Bound of error variance
    bound[i] = (np.exp(2*estimator(np.column_stack((bs_temp['bp'], bs_temp['dnam'])))[0]) - 1)/fisher_information 
    
print(np.round(np.percentile(bound-mse, [2.5, 50, 97.5]), 3))
print(np.round(np.percentile(bound-mse, [5]), 3))

[-0.285 -0.178  0.361]
[-0.241]


### Estimates of BPD vs DNAm for males

In [56]:
temp = data_m
temp['bp'] = temp['bpd']

np.random.seed(42)
niter = 250
mse = np.zeros(niter)
bound = np.zeros(niter)

for i in range(niter):
    bs_temp = temp.sample(n = len(temp), replace = True)
    bs_temp['bp'] = affine(bs_temp['bp'])
    bs_temp['dnam'] = affine(bs_temp['dnam'])
    
    # Create and fit the GAM model with cubic splines
    gam = LinearGAM(s(0, spline_order = 1)).fit(bs_temp['bp'], bs_temp['dnam'])

    # Make predictions on the test set
    y_pred = affine(gam.predict(bs_temp['bp'])) # this is my proxy for uncontaminated Y

    # Evaluate the model 
    mse[i] = np.std(gam.deviance_residuals(bs_temp['bp'], bs_temp['dnam']))
    
    # Fisher information
    fisher_information = fisher_inf(y_pred)
    
    # Bound of error variance
    bound[i] = (np.exp(2*estimator(np.column_stack((bs_temp['bp'], bs_temp['dnam'])))[0]) - 1)/fisher_information 
    
print(np.round(np.percentile(bound-mse, [2.5, 50, 97.5]), 3))
print(np.round(np.percentile(bound-mse, [5]), 3))

[-0.24  -0.181 -0.024]
[-0.225]


### Estimates of BPD vs DNAm for combined

In [57]:
temp = data
temp['bp'] = temp['bpd']

np.random.seed(42)
niter = 250
mse = np.zeros(niter)
bound = np.zeros(niter)

for i in range(niter):
    bs_temp = temp.sample(n = len(temp), replace = True)
    bs_temp['bp'] = affine(bs_temp['bp'])
    bs_temp['dnam'] = affine(bs_temp['dnam'])
    
    # Create and fit the GAM model with cubic splines
    gam = LinearGAM(s(0, spline_order = 1)).fit(bs_temp['bp'], bs_temp['dnam'])

    # Make predictions on the test set
    y_pred = affine(gam.predict(bs_temp['bp'])) # this is my proxy for uncontaminated Y

    # Evaluate the model 
    mse[i] = np.std(gam.deviance_residuals(bs_temp['bp'], bs_temp['dnam']))
    
    # Fisher information
    fisher_information = fisher_inf(y_pred)
    
    # Bound of error variance
    bound[i] = (np.exp(2*estimator(np.column_stack((bs_temp['bp'], bs_temp['dnam'])))[0]) - 1)/fisher_information 
    
print(np.round(np.percentile(bound-mse, [2.5, 50, 97.5]), 3))
print(np.round(np.percentile(bound-mse, [5]), 3))

[-0.364 -0.181  0.028]
[-0.254]
