### Exploring CpGs for heteroskedasticity and differential methylation with passage
This code is an illustrative example. The full analysis was run with the Python script 02_heteroskedasicity_CpGs.py, which takes command-line arguments specifying which of the 1,000 subsampling iterations to run.

In [1]:
import numpy as np
import statsmodels
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
import sys
import statistics 

In [2]:
# Load the methylation beta matrix. Later cells select its columns by the
# metadata's array.id values and iterate over its rows as CpGs.
beta = pd.read_csv('data/beta_organoids.csv')

In [3]:
# Load the sample metadata (includes the array.id and passage columns used below).
meta = pd.read_csv('data/meta_organoids.csv')

In [4]:
# Prepare a numeric passage column for linear modelling.
# The dotted column names are renamed so they can be used directly in
# statsmodels formulas (e.g. 'beta ~ passage').
meta.rename(columns={"passage.or.rescope.no": "passage", "sample.type": "sampletype"}, inplace=True)

# Strip the "P" prefix and the "RE1." prefix so only the number remains.
# regex=False makes both patterns literal: the default of `regex` differs
# between pandas versions, and as a regex the "." in "RE1." would match any
# character rather than a literal dot.
# NOTE(review): this cell is not re-runnable once 'passage' is numeric
# (the .str accessor then raises); re-load `meta` before re-running it.
meta['passage'] = (
    meta['passage']
    .str.replace('P', '', regex=False)
    .str.replace('RE1.', '', regex=False)
)
meta['passage'] = pd.to_numeric(meta['passage'])

(165, 35)

In [5]:
# Quick look at the metadata after the passage clean-up.
# NOTE(review): a notebook cell only renders its last bare expression, so as
# written only meta.shape is displayed; the head(3) result is discarded.
meta.head(3)
meta.shape

Unnamed: 0.1,Unnamed: 0,case.no,array.id,age.group,diagnosis,sample.site,sampletype,sampling.time.point,passage,fraction,...,other.id,genotype,notes,long,duplicated,sample_ID,sentrix_ID,array.id.path,det_pval,passage.or.rescope.no_numeric
0,207,212,201172580052_R02C01,paediatric,Control,SC,organoid,original,6,,...,,,,212.SC.organoid.NA.P6.NA.N.N.N,Y,212_SC,201172580052,../../data/raw/all_methylation_share/ALL_IDATS...,0.000215,6
1,210,212,201172580051_R03C01,paediatric,Control,TI,organoid,original,6,,...,,,,212.TI.organoid.NA.P6.NA.N.N.N,Y,212_TI,201172580051,../../data/raw/all_methylation_share/ALL_IDATS...,0.000207,6
2,218,223,201172560046_R01C01,paediatric,Control,SC,organoid,original,2,,...,,,,223.SC.organoid.NA.P2.NA.N.N.N,Y,223_SC,201172560046,../../data/raw/all_methylation_share/ALL_IDATS...,0.000405,2


In [48]:
import random

# In the py script the 1,000 subsamplings were fed in as command-line arguments:
#   permstart = int(sys.argv[1])
#   permend = int(sys.argv[1]) + 10
#   CpGnum = beta.shape[0]

# A minimal example is shown here for illustration, with only 20 CpGs instead
# of the 800,383 CpGs in the full dataset.
permstart = 1
permend = 5
CpGnum = 20

# Each list receives one entry per iteration; every entry holds one value per CpG.
pval_all_BP = []     # Breusch-Pagan (heteroskedasticity) p-values
pval_all_diff = []   # p-values of the passage slope (differential methylation)
db_all_diff = []     # delta beta between passage 1 and passage 16
fdr_all_BP = []      # Benjamini-Hochberg adjusted Breusch-Pagan p-values
fdr_all_diff = []    # Benjamini-Hochberg adjusted passage p-values

for n in range(permstart, permend):

    # random.seed() only seeds Python's built-in generator, whereas
    # DataFrame.sample draws from NumPy. A RandomState seeded with the
    # iteration number is passed explicitly so each subsample is reproducible.
    random.seed(n)
    rng = np.random.RandomState(n)

    ## Sample the cohort in the lower passage number samples.
    # Keep every sample above passage 4 and pull 5 random samples from each
    # passage group at or below 4.
    meta_sampled_high_passage = meta[meta['passage'] > 4]
    meta_sampled = meta[meta['passage'] <= 4]

    low_passage_draws = [
        group.sample(5, random_state=rng)
        for _name, group in meta_sampled.groupby('passage')
    ]
    meta_sampled_subset = pd.concat([pd.concat(low_passage_draws), meta_sampled_high_passage])

    ## collect a p value for each CpG

    # Beta columns are taken in the same order as the sampled metadata rows so
    # that each row of beta_sampled lines up with meta_sampled_subset.
    beta_sampled = beta[meta_sampled_subset['array.id'].values.tolist()]

    # Explicit intercept column for the Breusch-Pagan exog matrix; it does not
    # change per CpG, so it is set once per iteration.
    meta_sampled_subset['constant'] = 1

    CpG_pval_passage_subset = []
    CpG_pval_BP_subset = []
    CpG_db_passage_subset = []

    for cpg in range(0, CpGnum):  # beta_sampled.shape[0] for the full dataset

        # Use every sampled column rather than a hard-coded 0:45 slice, which
        # only worked when the subset happened to contain exactly 45 samples.
        meta_sampled_subset['beta'] = beta_sampled.iloc[cpg].to_numpy()

        reg = smf.ols('beta ~ passage', data=meta_sampled_subset).fit()

        # Look the passage term up by label; integer keys on a label-indexed
        # Series are deprecated in pandas.
        pval_passage = reg.pvalues['passage']
        slope = reg.params['passage']
        # Fitted beta at passage 1 minus fitted beta at passage 16
        # (the intercept cancels).
        db = (slope * 1) - (slope * 16)

        # Observed minus fitted values of the regression above.
        residual = reg.resid

        # BP heteroskedasticity test
        _lm_stat, pval_BP, _f_stat, _f_pval = statsmodels.stats.diagnostic.het_breuschpagan(
            residual, meta_sampled_subset[['passage', 'constant']]
        )
        # studentized or not (p vs f) values do match the ones from bptest in R

        CpG_pval_BP_subset.append(pval_BP)
        CpG_pval_passage_subset.append(pval_passage)
        CpG_db_passage_subset.append(db)

    pval_all_BP.append(CpG_pval_BP_subset)
    pval_all_diff.append(CpG_pval_passage_subset)
    db_all_diff.append(CpG_db_passage_subset)
    # multipletests returns (reject, pvals_corrected, ...); index 1 is the adjusted p-values.
    fdr_all_BP.append(statsmodels.stats.multitest.multipletests(CpG_pval_BP_subset, method='fdr_bh', is_sorted=False, returnsorted=False)[1])
    fdr_all_diff.append(statsmodels.stats.multitest.multipletests(CpG_pval_passage_subset, method='fdr_bh', is_sorted=False, returnsorted=False)[1])

In [49]:
# Wrap the per-iteration result lists in DataFrames
# (one row per iteration, one column per CpG).
pval_BP_df, pval_diff_df = (
    pd.DataFrame(rows) for rows in (pval_all_BP, pval_all_diff)
)
# These three names are rebound from lists to DataFrames.
db_all_diff, fdr_all_BP, fdr_all_diff = (
    pd.DataFrame(rows) for rows in (db_all_diff, fdr_all_BP, fdr_all_diff)
)

In [50]:
# Show the raw passage p-values followed by their FDR-adjusted counterparts.
print(pval_diff_df, fdr_all_diff, sep="\n")

          0         1         2         3         4         5         6  \
0  0.071683  0.660465  0.011904  0.498074  0.824093  0.718252  0.027988   
1  0.192251  0.740886  0.039192  0.287355  0.668322  0.649799  0.003585   
2  0.064221  0.808477  0.102761  0.213139  0.951329  0.852870  0.057313   
3  0.680613  0.667291  0.011574  0.157319  0.542423  0.988244  0.039429   

          7         8         9        10        11        12        13  \
0  0.440442  0.306106  0.817992  0.187156  0.626443  0.415082  0.584132   
1  0.123520  0.992689  0.818384  0.473950  0.842505  0.439636  0.886696   
2  0.376500  0.288973  0.974044  0.238570  0.510890  0.439009  0.366386   
3  0.104696  0.856238  0.505676  0.027591  0.724031  0.577375  0.657133   

         14        15        16        17        18        19  
0  0.189016  0.267136  0.130780  0.758333  0.655734  0.831904  
1  0.490929  0.720198  0.413973  0.990871  0.522369  0.786505  
2  0.288180  0.795216  0.147489  0.874648  0.772715  0.2

In [None]:
# Per-CpG summaries across iterations. Each list has one entry per CpG
# (CpGnum entries; beta_sampled.shape[0] for the full dataset).
cpg_columns = range(0, CpGnum)

# Number of iterations with a nominal Breusch-Pagan p-value below 0.05.
sig_BP = [sum(pval_BP_df.iloc[:, col] < 0.05) for col in cpg_columns]

# Number of iterations with a nominal passage p-value below 0.05.
sig_diff = [sum(pval_diff_df.iloc[:, col] < 0.05) for col in cpg_columns]

# Mean delta beta over the iterations.
mn_db = [statistics.mean(db_all_diff.iloc[:, col]) for col in cpg_columns]

# Number of iterations with an FDR-adjusted Breusch-Pagan p-value below 0.05.
sig_BP_fdr = [sum(fdr_all_BP.iloc[:, col] < 0.05) for col in cpg_columns]

# Number of iterations with an FDR-adjusted passage p-value below 0.05.
sig_diff_fdr = [sum(fdr_all_diff.iloc[:, col] < 0.05) for col in cpg_columns]

In [None]:
# Display the Breusch-Pagan p-values (one row per iteration, one column per CpG).
pval_BP_df

In [194]:
# Assemble the per-CpG summary (one column per CpG). Row order:
#   0: iterations with Breusch-Pagan p < 0.05
#   1: iterations with passage p < 0.05
#   2: mean delta beta across iterations
#   3: iterations with Breusch-Pagan FDR < 0.05
#   4: iterations with passage FDR < 0.05
# A dedicated name is used so the p-value frame held in pval_BP_df is not
# overwritten, which keeps the summary cells above re-runnable.
summary_df = pd.DataFrame([sig_BP, sig_diff, mn_db, sig_BP_fdr, sig_diff_fdr])

# Inside a Jupyter kernel sys.argv holds the kernel launcher's own arguments
# (sys.argv[2] is the connection-file path), not the iteration argument the
# py script received, so the output is labelled with the first iteration
# number of this run instead.
run_label = str(permstart)

# to_csv's second positional parameter is `sep`, so the former "w" argument
# was used as the field delimiter rather than as a file mode. The default
# mode already overwrites an existing file.
summary_df.to_csv("data/passage_CpG_iterations/Heteroskedactiy_pvalues" + run_label + ".csv")