### Concatenates all GATK counts into a single dataframe/table. It also computes the following stats for each file individually:

    1. Reference Bias (refBias)
    2. Binomial Test (binomTest)
    3. FDR adjustment (fdr)

In [1]:
## doing necessary imports
import os
from glob import glob

import numpy as np
import pandas as pd
from pandarallel import pandarallel
from scipy import stats
from statsmodels.stats.multitest import fdrcorrection

In [2]:
# Start pandarallel with progress bars switched off; `parallel_apply` is used
# on the per-file count tables further down in this notebook.
pandarallel.initialize(progress_bar=False)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


##### Utility functions

In [3]:
## utility functions
def compute_refbias(x):
    """
    Reference bias of a single row.

    x: mapping-like row exposing 'refCount' and 'totalCount'.
    Returns refCount / totalCount, or NaN when totalCount is 0
    (the ratio is undefined without any counts).
    """
    total = x['totalCount']
    return x['refCount'] / total if total != 0 else np.nan

In [4]:
def computes_binom_test(x):
    """
    Computes a two-sided binomial test of refCount out of totalCount against
    p=0.5 and returns the p value.

    x: mapping-like row exposing 'refCount' and 'totalCount'.
    Returns the p value, or NaN when totalCount is 0 (nothing to test).
    """
    total = x['totalCount']
    if total == 0:
        return np.nan
    ref = x['refCount']
    # `stats.binom_test` was deprecated and later removed from SciPy; its
    # replacement `stats.binomtest` returns a result object carrying `.pvalue`.
    if hasattr(stats, 'binomtest'):
        # binomtest only accepts true integers, so counts that arrive as
        # floats (e.g. from a float-typed row) are converted first.
        result = stats.binomtest(int(ref), n=int(total), p=0.5, alternative='two-sided')
        return result.pvalue
    # Older SciPy without `binomtest`: keep the legacy call.
    return stats.binom_test(ref, n=total, p=0.5, alternative='two-sided')

In [5]:
def compute_fdr(x):
    """
    FDR-adjusts a collection of p values.

    x: sequence of p values, passed to `fdrcorrection` with method='indep'.
    Returns the second element of the `fdrcorrection` result
    (the adjusted p values).
    """
    correction = fdrcorrection(x, method='indep')
    adjusted = correction[1]
    return adjusted

##### Run concat files

In [8]:
### declare defaults
# timepoint labels ZT0, ZT2, ..., ZT22
timepoints = [f"ZT{step}" for step in range(0, 24, 2)]
# glob pattern (not a plain directory) selecting the GATK count files
gatk_dir = "../pipeline_ios/asecounter/*.txt"
# folder the concatenated table is written to
out_dir = "./data_files/concatenated_data"

In [9]:
import warnings
# Silence every warning category for all cells executed after this one.
# NOTE(review): this also hides deprecation notices from pandas/scipy.
warnings.simplefilter("ignore")

In [10]:
req_cols = ['tissue', 'timepoint', 'loci', 'refCount', 'altCount', 'totalCount', 'refBias', 'binomTest', 'fdr']
count_suffix = ".count.txt"

## the glob result is the same for every timepoint, so list the files once
all_fnames = glob(gatk_dir)

## per-file tables are collected here and concatenated once after the loop
## (DataFrame.append is gone from pandas 2.x and re-copies all rows on every call)
frames = []
#loop over all ZTs in timepoints
for zt in timepoints:
    print(f"Running {zt}")
    fnames = []
    for fname in all_fnames:
        # second "_"-separated token of the file name carries the timepoint
        zt_token = fname.split("/")[-1].split("_")[1]
        # remove the exact suffix; str.strip(".count.txt") would strip any of
        # those *characters* from both ends rather than the suffix itself
        if zt_token.endswith(count_suffix):
            zt_token = zt_token[:-len(count_suffix)]
        if zt_token == zt:
            fnames.append(fname)
    print(f"{zt} contains {len(fnames)} files")
    ## loop over tissues of a timepoint
    for fname in fnames:
        # first "_"-separated token of the file name is used as the tissue label
        tissue = fname.split("/")[-1].split("_")[0]

        df = pd.read_csv(fname, sep="\t")
        df['timepoint'] = zt
        df['tissue'] = tissue

        df['loci'] = df['contig'] + ":" + df['position'].astype(str)
        df['refBias'] = df.parallel_apply(compute_refbias, axis=1) #computes reference bias
        df['binomTest'] = df.parallel_apply(computes_binom_test, axis=1) #computes binomial test

        # FDR is computed only over rows that have a p value and written back
        # by row position, so rows sharing a locus cannot be multiplied the
        # way a merge on 'loci' would; untested rows keep NaN.
        tested = df['binomTest'].notna()
        df['fdr'] = np.nan
        if tested.any():
            df.loc[tested, 'fdr'] = compute_fdr(df.loc[tested, 'binomTest'].to_list()) #computes FDR

        frames.append(df[req_cols])

## single concatenation; an empty table with the expected columns if no file matched
df_concat = pd.concat(frames) if frames else pd.DataFrame(columns=req_cols)

Running ZT0
ZT0 contains 10 files
Running ZT2
ZT2 contains 11 files
Running ZT4
ZT4 contains 11 files
Running ZT6
ZT6 contains 10 files
Running ZT8
ZT8 contains 11 files
Running ZT10
ZT10 contains 9 files
Running ZT12
ZT12 contains 11 files
Running ZT14
ZT14 contains 11 files
Running ZT16
ZT16 contains 11 files
Running ZT18
ZT18 contains 11 files
Running ZT20
ZT20 contains 11 files
Running ZT22
ZT22 contains 11 files


In [12]:
# Create the output folder if it is missing, so the write cannot fail on a
# non-existent directory after the counts have been processed.
os.makedirs(out_dir, exist_ok=True)
df_concat.to_parquet(f"{out_dir}/gatk_all_samples_concatenated.parquet", index=None)