In [1]:
# Root directory of the shared constraint-tools data. Every input path read and
# every output path written by the cells of this notebook is built from it.
CONSTRAINT_TOOLS_DATA = '/scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools'

In [2]:
import pandas as pd 

def get_chen_predictions():
    """Load the Chen et al. (2022) per-window constraint scores.

    Reads the tab-separated file `Supplementary_Data_2.bed` found under
    `CONSTRAINT_TOOLS_DATA` and renames two of its columns so that they line
    up with the naming used by the other tables in this notebook:

    * `chrom`        -> `chromosome`
    * `constraint_Z` -> `chen_zscore`

    Returns:
        pandas.DataFrame: one row per window; as displayed in this notebook
        the columns are `chromosome`, `start`, `end` and `chen_zscore`.
    """
    column_renames = {
        'chrom': 'chromosome',
        'constraint_Z': 'chen_zscore',
    }
    bed_path = f'{CONSTRAINT_TOOLS_DATA}/chen-et-al-2022/Supplementary_Datasets/Supplementary_Data_2.bed'
    return pd.read_csv(bed_path, sep='\t').rename(columns=column_renames)

# Preview the renamed Chen et al. table (the rendered output below shows only
# the first and last few rows).
get_chen_predictions()

Unnamed: 0,chromosome,start,end,chen_zscore
0,chr1,1432000,1433000,2.384293
1,chr1,1435000,1436000,-2.952197
2,chr1,1449000,1450000,-0.899126
3,chr1,1450000,1451000,-5.461776
4,chr1,1451000,1452000,-1.497436
...,...,...,...,...
1797148,chr9,137653000,137654000,-1.658773
1797149,chr9,137655000,137656000,-2.662311
1797150,chr9,137656000,137657000,-2.986546
1797151,chr9,137657000,137658000,-0.890788


In [3]:
def get_observations_and_mchale_predictions_on_chen_windows(kmer_size):
    """Load the genome-wide predictions computed on the Chen windows.

    Reads the tab-separated file
    `predict-germline-grch38.chen-windows.kmerSize-{kmer_size}.bed` from the
    `genome-wide-predictions` directory under `CONSTRAINT_TOOLS_DATA`.

    The zscores in that file were computed using
    dist/model-germline-grch38.kmerSize-X.json

    Args:
        kmer_size: value substituted into the file name to select the file
            for one k-mer size (this notebook uses 3, 5 and 7).

    Returns:
        pandas.DataFrame: the file contents, unmodified. As displayed in this
        notebook the columns are `chromosome`, `start`, `end`, `position`,
        `N_bar`, `N_observed`, `K_bar`, `K_observed` and `M`.
    """
    predictions_path = f'{CONSTRAINT_TOOLS_DATA}/genome-wide-predictions/predict-germline-grch38.chen-windows.kmerSize-{kmer_size}.bed'
    return pd.read_csv(predictions_path, sep='\t')

# Preview the kmer_size=3 table (the rendered output below shows only the
# first and last few rows).
get_observations_and_mchale_predictions_on_chen_windows(kmer_size=3)

Unnamed: 0,chromosome,start,end,position,N_bar,N_observed,K_bar,K_observed,M
0,chr1,1432000,1433000,1432500,-0.254203,260,-2.016241612762811,110,257
1,chr1,1435000,1436000,1435500,-7.711445,275,1.0837126170320344,149,272
2,chr1,1449000,1450000,1449500,0.327887,225,-0.43437415492135656,113,223
3,chr1,1450000,1451000,1450500,2.185323,313,-3.3146359070350946,109,308
4,chr1,1451000,1452000,1451500,2.376220,293,-1.460945287275204,129,293
...,...,...,...,...,...,...,...,...,...
1796995,chr22,50224000,50225000,50224500,0.470386,254,0.7011477917731173,137,254
1796996,chr22,50227000,50228000,50227500,1.212395,283,-1.602653587803972,122,283
1796997,chr22,50228000,50229000,50228500,-0.310536,241,0.15471283440311814,121,229
1796998,chr22,50229000,50230000,50229500,-1.304697,231,0.06987977615349292,121,231


In [4]:
# N_bar and K_bar have been predicted on most, but not all, of the Chen windows;
# cf. /scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools/genome-wide-predictions/predict-germline-grch38.chen-windows.log
# The two row counts printed below make the size of that gap visible.
# NOTE: each print re-reads its file from disk just to count the rows.
print(f'Number of Chen windows: {len(get_chen_predictions())}')
print(f'Number of Chen windows on which N_bar and K_bar were predicted: {len(get_observations_and_mchale_predictions_on_chen_windows(kmer_size=3))}')

Number of Chen windows: 1797153
Number of Chen windows on which N_bar and K_bar were predicted: 1797000


In [5]:
def get_mchale_predictions_on_chen_windows(kmer_size):
    """Return only the predicted z-scores for one k-mer size.

    Loads the table for `kmer_size` via
    `get_observations_and_mchale_predictions_on_chen_windows`, keeps the
    window coordinates plus `N_bar` and `K_bar`, and suffixes the two score
    columns with the k-mer size so that tables for different k-mer sizes can
    be merged side by side without column-name clashes.

    Args:
        kmer_size: k-mer size whose predictions are loaded; also used as the
            column suffix.

    Returns:
        pandas.DataFrame: columns `chromosome`, `start`, `end`,
        `N_bar_{kmer_size}` and `K_bar_{kmer_size}`.
    """
    window_keys = ['chromosome', 'start', 'end']
    suffixed_names = {
        'N_bar': f'N_bar_{kmer_size}',
        'K_bar': f'K_bar_{kmer_size}',
    }
    full_table = get_observations_and_mchale_predictions_on_chen_windows(kmer_size)
    # The rename map's keys are exactly the score columns to keep, in order.
    selected = full_table[[*window_keys, *suffixed_names]]
    return selected.rename(columns=suffixed_names)

def get_observations_on_chen_windows(kmer_size):
    """Return only the observed quantities for each Chen window.

    Loads the table for `kmer_size` via
    `get_observations_and_mchale_predictions_on_chen_windows` and keeps the
    window coordinates together with the `N_observed`, `K_observed` and `M`
    columns, dropping the predicted scores.

    Args:
        kmer_size: k-mer size selecting which predictions file is read.

    Returns:
        pandas.DataFrame: columns `chromosome`, `start`, `end`,
        `N_observed`, `K_observed` and `M`.
    """
    window_keys = ['chromosome', 'start', 'end']
    observed_columns = ['N_observed', 'K_observed', 'M']
    full_table = get_observations_and_mchale_predictions_on_chen_windows(kmer_size)
    return full_table[window_keys + observed_columns]

In [6]:
from functools import reduce

# Every table is keyed by its window coordinates; an inner join keeps only the
# windows that are present in all of the tables.
on = ['chromosome', 'start', 'end']
how = 'inner'

# Tables to combine, left to right: the observations, then the predicted
# scores for each k-mer size, then the Chen et al. scores.
dfs = [
    get_observations_on_chen_windows(kmer_size=3),
    *(
        get_mchale_predictions_on_chen_windows(kmer_size=kmer_size)
        for kmer_size in (3, 5, 7)
    ),
    get_chen_predictions(),
]

# Fold the list into a single wide table, merging one table at a time.
chen_and_mchale_predictions = reduce(
    lambda merged, frame: merged.merge(frame, on=on, how=how),
    dfs
)

chen_and_mchale_predictions

Unnamed: 0,chromosome,start,end,N_observed,K_observed,M,N_bar_3,K_bar_3,N_bar_5,K_bar_5,N_bar_7,K_bar_7,chen_zscore
0,chr1,1432000,1433000,260,110,257,-0.254203,-2.016241612762811,-0.316478,-2.016241612762811,-0.325217,-2.016241612762811,2.384293
1,chr1,1435000,1436000,275,149,272,-7.711445,1.0837126170320344,-7.032362,1.0837126170320344,-6.425345,1.0837126170320344,-2.952197
2,chr1,1449000,1450000,225,113,223,0.327887,-0.43437415492135656,0.404163,-0.43437415492135656,0.259578,-0.43437415492135656,-0.899126
3,chr1,1450000,1451000,313,109,308,2.185323,-3.3146359070350946,2.512368,-3.3146359070350946,2.587098,-3.3146359070350946,-5.461776
4,chr1,1451000,1452000,293,129,293,2.376220,-1.460945287275204,2.767957,-1.460945287275204,3.021663,-1.460945287275204,-1.497436
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1796995,chr22,50224000,50225000,254,137,254,0.470386,0.7011477917731173,0.530959,0.7011477917731173,0.584696,0.7011477917731173,-5.828498
1796996,chr22,50227000,50228000,283,122,283,1.212395,-1.602653587803972,1.351284,-1.6014191831134763,1.384147,-1.6014191831134763,-2.528842
1796997,chr22,50228000,50229000,241,121,229,-0.310536,0.15471283440311814,-0.316938,0.15471283440311814,-0.267572,0.15471283440311814,1.302929
1796998,chr22,50229000,50230000,231,121,231,-1.304697,0.06987977615349292,-1.230692,0.06987977615349292,-1.203488,0.06987977615349292,-3.331737


In [7]:
# Persist the merged table as a tab-separated file under CONSTRAINT_TOOLS_DATA;
# index=False keeps the DataFrame's row index out of the written file.
# NOTE(review): the output directory is presumably expected to exist already — confirm.
chen_and_mchale_predictions.to_csv(
    f'{CONSTRAINT_TOOLS_DATA}/benchmark-genome-wide-predictions/chen-et-al-2022/chen-mchale.multiple-kmers.bed', 
    sep = '\t',
    index = False
)