In [1]:
import os

# Root directory of the shared constraint-tools data; the input and output
# paths used in this notebook are built relative to it.
# Export the CONSTRAINT_TOOLS_DATA environment variable to run the notebook
# against a different copy of the data. When the variable is unset or empty,
# the original cluster location is used, so existing runs are unaffected.
CONSTRAINT_TOOLS_DATA = (
    os.environ.get('CONSTRAINT_TOOLS_DATA')
    or '/scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools'
)

In [2]:
import pandas as pd 

def get_chen_predictions():
    """Load the Chen et al. (2022) per-window constraint scores.

    Reads the tab-separated file ``Supplementary_Data_2.bed`` located under
    ``CONSTRAINT_TOOLS_DATA`` and renames two of its columns so that they
    line up with the other tables in this notebook:

    * ``chrom`` -> ``chromosome``
    * ``constraint_Z`` -> ``chen_zscore``

    Returns:
        pandas.DataFrame: one row per window, with the renamed columns
        alongside the file's remaining columns (``start`` and ``end`` in the
        output displayed below).
    """
    chen_bed_path = f'{CONSTRAINT_TOOLS_DATA}/chen-et-al-2022/Supplementary_Datasets/Supplementary_Data_2.bed'
    column_renames = {'chrom': 'chromosome', 'constraint_Z': 'chen_zscore'}
    return pd.read_csv(chen_bed_path, sep='\t').rename(columns=column_renames)


get_chen_predictions()

Unnamed: 0,chromosome,start,end,chen_zscore
0,chr1,1432000,1433000,2.384293
1,chr1,1435000,1436000,-2.952197
2,chr1,1449000,1450000,-0.899126
3,chr1,1450000,1451000,-5.461776
4,chr1,1451000,1452000,-1.497436
...,...,...,...,...
1797148,chr9,137653000,137654000,-1.658773
1797149,chr9,137655000,137656000,-2.662311
1797150,chr9,137656000,137657000,-2.986546
1797151,chr9,137657000,137658000,-0.890788


In [3]:
def get_observations_and_mchale_predictions_on_chen_windows(kmer_size, train_set_label):
    """Load the predictions made on the Chen windows by one trained model.

    The model is identified by the pair (``kmer_size``, ``train_set_label``),
    both of which are substituted into the name of the tab-separated file that
    is read from ``CONSTRAINT_TOOLS_DATA``/genome-wide-predictions.

    Note carried over from the original author: the z-scores in these files
    were computed using
    ``dist/model-germline-grch38-Nonly.kmerSize-X.trainSet-Y.json``.

    Args:
        kmer_size: k-mer size of the model (3, 5 and 7 are used in this notebook).
        train_set_label: label of the model's training set ('noncoding',
            'coding' and 'chenWindows' are used in this notebook).

    Returns:
        pandas.DataFrame: the file's contents, unmodified. The columns used
        later in this notebook are ``chromosome``, ``start``, ``end``,
        ``N_bar`` and ``N_observed``.
    """
    predictions_bed_path = f'{CONSTRAINT_TOOLS_DATA}/genome-wide-predictions/predict-germline-grch38-Nonly.chenWindows.kmerSize-{kmer_size}.trainSet-{train_set_label}.bed'
    return pd.read_csv(predictions_bed_path, sep='\t')


get_observations_and_mchale_predictions_on_chen_windows(kmer_size=3, train_set_label='chenWindows')

Unnamed: 0,chromosome,start,end,position,N_bar,N_observed
0,chr1,1432000,1433000,1432500,0.118933,260
1,chr1,1435000,1436000,1435500,-6.944479,275
2,chr1,1449000,1450000,1449500,0.600034,225
3,chr1,1450000,1451000,1450500,2.607210,313
4,chr1,1451000,1452000,1451500,2.758334,293
...,...,...,...,...,...,...
1796995,chr22,50224000,50225000,50224500,0.813849,254
1796996,chr22,50227000,50228000,50227500,1.602955,283
1796997,chr22,50228000,50229000,50228500,0.024522,241
1796998,chr22,50229000,50230000,50229500,-0.978026,231


In [6]:
# N_bar has been predicted on most of the Chen windows, but not on all of them:
# compare the row counts of the two tables to see how many windows are missing.
number_of_chen_windows = len(get_chen_predictions())
print(f'Number of Chen windows: {number_of_chen_windows}')

number_of_predicted_windows = len(
    get_observations_and_mchale_predictions_on_chen_windows(kmer_size=3, train_set_label="chenWindows")
)
print(f'Number of Chen windows on which N_bar were predicted: {number_of_predicted_windows}')

Number of Chen windows: 1797153
Number of Chen windows on which N_bar were predicted: 1797000


In [7]:
def get_mchale_predictions_on_chen_windows(kmer_size, train_set_label):
    """Return only the ``N_bar`` predictions of one model, keyed by window.

    The ``N_bar`` column is renamed to ``N_bar_{kmer_size}_{train_set_label}``
    so that the predictions of several models can sit side by side in one
    table without their column names colliding.

    Args:
        kmer_size: k-mer size of the model whose predictions are loaded.
        train_set_label: label of the training set of that model.

    Returns:
        pandas.DataFrame: columns ``chromosome``, ``start``, ``end`` and the
        renamed prediction column.
    """
    all_columns = get_observations_and_mchale_predictions_on_chen_windows(kmer_size, train_set_label)
    window_and_prediction = all_columns[['chromosome', 'start', 'end', 'N_bar']]
    model_specific_name = f'N_bar_{kmer_size}_{train_set_label}'
    return window_and_prediction.rename(columns={'N_bar': model_specific_name})

def get_observations_on_chen_windows(kmer_size, train_set_label):
    """Return only the ``N_observed`` values stored in one model's file, keyed by window.

    Args:
        kmer_size: k-mer size identifying the predictions file to read.
        train_set_label: training-set label identifying the predictions file to read.

    Returns:
        pandas.DataFrame: columns ``chromosome``, ``start``, ``end`` and
        ``N_observed``.
    """
    all_columns = get_observations_and_mchale_predictions_on_chen_windows(kmer_size, train_set_label)
    return all_columns[['chromosome', 'start', 'end', 'N_observed']]

In [8]:
from functools import reduce

# Every table is keyed by its window coordinates.
on = ['chromosome', 'start', 'end']
# Inner join: a window survives only if it is present in every table, so Chen
# windows without N_bar predictions are dropped from the merged result.
how = 'inner'


def _merge_on_window(left, right):
    """Join two tables on the shared window coordinates."""
    return pd.merge(left, right, on=on, how=how)


# Observed counts go first. They are read from the kmerSize-3 / noncoding file;
# presumably N_observed does not depend on the model -- TODO confirm.
dfs = [get_observations_on_chen_windows(kmer_size=3, train_set_label='noncoding')]

# Then one N_bar column per (k-mer size, training set) combination.
for kmer_size in [3, 5, 7]:
    dfs.extend(
        get_mchale_predictions_on_chen_windows(kmer_size, train_set_label)
        for train_set_label in ['noncoding', 'coding', 'chenWindows']
    )

# Finally the Chen z-scores.
dfs.append(get_chen_predictions())

# Fold the list left to right into a single wide table.
chen_and_mchale_predictions = reduce(_merge_on_window, dfs)

chen_and_mchale_predictions

Unnamed: 0,chromosome,start,end,N_observed,N_bar_3_noncoding,N_bar_3_coding,N_bar_3_chenWindows,N_bar_5_noncoding,N_bar_5_coding,N_bar_5_chenWindows,N_bar_7_noncoding,N_bar_7_coding,N_bar_7_chenWindows,chen_zscore
0,chr1,1432000,1433000,260,-0.247615,2.264194,0.118933,-0.339806,1.967525,0.075982,-0.372778,1.786749,0.081009,2.384293
1,chr1,1435000,1436000,275,-7.547130,-2.542472,-6.944479,-6.810107,-2.075201,-6.071172,-6.280673,-1.910324,-5.310077,-2.952197
2,chr1,1449000,1450000,225,0.258359,2.316462,0.600034,0.330975,2.319833,0.684473,0.139977,2.101117,0.520413,-0.899126
3,chr1,1450000,1451000,313,2.197534,5.178077,2.607210,2.480911,5.214801,2.933478,2.524948,5.129449,2.997041,-5.461776
4,chr1,1451000,1452000,293,2.367303,5.056714,2.758334,2.725116,5.216863,3.145152,2.951595,5.311666,3.382963,-1.497436
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1796995,chr22,50224000,50225000,254,0.460666,2.855045,0.813849,0.496005,2.723660,0.871800,0.557227,2.619678,0.932196,-5.828498
1796996,chr22,50227000,50228000,283,1.220947,3.869722,1.602955,1.340543,3.779904,1.761348,1.371285,3.723743,1.823687,-2.528842
1796997,chr22,50228000,50229000,241,-0.317039,1.935872,0.024522,-0.348897,1.724601,0.033374,-0.330285,1.729511,0.101667,1.302929
1796998,chr22,50229000,50230000,231,-1.323602,1.031312,-0.978026,-1.250236,0.939094,-0.903616,-1.225998,0.927144,-0.855868,-3.331737


In [9]:
# Save the merged table as a tab-separated file, leaving out the row index.
benchmark_bed_path = f'{CONSTRAINT_TOOLS_DATA}/benchmark-genome-wide-predictions/chen-et-al-2022/chen-mchale.kmerSizes.trainSets.bed'
chen_and_mchale_predictions.to_csv(benchmark_bed_path, sep='\t', index=False)