In [1]:
# Root directory of the shared constraint-tools data. The loader functions and the
# output path in the cells below build their file names relative to this directory.
CONSTRAINT_TOOLS_DATA = '/scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools'

In [2]:
import pandas as pd 

def get_chen_predictions():
    """Load Supplementary_Data_2.bed from the chen-et-al-2022 data directory.

    The file is parsed as tab-separated text with a header row. Two columns
    are renamed: 'chrom' becomes 'chromosome' and 'constraint_Z' becomes
    'chen_zscore'. All other columns keep the names found in the file.

    Returns:
        pandas.DataFrame: one row per record in the file.
    """
    # Old column name -> new column name.
    renamed_columns = {
        'chrom': 'chromosome',
        'constraint_Z': 'chen_zscore',
    }
    chen_path = f'{CONSTRAINT_TOOLS_DATA}/chen-et-al-2022/Supplementary_Datasets/Supplementary_Data_2.bed'
    return pd.read_csv(chen_path, sep='\t').rename(columns=renamed_columns)

# Load the table and show it as this cell's output (bare last expression -> rich display).
get_chen_predictions()

Unnamed: 0,chromosome,start,end,chen_zscore
0,chr1,1432000,1433000,2.384293
1,chr1,1435000,1436000,-2.952197
2,chr1,1449000,1450000,-0.899126
3,chr1,1450000,1451000,-5.461776
4,chr1,1451000,1452000,-1.497436
...,...,...,...,...
1797148,chr9,137653000,137654000,-1.658773
1797149,chr9,137655000,137656000,-2.662311
1797150,chr9,137656000,137657000,-2.986546
1797151,chr9,137657000,137658000,-0.890788


In [3]:
def get_observations_and_mchale_predictions_on_chen_windows(kmer_size):
    """Load the genome-wide prediction table for one kmer size.

    Reads the tab-separated file
    'predict-germline-grch38.chen-windows.kmerSize-{kmer_size}.bed' from the
    'genome-wide-predictions' directory and returns it unmodified.

    Args:
        kmer_size: value interpolated into the file name (the notebook uses 3, 5 and 7).

    Returns:
        pandas.DataFrame: the parsed file, with the column names found in its header.
    """
    # The z-scores in this file were computed using dist/model-germline-grch38.kmerSize-X.json
    # NOTE(review): in the recorded cell output K_bar is printed with full precision while
    # N_bar is rounded, which may mean K_bar is parsed as a non-float (object) column -- confirm.
    predictions_path = f'{CONSTRAINT_TOOLS_DATA}/genome-wide-predictions/predict-germline-grch38.chen-windows.kmerSize-{kmer_size}.bed'
    return pd.read_csv(predictions_path, sep='\t')

# Load the kmer-size-3 table and show it as this cell's output.
get_observations_and_mchale_predictions_on_chen_windows(kmer_size=3)

Unnamed: 0,chromosome,start,end,position,N_bar,N_observed,K_bar,K_observed,M
0,chr1,1432000,1433000,1432500,-0.228060,260,-1.9172674540619583,110,257
1,chr1,1435000,1436000,1435500,-7.687212,275,1.0807428812243396,149,272
2,chr1,1449000,1450000,1449500,0.374691,225,-0.4320220497782873,113,223
3,chr1,1450000,1451000,1450500,2.210181,313,-3.541284444033647,109,308
4,chr1,1451000,1452000,1451500,2.257523,291,-1.3744872898308813,129,291
...,...,...,...,...,...,...,...,...,...
1796995,chr22,50224000,50225000,50224500,0.514023,254,0.6934577364788537,137,254
1796996,chr22,50227000,50228000,50227500,1.095354,281,-1.7252944931979473,121,281
1796997,chr22,50228000,50229000,50228500,-0.355943,240,0.10313490972853154,120,228
1796998,chr22,50229000,50230000,50229500,-1.273488,231,0.075415864727681,121,231


In [4]:
# N_bar and K_bar have been predicted on most, but not all, of the Chen windows;
# cf. /scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools/genome-wide-predictions/predict-germline-grch38.chen-windows.log
# Each count is computed immediately before it is printed.
number_of_chen_windows = len(get_chen_predictions())
print(f'Number of Chen windows: {number_of_chen_windows}')

number_of_predicted_windows = len(get_observations_and_mchale_predictions_on_chen_windows(kmer_size=3))
print(f'Number of Chen windows on which N_bar and K_bar were predicted: {number_of_predicted_windows}')

Number of Chen windows: 1797153
Number of Chen windows on which N_bar and K_bar were predicted: 1797000


In [5]:
def get_mchale_predictions_on_chen_windows(kmer_size):
    """Return the window coordinates with the N_bar and K_bar columns for one kmer size.

    Only 'chromosome', 'start', 'end', 'N_bar' and 'K_bar' are kept from the
    table loaded for `kmer_size`. The two prediction columns are renamed to
    'N_bar_{kmer_size}' and 'K_bar_{kmer_size}', so that tables for different
    kmer sizes have distinct column names.

    Args:
        kmer_size: passed to the loader and appended to the prediction column names.

    Returns:
        pandas.DataFrame: columns chromosome, start, end, N_bar_<k>, K_bar_<k>.
    """
    full_table = get_observations_and_mchale_predictions_on_chen_windows(kmer_size)
    predictions = full_table[['chromosome', 'start', 'end', 'N_bar', 'K_bar']]
    kmer_specific_names = {
        'N_bar': f'N_bar_{kmer_size}',
        'K_bar': f'K_bar_{kmer_size}',
    }
    return predictions.rename(columns=kmer_specific_names)

def get_observations_on_chen_windows(kmer_size):
    """Return the window coordinates with the observed-count columns.

    Only 'chromosome', 'start', 'end', 'N_observed', 'K_observed' and 'M' are
    kept from the table loaded for `kmer_size`. Column names are unchanged.

    Args:
        kmer_size: selects which prediction file the columns are read from.

    Returns:
        pandas.DataFrame: columns chromosome, start, end, N_observed, K_observed, M.
    """
    window_columns = ['chromosome', 'start', 'end']
    observation_columns = ['N_observed', 'K_observed', 'M']
    full_table = get_observations_and_mchale_predictions_on_chen_windows(kmer_size)
    return full_table[window_columns + observation_columns]

In [6]:
from functools import reduce

# Join type and key columns shared by every merge below: a window is identified by
# (chromosome, start, end), and only windows present in every table are kept.
how = 'inner'
on = ['chromosome', 'start', 'end']

# Tables to combine, in merge order: observations first, then the predictions
# for each kmer size, and finally the Chen et al. table.
dfs = [get_observations_on_chen_windows(kmer_size=3)]
dfs += [get_mchale_predictions_on_chen_windows(kmer_size=size) for size in (3, 5, 7)]
dfs.append(get_chen_predictions())


def _merge_on_window(left, right):
    """Merge two tables on the window key columns using the join type set above."""
    return pd.merge(left, right, on=on, how=how)


# Fold the list left to right into a single wide table.
chen_and_mchale_predictions = reduce(_merge_on_window, dfs)

chen_and_mchale_predictions

Unnamed: 0,chromosome,start,end,N_observed,K_observed,M,N_bar_3,K_bar_3,N_bar_5,K_bar_5,N_bar_7,K_bar_7,chen_zscore
0,chr1,1432000,1433000,260,110,257,-0.228060,-1.9172674540619583,-0.256142,-1.929171911173777,-0.302927,-1.9542069179816122,2.384293
1,chr1,1435000,1436000,275,149,272,-7.687212,1.0807428812243396,-7.120401,0.9862108436107184,-6.480015,1.0063653866562718,-2.952197
2,chr1,1449000,1450000,225,113,223,0.374691,-0.4320220497782873,0.501916,-0.42349418060119715,0.375665,-0.40292018202864255,-0.899126
3,chr1,1450000,1451000,313,109,308,2.210181,-3.541284444033647,2.499685,-3.446256072892112,2.604670,-3.4160133802925,-5.461776
4,chr1,1451000,1452000,291,129,291,2.257523,-1.3744872898308813,2.673875,-1.3290694510065333,2.948512,-1.3173308655803435,-1.497436
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1796995,chr22,50224000,50225000,254,137,254,0.514023,0.6934577364788537,0.537595,0.6821584879264039,0.613568,0.7110962532414574,-5.828498
1796996,chr22,50227000,50228000,281,121,281,1.095354,-1.7252944931979473,1.125435,-1.7555469423001422,1.186683,-1.8113689783770324,-2.528842
1796997,chr22,50228000,50229000,240,120,228,-0.355943,0.10313490972853154,-0.393731,0.03005173712198986,-0.330440,0.09272271004257816,1.302929
1796998,chr22,50229000,50230000,231,121,231,-1.273488,0.075415864727681,-1.239444,0.0216509919812062,-1.182980,0.023455488221183544,-3.331737


In [7]:
# Write the merged table as a tab-separated file, without the DataFrame index column.
merged_predictions_path = f'{CONSTRAINT_TOOLS_DATA}/benchmark-genome-wide-predictions/chen-et-al-2022/chen-mchale.multiple-kmers.bed'
chen_and_mchale_predictions.to_csv(merged_predictions_path, sep='\t', index=False)