In [1]:
# Root directory of the shared constraint-tools data. The Python cells in this
# notebook prefix their input and output file paths with this value.
CONSTRAINT_TOOLS_DATA = '/scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools'

In [3]:
import pandas as pd 

def get_chen_predictions(filename=None):
    """Load the Chen et al. (2022) per-window constraint scores as a DataFrame.

    Parameters
    ----------
    filename : str or path-like, optional
        Tab-separated file with a header row. When omitted, the
        ``Supplementary_Data_2.bed`` file under
        ``{CONSTRAINT_TOOLS_DATA}/chen-et-al-2022/Supplementary_Datasets`` is
        read, so existing zero-argument calls behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        The table as parsed by ``pd.read_csv``, with the column ``chrom``
        renamed to ``chromosome`` and ``constraint_Z`` renamed to
        ``chen_zscore``. All other columns are returned unchanged.
    """
    if filename is None:
        # Build the default path lazily, so CONSTRAINT_TOOLS_DATA is only
        # required when the caller does not supply a path of their own.
        filename = f'{CONSTRAINT_TOOLS_DATA}/chen-et-al-2022/Supplementary_Datasets/Supplementary_Data_2.bed'

    chen_table = pd.read_csv(filename, sep='\t')

    # Use the column names shared with the other prediction tables, so the
    # frames can be joined on ['chromosome', 'start', 'end'].
    return chen_table.rename(columns={
        'chrom': 'chromosome',
        'constraint_Z': 'chen_zscore',
    })

# Render the loaded Chen et al. table as this cell's output.
get_chen_predictions()

Unnamed: 0,chromosome,start,end,chen_zscore
0,chr1,1432000,1433000,2.384293
1,chr1,1435000,1436000,-2.952197
2,chr1,1449000,1450000,-0.899126
3,chr1,1450000,1451000,-5.461776
4,chr1,1451000,1452000,-1.497436
...,...,...,...,...
1797148,chr9,137653000,137654000,-1.658773
1797149,chr9,137655000,137656000,-2.662311
1797150,chr9,137656000,137657000,-2.986546
1797151,chr9,137657000,137658000,-0.890788


In [4]:
# Decompress the McHale z-scores: bgzip streams the decompressed contents of the
# .bed.gz file to stdout, and the shell redirect writes that stream to a .bed
# file with the same base name in the same directory.
! bgzip \
    --decompress \
    --stdout \
    /scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools/genome-wide-predictions/predict-germline-grch38.chen-windows.bed.gz \
    > /scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools/genome-wide-predictions/predict-germline-grch38.chen-windows.bed

In [5]:
def get_mchale_predictions_on_chen_windows(filename=None):
    """Load the McHale predictions computed on the Chen windows as a DataFrame.

    Parameters
    ----------
    filename : str or path-like, optional
        Tab-separated file with a header row. When omitted, the file
        ``predict-germline-grch38.chen-windows.bed`` under
        ``{CONSTRAINT_TOOLS_DATA}/genome-wide-predictions`` is read, so
        existing zero-argument calls behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        The table exactly as parsed by ``pd.read_csv``; no columns are
        renamed, dropped or converted.
    """
    if filename is None:
        # Build the default path lazily, so CONSTRAINT_TOOLS_DATA is only
        # required when the caller does not supply a path of their own.
        filename = f'{CONSTRAINT_TOOLS_DATA}/genome-wide-predictions/predict-germline-grch38.chen-windows.bed'

    # NOTE(review): in this notebook's rendered output K_bar is printed at full
    # float precision while N_bar is rounded to six decimals. That may mean
    # K_bar was parsed as object dtype rather than float64 -- confirm with
    # `.dtypes` before doing arithmetic on that column.
    return pd.read_csv(filename, sep='\t')

# Render the loaded McHale predictions table as this cell's output.
get_mchale_predictions_on_chen_windows()

Unnamed: 0,chromosome,start,end,position,N_bar,N_observed,K_bar,K_observed,M
0,chr1,1432000,1433000,1432500,-0.302927,259,-1.9542069179816122,109,256
1,chr1,1435000,1436000,1435500,-6.480015,273,1.0063653866562718,147,270
2,chr1,1449000,1450000,1449500,0.375665,225,-0.40292018202864255,113,223
3,chr1,1450000,1451000,1450500,2.604670,312,-3.4160133802925,109,307
4,chr1,1451000,1452000,1451500,2.948512,291,-1.3173308655803435,129,291
...,...,...,...,...,...,...,...,...,...
1796995,chr22,50224000,50225000,50224500,0.613568,253,0.7110962532414574,136,253
1796996,chr22,50227000,50228000,50227500,1.186683,279,-1.8113689783770324,119,279
1796997,chr22,50228000,50229000,50228500,-0.330440,238,0.09272271004257816,119,226
1796998,chr22,50229000,50230000,50229500,-1.182980,230,0.023455488221183544,120,230


In [6]:
# N_bar and K_bar have been predicted on most of the Chen windows, but not on all of them; cf.
# /scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools/genome-wide-predictions/predict-germline-grch38.chen-windows.log
# Report both row counts, one per line, so the size of the gap is visible.
print(
    f'Number of Chen windows: {len(get_chen_predictions())}',
    f'Number of Chen windows on which N_bar and K_bar were predicted: {len(get_mchale_predictions_on_chen_windows())}',
    sep='\n',
)

Number of Chen windows: 1797153
Number of Chen windows on which N_bar and K_bar were predicted: 1797000


In [7]:
# Inner-join the two tables on the window coordinates: only windows present in
# both the McHale predictions (left) and the Chen scores (right) are kept.
chen_and_mchale_predictions = pd.merge(
    left=get_mchale_predictions_on_chen_windows(),
    right=get_chen_predictions(),
    how='inner',
    on=['chromosome', 'start', 'end'],
)
chen_and_mchale_predictions

Unnamed: 0,chromosome,start,end,position,N_bar,N_observed,K_bar,K_observed,M,chen_zscore
0,chr1,1432000,1433000,1432500,-0.302927,259,-1.9542069179816122,109,256,2.384293
1,chr1,1435000,1436000,1435500,-6.480015,273,1.0063653866562718,147,270,-2.952197
2,chr1,1449000,1450000,1449500,0.375665,225,-0.40292018202864255,113,223,-0.899126
3,chr1,1450000,1451000,1450500,2.604670,312,-3.4160133802925,109,307,-5.461776
4,chr1,1451000,1452000,1451500,2.948512,291,-1.3173308655803435,129,291,-1.497436
...,...,...,...,...,...,...,...,...,...,...
1796995,chr22,50224000,50225000,50224500,0.613568,253,0.7110962532414574,136,253,-5.828498
1796996,chr22,50227000,50228000,50227500,1.186683,279,-1.8113689783770324,119,279,-2.528842
1796997,chr22,50228000,50229000,50228500,-0.330440,238,0.09272271004257816,119,226,1.302929
1796998,chr22,50229000,50230000,50229500,-1.182980,230,0.023455488221183544,120,230,-3.331737


In [8]:
# Write the joined table as a tab-separated file with a header row and without
# the DataFrame index.
chen_and_mchale_predictions.to_csv(
    path_or_buf=f'{CONSTRAINT_TOOLS_DATA}/benchmark-genome-wide-predictions/chen-et-al-2022/chen-mchale.bed',
    index=False,
    sep='\t',
)