## Ingest Chen and McHale scores for windows defined by Chen 

The data were created using 

```
experiments/germline-model/chen-et-al-2022/merge_chen_zscores_with_mchale_zscores.ipynb
```

and stored at:

```
/scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools/benchmark-genome-wide-predictions/chen-et-al-2022/chen-mchale.bed
```


In [1]:
import pandas as pd 

pd.set_option('display.max_columns', 30)

# Root of the shared constraint-tools data directory, and the merged
# Chen/McHale z-score table that lives underneath it.
CONSTRAINT_TOOLS_DATA = '/scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools'
filename = f'{CONSTRAINT_TOOLS_DATA}/benchmark-genome-wide-predictions/chen-et-al-2022/chen-mchale.bed'

# Load the tab-separated table, give the Chen column a space-separated name,
# and append a sign-flipped copy of N_bar as 'negative N_bar'.
# NOTE(review): the table carries an 'Unnamed: 0' column (see the output
# below) -- presumably an index written by the upstream notebook; it is
# kept as-is here. TODO confirm whether it should be dropped.
chen_mchale_zscores = (
    pd.read_csv(filename, sep='\t')
    .rename(columns={'chen_zscore': 'chen zscore'})
    .assign(**{'negative N_bar': lambda frame: -frame['N_bar']})
)
chen_mchale_zscores

Unnamed: 0,chromosome,start,end,position,N_bar,N_observed,K_bar,K_observed,M,chen zscore,negative N_bar
0,chr1,1432000,1433000,1432500,-0.302927,259,-1.9542069179816122,109,256,2.384293,0.302927
1,chr1,1435000,1436000,1435500,-6.480015,273,1.0063653866562718,147,270,-2.952197,6.480015
2,chr1,1449000,1450000,1449500,0.375665,225,-0.40292018202864255,113,223,-0.899126,-0.375665
3,chr1,1450000,1451000,1450500,2.604670,312,-3.4160133802925,109,307,-5.461776,-2.604670
4,chr1,1451000,1452000,1451500,2.948512,291,-1.3173308655803435,129,291,-1.497436,-2.948512
...,...,...,...,...,...,...,...,...,...,...,...
1796995,chr22,50224000,50225000,50224500,0.613568,253,0.7110962532414574,136,253,-5.828498,-0.613568
1796996,chr22,50227000,50228000,50227500,1.186683,279,-1.8113689783770324,119,279,-2.528842,-1.186683
1796997,chr22,50228000,50229000,50228500,-0.330440,238,0.09272271004257816,119,226,1.302929,0.330440
1796998,chr22,50229000,50230000,50229500,-1.182980,230,0.023455488221183544,120,230,-3.331737,1.182980


## Label windows with their z-score quantiles and save to disk

In [2]:
import warnings

def label_windows_with_score_quantiles(score, windows=None, quantiles=None):
    """Label every window with the quantile bin that its `score` value falls in.

    Parameters
    ----------
    score : str
        Name of the column to bin. The label column that is added is named
        f'{score} quantile'.
    windows : pandas.DataFrame, optional
        Table to label. When omitted, the notebook-level `chen_mchale_zscores`
        frame is used, so the original one-argument call keeps working.
    quantiles : sequence of float, optional
        Increasing quantile edges passed to `pd.qcut`. When omitted, the
        edges originally hard-coded in this function are used.

    Returns
    -------
    pandas.DataFrame
        A copy of `windows` with one extra categorical column whose labels
        have the form '<lower edge> - <upper edge>'. The input frame is not
        modified.

    Raises
    ------
    KeyError
        If `score` is not a column of `windows`.
    ValueError
        Raised by `pd.qcut` when the computed bin edges are not unique.
    """
    if windows is None:
        # Backward-compatible default: fall back to the notebook-level frame.
        windows = chen_mchale_zscores

    if quantiles is None:
        quantiles = (
            [0.000, 0.025, 0.050, 0.075, 0.100] + 
            [0.250, 0.500, 0.750] + 
            [0.900, 0.925, 0.950, 0.975, 0.990, 0.9925, 0.995, 1.000]
        )
    quantiles = list(quantiles)

    # One label per pair of consecutive edges, e.g. '0.25 - 0.5'.
    quantile_labels = [
        f'{start} - {end}'
        for start, end in zip(quantiles[:-1], quantiles[1:])
    ]

    labelled = windows.copy()

    # All warnings are silenced while binning, as in the original code.
    # NOTE(review): it is not visible from here which warning this targets;
    # consider narrowing the filter to a specific category.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        labelled[f'{score} quantile'] = pd.qcut(
            labelled[score],
            q = quantiles, 
            labels = quantile_labels, 
        )

    return labelled

# Bin every window by its 'negative N_bar' value; the result gains a
# 'negative N_bar quantile' column holding the bin label.
chen_mchale_zscores_quantiles = label_windows_with_score_quantiles('negative N_bar')
chen_mchale_zscores_quantiles

Unnamed: 0,chromosome,start,end,position,N_bar,N_observed,K_bar,K_observed,M,chen zscore,negative N_bar,negative N_bar quantile
0,chr1,1432000,1433000,1432500,-0.302927,259,-1.9542069179816122,109,256,2.384293,0.302927,0.25 - 0.5
1,chr1,1435000,1436000,1435500,-6.480015,273,1.0063653866562718,147,270,-2.952197,6.480015,0.995 - 1.0
2,chr1,1449000,1450000,1449500,0.375665,225,-0.40292018202864255,113,223,-0.899126,-0.375665,0.25 - 0.5
3,chr1,1450000,1451000,1450500,2.604670,312,-3.4160133802925,109,307,-5.461776,-2.604670,0.075 - 0.1
4,chr1,1451000,1452000,1451500,2.948512,291,-1.3173308655803435,129,291,-1.497436,-2.948512,0.075 - 0.1
...,...,...,...,...,...,...,...,...,...,...,...,...
1796995,chr22,50224000,50225000,50224500,0.613568,253,0.7110962532414574,136,253,-5.828498,-0.613568,0.25 - 0.5
1796996,chr22,50227000,50228000,50227500,1.186683,279,-1.8113689783770324,119,279,-2.528842,-1.186683,0.1 - 0.25
1796997,chr22,50228000,50229000,50228500,-0.330440,238,0.09272271004257816,119,226,1.302929,0.330440,0.25 - 0.5
1796998,chr22,50229000,50230000,50229500,-1.182980,230,0.023455488221183544,120,230,-3.331737,1.182980,0.5 - 0.75


In [3]:
# Write the quantile-labelled windows next to the input table as a
# tab-separated file, without the DataFrame index.
filename = f'{CONSTRAINT_TOOLS_DATA}/benchmark-genome-wide-predictions/chen-et-al-2022/chen-mchale-quantiles.bed'

chen_mchale_zscores_quantiles.to_csv(filename, sep='\t', index=False)