In [1]:
# Notebook-wide configuration: every later cell reads these module-level constants.

# Root of the constraint-tools code; its `utilities` subdirectory is added to the import path below.
CONSTRAINT_TOOLS = '/scratch/ucgd/lustre-work/quinlan/u6018199/constraint-tools'
# Root of the shared data directory; both inputs used by this notebook are read from beneath it.
CONSTRAINT_TOOLS_DATA = '/scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools'

import sys
# Make modules under `utilities` importable
# (presumably where the `shell` helper imported in a later cell lives -- TODO confirm).
sys.path.append(f'{CONSTRAINT_TOOLS}/utilities')

# Directory holding Chen et al.'s supplementary data; the per-variable .bed files
# produced by this notebook are written back into this same directory.
CHEN_DATA_DIRECTORY = f'{CONSTRAINT_TOOLS_DATA}/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM'
# File stem (no extension) of the input .bed file and prefix of every derived file.
CHEN_FILE_STEM = f'Supplementary_Data_2' # chen scores for 1kb windows

# Name of the public data repository and the path of its local directory,
# which is passed to the prepare-data shell script in a later cell.
PUBLIC_REPO = 'constraint-tools-data'
PUBLIC_REPO_DIR = f'/scratch/ucgd/lustre-work/quinlan/u6018199/{PUBLIC_REPO}'

## Compute observed SNV counts, expected counts under gnocchi, and Kbar for Chen's windows 

In [2]:
import pandas as pd
import numpy as np 

from tqdm import tqdm
tqdm.pandas()

def get_gnocchi():
  """Load the gnocchi scores for Chen's windows.

  Reads the tab-separated file `{CHEN_DATA_DIRECTORY}/{CHEN_FILE_STEM}.bed`.
  Column names are supplied here, so the first line of the file is
  treated as data rather than as a header.

  Returns:
    DataFrame with columns `chromosome`, `start`, `end`, `gnocchi`.
  """
  column_names = ['chromosome', 'start', 'end', 'gnocchi']
  bed_path = f'{CHEN_DATA_DIRECTORY}/{CHEN_FILE_STEM}.bed'
  return pd.read_csv(bed_path, sep='\t', names=column_names)

def get_observed_counts_and_Kbar():
  """Load observed SNV counts and K_bar for Chen's windows.

  Reads the tab-separated predictions file (its first line is used as the
  header) and keeps only the window coordinates plus the `N_observed`
  and `K_bar` columns.

  Returns:
    DataFrame with columns `chromosome`, `start`, `end`, `N_observed`, `K_bar`.
  """
  predictions_path = (
    f'{CONSTRAINT_TOOLS_DATA}/genome-wide-predictions/'
    f'predict-germline-grch38.chen-windows.bed'
  )
  predictions = pd.read_csv(predictions_path, sep='\t')
  wanted_columns = ['chromosome', 'start', 'end', 'N_observed', 'K_bar']
  return predictions[wanted_columns]

def compute_expected_count_under_gnocchi(row):
  """Back out the expected SNV count E from the observed count O and gnocchi score z.

  Solves the monic quadratic

      E**2 - (2*O + z**2)*E + O**2 = 0,

  which is equivalent to (E - O)**2 == z**2 * E, i.e. consistent with
  z == (E - O)/sqrt(E) (presumably the definition of gnocchi -- confirm
  against Chen et al.).

  Args:
    row: mapping (e.g. a DataFrame row) with keys `N_observed` and `gnocchi`.

  Returns:
    The larger root when `gnocchi` > 0, otherwise the smaller root.
  """
  observed = row['N_observed']
  z_score = row['gnocchi']

  # coefficients of E**2 + linear*E + constant == 0
  linear = -(2*observed + z_score**2)
  constant = observed**2
  root_of_discriminant = np.sqrt(linear**2 - 4*constant)

  # z > 0 means E > O, which is the "+" root; every other case takes the "-" root
  if z_score > 0:
    return (-linear + root_of_discriminant)/2
  return (-linear - root_of_discriminant)/2
  
def compute_expected_counts_under_gnocchi_and_Kbar():
  """Join gnocchi with observed counts/K_bar, add expected counts, and write one .bed per variable.

  The two tables are inner-joined on the window coordinates, so only
  windows present in both inputs survive. `N_expected_gnocchi` is then
  computed row by row (with a tqdm progress bar). Finally each of
  `gnocchi`, `N_observed`, `K_bar` and `N_expected_gnocchi` is written to
  `{CHEN_DATA_DIRECTORY}/{CHEN_FILE_STEM}-<variable>.bed` as a
  tab-separated file with no header and no index.

  Returns:
    The merged DataFrame, including the `N_expected_gnocchi` column.
  """
  window_keys = ['chromosome', 'start', 'end']

  gnocchi_scores = get_gnocchi()
  counts_and_kbar = get_observed_counts_and_Kbar()
  windows = gnocchi_scores.merge(counts_and_kbar, how='inner', on=window_keys)

  windows['N_expected_gnocchi'] = windows.progress_apply(  # type: ignore
    compute_expected_count_under_gnocchi,
    axis=1,
  )

  # one four-column file per variable: window coordinates + that variable
  for variable in ('gnocchi', 'N_observed', 'K_bar', 'N_expected_gnocchi'):
    out_path = f'{CHEN_DATA_DIRECTORY}/{CHEN_FILE_STEM}-{variable}.bed'
    windows[window_keys + [variable]].to_csv(
      out_path,
      sep='\t',
      index=False,
      header=False,
    )

  return windows

compute_expected_counts_under_gnocchi_and_Kbar()

100%|█████████████████████████████| 1795656/1795656 [00:31<00:00, 56164.12it/s]


Unnamed: 0,chromosome,start,end,gnocchi,N_observed,K_bar,N_expected_gnocchi
0,chr1,1432000,1433000,4.299894,259,-1.9542069179816122,338.059552
1,chr1,1435000,1436000,3.331645,273,1.0063653866562718,333.876799
2,chr1,1449000,1450000,1.817267,225,-0.40292018202864255,253.960193
3,chr1,1450000,1451000,1.365863,312,-3.4160133802925,337.076763
4,chr1,1451000,1452000,0.666316,291,-1.3173308655803435,302.590657
...,...,...,...,...,...,...,...
1795651,chr9,137269000,137270000,5.276351,209,-0.7858373648267517,300.458958
1795652,chr9,137275000,137276000,2.687348,290,-2.4094675113595323,339.517045
1795653,chr9,137282000,137283000,-0.090791,210,-0.33350277418097046,208.688422
1795654,chr9,137290000,137291000,2.162811,223,-1.182282731197813,257.721105


## Visualize these variables in the UCSC Genome Browser

In [3]:
import trace
from shell import shell 

def prepare_data_for_UCSC_genome_browser():
  """Run the prepare-data shell script once for each per-variable file.

  For every variable the script is invoked through `bash` with, in order:
  the data directory, the file stem `{CHEN_FILE_STEM}-<variable>`, the
  local public-repo directory, a track name and a track description
  (the variable name is used for both). Whatever `shell` returns for the
  command is printed.

  NOTE(review): judging by the recorded output, the script writes a
  `.bedGraph` file and pushes it to the public repo -- confirm in the script.
  """
  script = f'{CONSTRAINT_TOOLS}/experiments/germline-model/chen-et-al-2022/prepare-data-for-UCSC-genome-browser.sh'
  variables = ['gnocchi', 'N_observed', 'K_bar', 'N_expected_gnocchi']

  for variable in tqdm(variables):
    arguments = [
      script,
      CHEN_DATA_DIRECTORY,               # data directory
      f'{CHEN_FILE_STEM}-{variable}',    # data stem
      PUBLIC_REPO_DIR,
      variable,                          # track name
      variable,                          # track description
    ]
    cmd = ' '.join(['bash', *arguments])
    print(shell(cmd))

prepare_data_for_UCSC_genome_browser()

 25%|███████████                                 | 1/4 [00:07<00:23,  7.75s/it]

[0;36mWrote: [0m/scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM/Supplementary_Data_2-gnocchi.bedGraph
Everything up-to-date
[0;36mPushed Supplementary_Data_2-gnocchi.bedGraph to public repo[0m
"Supplementary_Data_2-gnocchi.bedGraph" already supported
On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean


 50%|██████████████████████                      | 2/4 [00:15<00:15,  7.63s/it]

[0;36mWrote: [0m/scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM/Supplementary_Data_2-N_observed.bedGraph
Everything up-to-date
[0;36mPushed Supplementary_Data_2-N_observed.bedGraph to public repo[0m
"Supplementary_Data_2-N_observed.bedGraph" already supported
On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean


 75%|█████████████████████████████████           | 3/4 [00:23<00:07,  7.69s/it]

[0;36mWrote: [0m/scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM/Supplementary_Data_2-K_bar.bedGraph
Everything up-to-date
[0;36mPushed Supplementary_Data_2-K_bar.bedGraph to public repo[0m
"Supplementary_Data_2-K_bar.bedGraph" already supported
On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean


100%|████████████████████████████████████████████| 4/4 [00:30<00:00,  7.67s/it]

[0;36mWrote: [0m/scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM/Supplementary_Data_2-N_expected_gnocchi.bedGraph
Everything up-to-date
[0;36mPushed Supplementary_Data_2-N_expected_gnocchi.bedGraph to public repo[0m
"Supplementary_Data_2-N_expected_gnocchi.bedGraph" already supported
On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean





In [4]:
# TODO: a "track hub" might make more sense when there are multiple custom tracks 

def get_ucsc_genome_browser_link(chrom, start, end):
  """Print one UCSC genome browser (hg38) link per variable.

  Each link opens the browser at `chrom:start-end` and loads, via the
  `hgct_customText` URL parameter, the `.bedGraph` file for one of
  `gnocchi`, `N_observed`, `K_bar`, `N_expected_gnocchi` from the public
  GitHub repo named by `PUBLIC_REPO`.

  Despite the `get_` prefix, the links are printed, not returned
  (the function returns None).

  Args:
    chrom: chromosome name as used by the browser, e.g. 'chr4'.
    start: start coordinate of the region to display.
    end: end coordinate of the region to display.
  """
  # '%3A' is the percent-encoded ':' in 'chrom:start-end'
  attribute_value_pair = f'position={chrom}%3A{start}-{end}'
  for suffix in ['gnocchi', 'N_observed', 'K_bar', 'N_expected_gnocchi']:
    data_stem = f'{CHEN_FILE_STEM}-{suffix}'
    # Build the URL from PUBLIC_REPO rather than repeating the repo name, so the
    # link keeps pointing at the repo referred to by PUBLIC_REPO / PUBLIC_REPO_DIR.
    data_url = f'http://github.com/petermchale/{PUBLIC_REPO}/raw/main/{data_stem}.bedGraph'
    # https://genome-blog.gi.ucsc.edu/blog/2021/08/13/sharing-data-with-sessions-and-urls/
    browser_url = f'https://genome.ucsc.edu/cgi-bin/hgTracks?db=hg38&{attribute_value_pair}&hgct_customText={data_url}'
    print(f'UCSC genome browser: {browser_url}')

get_ucsc_genome_browser_link(chrom='chr4', start=1, end=190214555)

UCSC genome browser: https://genome.ucsc.edu/cgi-bin/hgTracks?db=hg38&position=chr4%3A1-190214555&hgct_customText=http://github.com/petermchale/constraint-tools-data/raw/main/Supplementary_Data_2-gnocchi.bedGraph
UCSC genome browser: https://genome.ucsc.edu/cgi-bin/hgTracks?db=hg38&position=chr4%3A1-190214555&hgct_customText=http://github.com/petermchale/constraint-tools-data/raw/main/Supplementary_Data_2-N_observed.bedGraph
UCSC genome browser: https://genome.ucsc.edu/cgi-bin/hgTracks?db=hg38&position=chr4%3A1-190214555&hgct_customText=http://github.com/petermchale/constraint-tools-data/raw/main/Supplementary_Data_2-K_bar.bedGraph
UCSC genome browser: https://genome.ucsc.edu/cgi-bin/hgTracks?db=hg38&position=chr4%3A1-190214555&hgct_customText=http://github.com/petermchale/constraint-tools-data/raw/main/Supplementary_Data_2-N_expected_gnocchi.bedGraph
