In [1]:
CONSTRAINT_TOOLS = '/scratch/ucgd/lustre-work/quinlan/u6018199/constraint-tools'
CONSTRAINT_TOOLS_DATA = '/scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools'

import sys
sys.path.append(f'{CONSTRAINT_TOOLS}/utilities')

CHEN_DATA_DIRECTORY = f'{CONSTRAINT_TOOLS_DATA}/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM'
CHEN_FILE_STEM = f'Supplementary_Data_2' # chen scores for 1kb windows

PUBLIC_REPO = 'constraint-tools-data'
PUBLIC_REPO_DIR = f'/scratch/ucgd/lustre-work/quinlan/u6018199/{PUBLIC_REPO}'

## Compute observed SNV counts, expected counts under gnocchi, and Kbar for Chen's windows 

In [8]:
import pandas as pd
import numpy as np 

from tqdm import tqdm
tqdm.pandas()

def get_gnocchi(): 
  filename = f'{CHEN_DATA_DIRECTORY}/{CHEN_FILE_STEM}.bed'  
  df = pd.read_csv(
    filename, 
    sep='\t',
    names=['chromosome', 'start', 'end', 'gnocchi']
  )
  return df 

def get_observed_counts_and_Kbar():
  filename = f'{CONSTRAINT_TOOLS_DATA}/genome-wide-predictions/predict-germline-grch38.chen-windows.bed'
  df = pd.read_csv(
    filename, 
    sep='\t',
  )
  df = df[['chromosome', 'start', 'end', 'N_observed', 'K_bar']]
  return df 

def compute_expected_count_under_gnocchi(row): 
  a = 1 
  b = -(2*row['N_observed'] + row['gnocchi']**2)
  c = row['N_observed']**2
  sqrt = np.sqrt(b**2 - 4*a*c) 
  sign = 1 if row['gnocchi'] > 0 else -1
  return (-b + sign*sqrt)/(2*a)
  
def wrangle_K_bar(df): 
  df['K_bar'] = pd.to_numeric(df['K_bar'], errors='coerce') # convert '.' to 'NaN'
  df = df.dropna(subset=['K_bar']) # drop windows for which K_bar == NaN
  df['negative_K_bar'] = -df['K_bar']
  return df

def compute_expected_counts_under_gnocchi_and_Kbar(): 
  df = pd.merge(
    get_gnocchi(), 
    get_observed_counts_and_Kbar(), 
    how='inner', 
    on=['chromosome', 'start', 'end']
  )

  df = wrangle_K_bar(df)

  df['N_expected_gnocchi'] = df.progress_apply(compute_expected_count_under_gnocchi, axis=1) # type: ignore

  for col in ['gnocchi', 'N_observed', 'negative_K_bar', 'N_expected_gnocchi']: 
    df[['chromosome', 'start', 'end', f'{col}']].to_csv(
      f'{CHEN_DATA_DIRECTORY}/{CHEN_FILE_STEM}-{col}.bed', 
      sep='\t', 
      index=False, 
      header=False
    )

  return df 

compute_expected_counts_under_gnocchi_and_Kbar()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['negative_K_bar'] = -df['K_bar']
100%|█████████████████████████████| 1794224/1794224 [00:32<00:00, 55135.76it/s]


Unnamed: 0,chromosome,start,end,gnocchi,N_observed,K_bar,negative_K_bar,N_expected_gnocchi
0,chr1,1432000,1433000,4.299894,259,-1.954207,1.954207,338.059552
1,chr1,1435000,1436000,3.331645,273,1.006365,-1.006365,333.876799
2,chr1,1449000,1450000,1.817267,225,-0.402920,0.402920,253.960193
3,chr1,1450000,1451000,1.365863,312,-3.416013,3.416013,337.076763
4,chr1,1451000,1452000,0.666316,291,-1.317331,1.317331,302.590657
...,...,...,...,...,...,...,...,...
1795651,chr9,137269000,137270000,5.276351,209,-0.785837,0.785837,300.458958
1795652,chr9,137275000,137276000,2.687348,290,-2.409468,2.409468,339.517045
1795653,chr9,137282000,137283000,-0.090791,210,-0.333503,0.333503,208.688422
1795654,chr9,137290000,137291000,2.162811,223,-1.182283,1.182283,257.721105


## Visualize these variables in UCSC genome browser 

In [9]:
from shell import shell 

def prepare_data_for_UCSC_genome_browser():
  prepare_data = f'{CONSTRAINT_TOOLS}/experiments/germline-model/chen-et-al-2022/prepare-data-for-UCSC-genome-browser.sh' 

  for suffix in tqdm(['gnocchi', 'N_observed', 'negative_K_bar', 'N_expected_gnocchi']): 
    data_directory = CHEN_DATA_DIRECTORY
    data_stem = f'{CHEN_FILE_STEM}-{suffix}'
    track_name = suffix
    track_description = suffix
    cmd = (
      f'bash '
      f'{prepare_data} '
      f'{data_directory} {data_stem} {PUBLIC_REPO_DIR} '
      f'{track_name} {track_description}'
    )
    print(shell(cmd))

prepare_data_for_UCSC_genome_browser()

  0%|                                                    | 0/4 [00:00<?, ?it/s]

 25%|███████████                                 | 1/4 [00:11<00:35, 11.74s/it]

[0;36mWrote: [0m/scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM/Supplementary_Data_2-gnocchi.bedGraph
To https://github.com/petermchale/constraint-tools-data.git
   c4a792f..f1e1698  main -> main
[0;36mPushed Supplementary_Data_2-gnocchi.bedGraph to public repo[0m
"Supplementary_Data_2-gnocchi.bedGraph" already supported
[main f1e1698] Add Supplementary_Data_2-gnocchi in UCSC-genome-browser format (bedgraph)
 1 file changed, 2 insertions(+), 2 deletions(-)
Uploading LFS objects: 100% (1/1), 77 MB | 28 MB/s, done.


 50%|██████████████████████                      | 2/4 [00:22<00:22, 11.23s/it]

[0;36mWrote: [0m/scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM/Supplementary_Data_2-N_observed.bedGraph
To https://github.com/petermchale/constraint-tools-data.git
   f1e1698..29e6e89  main -> main
[0;36mPushed Supplementary_Data_2-N_observed.bedGraph to public repo[0m
"Supplementary_Data_2-N_observed.bedGraph" already supported
[main 29e6e89] Add Supplementary_Data_2-N_observed in UCSC-genome-browser format (bedgraph)
 1 file changed, 2 insertions(+), 2 deletions(-)
Uploading LFS objects: 100% (1/1), 50 MB | 21 MB/s, done.


 75%|█████████████████████████████████           | 3/4 [00:34<00:11, 11.60s/it]

[0;36mWrote: [0m/scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM/Supplementary_Data_2-negative_K_bar.bedGraph
To https://github.com/petermchale/constraint-tools-data.git
   29e6e89..70c8794  main -> main
[0;36mPushed Supplementary_Data_2-negative_K_bar.bedGraph to public repo[0m
"Supplementary_Data_2-negative_K_bar.bedGraph" already supported
[main 70c8794] Add Supplementary_Data_2-negative_K_bar in UCSC-genome-browser format (bedgraph)
 1 file changed, 2 insertions(+), 2 deletions(-)
Uploading LFS objects: 100% (1/1), 78 MB | 20 MB/s, done.


100%|████████████████████████████████████████████| 4/4 [00:46<00:00, 11.57s/it]

[0;36mWrote: [0m/scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM/Supplementary_Data_2-N_expected_gnocchi.bedGraph
To https://github.com/petermchale/constraint-tools-data.git
   70c8794..9cc2858  main -> main
[0;36mPushed Supplementary_Data_2-N_expected_gnocchi.bedGraph to public repo[0m
"Supplementary_Data_2-N_expected_gnocchi.bedGraph" already supported
[main 9cc2858] Add Supplementary_Data_2-N_expected_gnocchi in UCSC-genome-browser format (bedgraph)
 1 file changed, 2 insertions(+), 2 deletions(-)
Uploading LFS objects: 100% (1/1), 76 MB | 25 MB/s, done.





In [7]:
# TODO: a "track hub" might make more sense when there are multiple custom tracks 

def get_ucsc_genome_browser_link(chrom, start, end): 
  attribute_value_pair = f'position={chrom}%3A{start}-{end}'
  for suffix in ['gnocchi', 'N_observed', 'negative_K_bar', 'N_expected_gnocchi']: 
    data_stem = f'{CHEN_FILE_STEM}-{suffix}'
    data_url = f'http://github.com/petermchale/constraint-tools-data/raw/main/{data_stem}.bedGraph'
    # https://genome-blog.gi.ucsc.edu/blog/2021/08/13/sharing-data-with-sessions-and-urls/  
    browser_url = f'https://genome.ucsc.edu/cgi-bin/hgTracks?db=hg38&{attribute_value_pair}&hgct_customText={data_url}'
    print(f'UCSC genome browser: {browser_url}')

get_ucsc_genome_browser_link(chrom='chr4', start=1, end=190214555)

UCSC genome browser: https://genome.ucsc.edu/cgi-bin/hgTracks?db=hg38&position=chr4%3A1-190214555&hgct_customText=http://github.com/petermchale/constraint-tools-data/raw/main/Supplementary_Data_2-gnocchi.bedGraph
UCSC genome browser: https://genome.ucsc.edu/cgi-bin/hgTracks?db=hg38&position=chr4%3A1-190214555&hgct_customText=http://github.com/petermchale/constraint-tools-data/raw/main/Supplementary_Data_2-N_observed.bedGraph
UCSC genome browser: https://genome.ucsc.edu/cgi-bin/hgTracks?db=hg38&position=chr4%3A1-190214555&hgct_customText=http://github.com/petermchale/constraint-tools-data/raw/main/Supplementary_Data_2-negative_K_bar.bedGraph
UCSC genome browser: https://genome.ucsc.edu/cgi-bin/hgTracks?db=hg38&position=chr4%3A1-190214555&hgct_customText=http://github.com/petermchale/constraint-tools-data/raw/main/Supplementary_Data_2-N_expected_gnocchi.bedGraph
