In [7]:
# --- Base directories for the constraint-tools code and its shared data ---
CONSTRAINT_TOOLS = '/scratch/ucgd/lustre-work/quinlan/u6018199/constraint-tools'
CONSTRAINT_TOOLS_DATA = '/scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools'

# Make modules in the repo's utilities directory importable
# (presumably where the `shell` and `aggregate` helpers used below live -- TODO confirm)
import sys
sys.path.append(CONSTRAINT_TOOLS + '/utilities')

# --- Deletion filters, forwarded verbatim to the intersect script ---
# by assign_SV_scores_to_labeled_enhancers (units presumably bp -- TODO confirm)
LOWER_DELETION_SIZE_LIMIT = 4_000
UPPER_DELETION_SIZE_LIMIT = 10_000
DELETION_ALLELE_FREQ_THRESHOLD = 1e-5

# Filter out false deletions:
SUSPICIOUS_DELETION_SIZE_THRESHOLD = 1_000_000

# --- Which deletion call set to use ---
DELETION_CLASS = 'observed' # TOPMED

# alternative setting: DELETION_TYPE = 'long'
DELETION_TYPE = 'all'

# --- Public repo directory (passed to the intersect script) ---
PUBLIC_REPO = 'constraint-tools-data'
PUBLIC_REPO_DIR = f'/scratch/ucgd/lustre-work/quinlan/u6018199/{PUBLIC_REPO}'

# --- Chen et al. 2023 supplementary table holding the labeled enhancers ---
CHEN_DATA_DIRECTORY = f'{CONSTRAINT_TOOLS_DATA}/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM'
CHEN_FILE_STEM = 'Supplementary_Data_6_ESM'

## Compute SV scores for positive and negative enhancers, i.e. enhancers linked to genes known to be under negative selection or under no selection, respectively


In [30]:
import pandas as pd 

def get_enhancer_coordinates(row):
  """Split a row's 'enhancer' entry into its genomic coordinates.

  Args:
    row: mapping-like record (e.g. a DataFrame row) whose 'enhancer' value is
      a string of exactly three '-'-separated fields: chrom, start, end.
      Surrounding whitespace is ignored.

  Returns:
    Tuple (chrom, start, end) with start and end converted to int.

  Raises:
    ValueError: if the string does not split into exactly three fields, or
      if start/end are not integer literals.
  """
  fields = row['enhancer'].strip().split('-')
  contig, left_edge, right_edge = fields
  return contig, int(left_edge), int(right_edge)

def make_bed_file_of_enhancers():
  """Convert the Chen et al. enhancer table into a tab-separated .bed file.

  Reads `{CHEN_DATA_DIRECTORY}/{CHEN_FILE_STEM}.txt`, parses each row's
  'enhancer' string into 'enhancer_chrom', 'enhancer_start' and
  'enhancer_end' columns, moves those three columns to the front (all other
  columns keep their relative order), and writes the table, header included,
  to `{CHEN_DATA_DIRECTORY}/{CHEN_FILE_STEM}.bed`.

  Returns:
    Path of the .bed file that was written.
  """
  coordinate_columns = ['enhancer_chrom', 'enhancer_start', 'enhancer_end']

  source_table = f'{CHEN_DATA_DIRECTORY}/{CHEN_FILE_STEM}.txt'
  enhancers = pd.read_csv(source_table, sep='\t')

  # one (chrom, start, end) tuple per row, expanded into three columns
  enhancers[coordinate_columns] = enhancers.apply(
    get_enhancer_coordinates,
    axis=1,
    result_type='expand',
  )

  # coordinates first, as in the BED layout
  remaining_columns = [name for name in enhancers.columns if name not in coordinate_columns]
  enhancers = enhancers[coordinate_columns + remaining_columns]

  destination = f'{CHEN_DATA_DIRECTORY}/{CHEN_FILE_STEM}.bed'
  enhancers.to_csv(destination, sep='\t', index=False)
  return destination

# Write the enhancer .bed file; the returned path is displayed as the cell output.
make_bed_file_of_enhancers()

'/scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM/Supplementary_Data_6_ESM.bed'

In [71]:
from shell import shell 
from aggregate import aggregate
import numpy as np

def assign_SV_scores_to_labeled_enhancers(
  deletion_class=DELETION_CLASS,
  deletion_type=DELETION_TYPE,
):
  """Compute deletion-based (SV) scores for enhancers with a constraint label.

  Steps:
    1. Write the enhancer .bed file (make_bed_file_of_enhancers).
    2. Run intersect-windows-with-deletions.sh on it via `shell`, printing
       whatever `shell` returns.
    3. Read the script's output table, drop chrX/chrY enhancers, and sum
       'merged_deletion_overlap' per enhancer via `aggregate`.
    4. Add 'enhancer coordinates', 'enhancer_length',
       'deletion count per bp' and 'fraction of bps that are deleted'.
    5. Split enhancers into constrained / unconstrained by their gene-set
       flags, discard rows that fall into both sets, and mark the rest with
       a boolean 'truly constrained' column.

  Args:
    deletion_class: forwarded to the intersect script and used in the
      names of the files it reads/writes.
    deletion_type: likewise forwarded to the script and used in file names.

  Returns:
    DataFrame with the constrained rows followed by the unconstrained rows
    (index values are not reset by the concatenation).
  """
  def drop_rows_shared_with(frame, shared_rows):
    # anti-join: keep only the rows of `frame` that have no match in `shared_rows`
    # (pd.merge joins on all columns the two frames have in common)
    flagged = pd.merge(frame, shared_rows, how='outer', indicator=True)
    left_only = flagged[flagged['_merge'] == 'left_only']
    return left_only.drop(columns=['_merge'])

  enhancer_bed = make_bed_file_of_enhancers()
  intersection_bed = f'{CHEN_DATA_DIRECTORY}/{CHEN_FILE_STEM}-intersect-{deletion_type}-{deletion_class}-deletions.bed'
  script_path = f'{CONSTRAINT_TOOLS}/experiments/germline-model/chen-et-al-2022/intersect-windows-with-deletions.sh'
  processed_deletions = f'{deletion_type}-{deletion_class}-deletions.bed'

  # positional arguments of the intersect script, in the order it expects them
  script_arguments = [
    enhancer_bed, deletion_class, intersection_bed,
    PUBLIC_REPO_DIR, processed_deletions,
    deletion_type,
    LOWER_DELETION_SIZE_LIMIT, UPPER_DELETION_SIZE_LIMIT, DELETION_ALLELE_FREQ_THRESHOLD,
    SUSPICIOUS_DELETION_SIZE_THRESHOLD,
  ]
  command = ' '.join(['bash', script_path] + [f'{argument}' for argument in script_arguments])
  print(shell(command))

  enhancers = pd.read_csv(intersection_bed, sep='\t')

  # restrict to enhancers that are not on the sex chromosomes
  not_on_chrX = enhancers['enhancer_chrom'] != 'chrX'
  not_on_chrY = enhancers['enhancer_chrom'] != 'chrY'
  enhancers = enhancers[not_on_chrX & not_on_chrY]

  # Though unlikely, an enhancer may intersect several merged (hence non-overlapping) deletions.
  # Group on every column except the last four (which include 'merged_deletion_overlap')
  # and sum the overlap within each group; the summed column is read back below as
  # 'sum merged_deletion_overlap'.
  grouping_keys = enhancers.columns[:-4].tolist()
  enhancers = aggregate(
    enhancers,
    group_columns=grouping_keys,
    aggregation_functions={'merged_deletion_overlap': [np.sum]},
  )

  # "chrom:start-end" label replaces the raw 'enhancer' string
  start_text = enhancers['enhancer_start'].astype(str)
  end_text = enhancers['enhancer_end'].astype(str)
  enhancers['enhancer coordinates'] = enhancers['enhancer_chrom'] + ':' + start_text + '-' + end_text
  enhancers = enhancers.drop(columns=['enhancer'])

  enhancers['enhancer_length'] = enhancers['enhancer_end'] - enhancers['enhancer_start']

  # length-normalised scores
  enhancers['deletion count per bp'] = enhancers['deletion_count'] / enhancers['enhancer_length']
  enhancers['fraction of bps that are deleted'] = enhancers['sum merged_deletion_overlap'] / enhancers['enhancer_length']

  enhancers = enhancers.drop(columns=['LOEUF underpowered'])

  # an enhancer is a candidate for a set if any of that set's flags is truthy
  constrained_labels = ['Haploinsufficient', 'MGI essential', 'OMIM dominant', 'LOEUF constrained']
  unconstrained_labels = ['Olfactory', 'LOEUF unconstrained']
  constrained = enhancers[enhancers[constrained_labels].any(axis=1)]
  unconstrained = enhancers[enhancers[unconstrained_labels].any(axis=1)]

  # rows present in both candidate sets carry conflicting labels; remove them from each set
  ambiguous = pd.merge(constrained, unconstrained, how='inner')

  constrained = drop_rows_shared_with(constrained, ambiguous)
  constrained['truly constrained'] = True

  unconstrained = drop_rows_shared_with(unconstrained, ambiguous)
  unconstrained['truly constrained'] = False

  return pd.concat([constrained, unconstrained])

# Run with the default deletion class and type; the returned table is displayed as the cell output.
assign_SV_scores_to_labeled_enhancers()

[0;36mWe assume that the first line of the following is a header line: [0m/scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM/Supplementary_Data_6_ESM.bed
[0;36mWrote windows with deletion counts and overlaps to: [0m/scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM/Supplementary_Data_6_ESM-intersect-all-observed-deletions.bed
[0;36mWrote all observed deletions in UCSC-genome-browser format to: [0m/scratch/ucgd/lustre-work/quinlan/u6018199/constraint-tools-data/all-observed-deletions.bed
Everything up-to-date
[0;36mPushed /scratch/ucgd/lustre-work/quinlan/u6018199/constraint-tools-data/all-observed-deletions.bed to public repo[0m
On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean


Unnamed: 0,enhancer_chrom,enhancer_start,enhancer_end,gene,enhancer_Gnocchi,Haploinsufficient,MGI essential,OMIM dominant,LOEUF constrained,Olfactory,LOEUF unconstrained,deletion_count,sum merged_deletion_overlap,enhancer coordinates,enhancer_length,deletion count per bp,fraction of bps that are deleted,truly constrained
0,chr1,2545161,2545361,PANK4,2.775673,False,False,True,False,False,False,0,0,chr1:2545161-2545361,200,0.0000,0.000,True
1,chr1,3208836,3209036,PRDM16,6.070480,False,True,True,True,False,False,0,0,chr1:3208836-3209036,200,0.0000,0.000,True
2,chr1,3670636,3671636,TP73,5.884122,False,True,False,True,False,False,2,1000,chr1:3670636-3671636,1000,0.0020,1.000,True
3,chr1,4654140,4654540,AJAP1,5.098327,False,False,False,True,False,False,1,400,chr1:4654140-4654540,400,0.0025,1.000,True
4,chr1,6071940,6072140,CHD5,8.484611,False,False,False,True,False,False,2,200,chr1:6071940-6072140,200,0.0100,1.000,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364,chr9,122807921,122808121,OR1L4,3.383762,False,False,False,False,True,False,0,37,chr9:122807921-122808121,200,0.0000,0.185,False
365,chr9,122807921,122808721,OR1L3,3.383762,False,False,False,False,True,False,0,228,chr9:122807921-122808721,800,0.0000,0.285,False
366,chr9,122807921,122808921,OR1K1,3.383762,False,False,False,False,True,False,0,428,chr9:122807921-122808921,1000,0.0000,0.428,False
367,chr9,122807921,122808921,OR1L6,3.383762,False,False,False,False,True,False,0,428,chr9:122807921-122808921,1000,0.0000,0.428,False


In [None]:
# TODO : new section 
# continue from "Plot the distribution of SNV and SV scores of positive and negative genes" in experiments/germline-model/chen-et-al-2022/SNV_plus_SV_model.3.ipynb