## Get a set of enhancers known to be constrained or not, together with their Chen scores 

In [1]:
# Root directory of constraint-tools; the enhancer/deletion intersect script
# under experiments/germline-model/chen-et-al-2022/ is run from here.
CONSTRAINT_TOOLS = '/scratch/ucgd/lustre-work/quinlan/u6018199/constraint-tools'
# Shared data directory; the khurana/*.bed files are read from, and the
# labeled-enhancers file is written to, this location.
CONSTRAINT_TOOLS_DATA = '/scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools'

In [33]:
import pandas as pd

def read_disease_enhancers_intersect_chen_windows():
  """Load disease enhancers intersected with Chen windows, labeled as constrained.

  Reads the tab-separated file
  khurana/disease-enhancers-intersect-chen-windows.bed under
  CONSTRAINT_TOOLS_DATA, removes fully duplicated rows, and keeps only the
  enhancer coordinates plus the negated Chen score of the intersecting window.

  Returns:
    DataFrame with columns chrom, start, end,
    negative_new_chen_score_window, constrained (always True) and
    tag (always 'disease_enhancers'). The index is the one left after
    de-duplication (it is not reset).
  """
  # download-process-data/khurana/README.md
  enhancer_columns = ['chrom_enhancer', 'start_enhancer', 'end_enhancer']
  annotation_columns = [
    'disease', 'enhancer_predicted_LoF_tolerance_prob', 'enhancer_predicted_LoF_tolerance_status'
  ]
  window_columns = ['chrom_window', 'start_window', 'end_window']
  score_column = 'new_chen_score_window'

  # De-duplicate on the complete record (annotation and window columns
  # included) before any column is discarded.
  records = pd.read_csv(
    f"{CONSTRAINT_TOOLS_DATA}/khurana/disease-enhancers-intersect-chen-windows.bed",
    sep='\t',
    names=enhancer_columns + annotation_columns + window_columns + [score_column],
  ).drop_duplicates()

  return (
    records[enhancer_columns]
    .rename(columns={
      'chrom_enhancer': 'chrom',
      'start_enhancer': 'start',
      'end_enhancer': 'end',
    })
    .assign(
      negative_new_chen_score_window=-records[score_column],
      constrained=True,
      tag='disease_enhancers',
    )
  )

read_disease_enhancers_intersect_chen_windows()

Unnamed: 0,chrom,start,end,negative_new_chen_score_window,constrained,tag
0,chr1,21346107,21348107,-3.432213,True,disease_enhancers
1,chr1,93909844,93911044,-2.645724,True,disease_enhancers
2,chr1,155293209,155293609,-4.190955,True,disease_enhancers
3,chr1,160032410,160033810,-2.719339,True,disease_enhancers
4,chr1,160032410,160033810,-0.167476,True,disease_enhancers
...,...,...,...,...,...,...
116,chr20,46114761,46114961,0.241142,True,disease_enhancers
117,chr22,28787612,28789612,2.138137,True,disease_enhancers
118,chr22,28787612,28789612,0.488883,True,disease_enhancers
119,chr22,28787612,28789612,1.762075,True,disease_enhancers


In [34]:
def read_pacbio_deleted_enhancers_intersect_chen_windows():
  """Load PacBio-deleted enhancers intersected with Chen windows, labeled as unconstrained.

  Reads the tab-separated file
  khurana/pacbio-deleted-enhancers-intersect-chen-windows.bed under
  CONSTRAINT_TOOLS_DATA, removes fully duplicated rows, and keeps only the
  enhancer coordinates plus the negated Chen score of the intersecting window.

  Returns:
    DataFrame with columns chrom, start, end,
    negative_new_chen_score_window, constrained (always False) and
    tag (always 'pacbio_deleted'). The index is the one left after
    de-duplication (it is not reset).
  """
  # download-process-data/khurana/README.md
  enhancer_columns = ['chrom_enhancer', 'start_enhancer', 'end_enhancer']
  annotation_columns = [
    'family', 'enhancer_predicted_LoF_tolerance_prob', 'enhancer_predicted_LoF_tolerance_status'
  ]
  window_columns = ['chrom_window', 'start_window', 'end_window']
  score_column = 'new_chen_score_window'

  # De-duplicate on the complete record (annotation and window columns
  # included) before any column is discarded.
  records = pd.read_csv(
    f"{CONSTRAINT_TOOLS_DATA}/khurana/pacbio-deleted-enhancers-intersect-chen-windows.bed",
    sep='\t',
    names=enhancer_columns + annotation_columns + window_columns + [score_column],
  ).drop_duplicates()

  return (
    records[enhancer_columns]
    .rename(columns={
      'chrom_enhancer': 'chrom',
      'start_enhancer': 'start',
      'end_enhancer': 'end',
    })
    .assign(
      negative_new_chen_score_window=-records[score_column],
      constrained=False,
      tag='pacbio_deleted',
    )
  )

read_pacbio_deleted_enhancers_intersect_chen_windows()

Unnamed: 0,chrom,start,end,negative_new_chen_score_window,constrained,tag
0,chr2,5785468,5785668,2.210384,False,pacbio_deleted
1,chr3,121543753,121543953,-1.011635,False,pacbio_deleted
2,chr4,69152682,69153682,2.649651,False,pacbio_deleted
3,chr4,69152682,69153682,1.650749,False,pacbio_deleted
4,chr17,6969081,6970081,-2.492414,False,pacbio_deleted


In [35]:
# get enhancers that are homozygously deleted in 1000 Genomes Project
def read_lof_tolerant_enhancers_intersect_chen_windows():
  """Load LoF-tolerant enhancers intersected with Chen windows, labeled as unconstrained.

  Reads the tab-separated file
  khurana/lof-tolerant-enhancers-intersect-chen-windows.bed under
  CONSTRAINT_TOOLS_DATA, removes fully duplicated rows, and keeps only the
  hg38 enhancer coordinates plus the negated Chen score of the intersecting
  window.

  Returns:
    DataFrame with columns chrom, start, end,
    negative_new_chen_score_window, constrained (always False) and
    tag (always 'lof_tolerant'). The index is the one left after
    de-duplication (it is not reset).
  """
  # download-process-data/khurana/README.md
  enhancer_columns = ['chrom_enhancer_hg38', 'start_enhancer_hg38', 'end_enhancer_hg38']
  annotation_columns = ['enhancer_hg19', 'unknown']
  window_columns = ['chrom_window', 'start_window', 'end_window']
  score_column = 'new_chen_score_window'

  # De-duplicate on the complete record (annotation and window columns
  # included) before any column is discarded.
  records = pd.read_csv(
    f"{CONSTRAINT_TOOLS_DATA}/khurana/lof-tolerant-enhancers-intersect-chen-windows.bed",
    sep='\t',
    names=enhancer_columns + annotation_columns + window_columns + [score_column],
  ).drop_duplicates()

  return (
    records[enhancer_columns]
    .rename(columns={
      'chrom_enhancer_hg38': 'chrom',
      'start_enhancer_hg38': 'start',
      'end_enhancer_hg38': 'end',
    })
    .assign(
      negative_new_chen_score_window=-records[score_column],
      constrained=False,
      tag='lof_tolerant',
    )
  )

read_lof_tolerant_enhancers_intersect_chen_windows()

Unnamed: 0,chrom,start,end,negative_new_chen_score_window,constrained,tag
0,chr1,831220,832820,-1.248459,False,lof_tolerant
1,chr1,1508220,1509820,-1.731383,False,lof_tolerant
2,chr1,1508220,1509820,-1.670577,False,lof_tolerant
3,chr1,8124140,8124940,-0.841769,False,lof_tolerant
4,chr1,12684990,12685190,-1.311703,False,lof_tolerant
...,...,...,...,...,...,...
834,chr22,36177152,36177552,-1.206580,False,lof_tolerant
835,chr22,36548353,36550153,-0.796912,False,lof_tolerant
836,chr22,44535320,44535920,-0.025900,False,lof_tolerant
837,chr22,48390588,48391588,-0.083336,False,lof_tolerant


In [36]:
# get low-lof-tolerance enhancers
def read_low_lof_tolerance_enhancers_intersect_chen_windows():
  """Load low-LoF-tolerance enhancers intersected with Chen windows, labeled as constrained.

  Reads the tab-separated file
  khurana/low-lof-tolerance-enhancers-intersect-chen-windows.bed under
  CONSTRAINT_TOOLS_DATA, removes fully duplicated rows, and keeps only the
  hg38 enhancer coordinates plus the negated Chen score of the intersecting
  window.

  Returns:
    DataFrame with columns chrom, start, end,
    negative_new_chen_score_window, constrained (always True) and
    tag (always 'low_lof_tolerance'). The index is the one left after
    de-duplication (it is not reset).
  """
  # download-process-data/khurana/README.md
  enhancer_columns = ['chrom_enhancer_hg38', 'start_enhancer_hg38', 'end_enhancer_hg38']
  annotation_columns = ['enhancer_hg19', 'unknown']
  window_columns = ['chrom_window', 'start_window', 'end_window']
  score_column = 'new_chen_score_window'

  # De-duplicate on the complete record (annotation and window columns
  # included) before any column is discarded.
  records = pd.read_csv(
    f"{CONSTRAINT_TOOLS_DATA}/khurana/low-lof-tolerance-enhancers-intersect-chen-windows.bed",
    sep='\t',
    names=enhancer_columns + annotation_columns + window_columns + [score_column],
  ).drop_duplicates()

  return (
    records[enhancer_columns]
    .rename(columns={
      'chrom_enhancer_hg38': 'chrom',
      'start_enhancer_hg38': 'start',
      'end_enhancer_hg38': 'end',
    })
    .assign(
      negative_new_chen_score_window=-records[score_column],
      constrained=True,
      tag='low_lof_tolerance',
    )
  )

read_low_lof_tolerance_enhancers_intersect_chen_windows().iloc[:10]

Unnamed: 0,chrom,start,end,negative_new_chen_score_window,constrained,tag
0,chr1,61587728,61589928,-0.884984,True,low_lof_tolerance
1,chr1,61587728,61589928,-3.667483,True,low_lof_tolerance
2,chr1,61587728,61589928,-1.787719,True,low_lof_tolerance
3,chr1,87355917,87357117,-1.174601,True,low_lof_tolerance
4,chr1,87355917,87357117,-2.971624,True,low_lof_tolerance
5,chr1,87355917,87357117,-1.59784,True,low_lof_tolerance
9,chr1,169941659,169943659,-0.732262,True,low_lof_tolerance
10,chr1,169941659,169943659,-0.494991,True,low_lof_tolerance
11,chr2,66069666,66071266,-1.628703,True,low_lof_tolerance
12,chr2,66069666,66071266,-1.923346,True,low_lof_tolerance


In [37]:
def aggregate_over_windows(df):
    """Collapse per-window rows into one row per enhancer.

    Rows are grouped on every column except
    'negative_new_chen_score_window'; for each group the minimum of that
    column and the number of non-null values are computed.

    Args:
        df: DataFrame containing a 'negative_new_chen_score_window' column
            plus the columns that identify an enhancer.

    Returns:
        DataFrame with the grouping columns followed by
        'min negative_new_chen_score_window' and
        'count negative_new_chen_score_window'.
    """
    score_column = 'negative_new_chen_score_window'
    key_columns = [name for name in df.columns.tolist() if name != score_column]
    summary = (
        df
        .groupby(key_columns)
        .agg({score_column: ['min', 'count']})
        .reset_index()
    )
    # Flatten the two-level header: (column, function) -> "function column";
    # grouping columns have an empty second level, which strip() removes.
    summary.columns = [
        ' '.join(reversed(name_parts)).strip()
        for name_parts in summary.columns.values
    ]
    return summary

aggregate_over_windows(read_lof_tolerant_enhancers_intersect_chen_windows())

Unnamed: 0,chrom,start,end,constrained,tag,min negative_new_chen_score_window,count negative_new_chen_score_window
0,chr1,831220,832820,False,lof_tolerant,-1.248459,1
1,chr1,1508220,1509820,False,lof_tolerant,-1.731383,2
2,chr1,8124140,8124940,False,lof_tolerant,-0.841769,1
3,chr1,12684990,12685190,False,lof_tolerant,-1.403148,2
4,chr1,16104305,16105505,False,lof_tolerant,0.845290,1
...,...,...,...,...,...,...,...
505,chr9,126908121,126908321,False,lof_tolerant,-1.462233,1
506,chr9,129765721,129766721,False,lof_tolerant,-1.356569,1
507,chr9,129766921,129767321,False,lof_tolerant,-1.356569,1
508,chr9,132748213,132748813,False,lof_tolerant,-2.111835,1


In [54]:
def create_labeled_enhancers():
  """Build, save and return the combined table of labeled enhancers.

  Each of the four enhancer sets is read and aggregated over windows, the
  results are stacked in a fixed order (disease, PacBio-deleted,
  LoF-tolerant, low-LoF-tolerance) and the index is renumbered. The class
  balance is printed and the table is written, tab-separated and without
  the index, to khurana/labeled-enhancers.bed under CONSTRAINT_TOOLS_DATA.

  Returns:
    The combined DataFrame.
  """
  readers = [
    read_disease_enhancers_intersect_chen_windows,
    read_pacbio_deleted_enhancers_intersect_chen_windows,
    read_lof_tolerant_enhancers_intersect_chen_windows,
    read_low_lof_tolerance_enhancers_intersect_chen_windows,
  ]
  labeled = pd.concat(
    [aggregate_over_windows(read()) for read in readers]
  ).reset_index(drop=True)

  print('number of enhancers that are constrained (True) or not (False):')
  print(labeled['constrained'].value_counts())

  labeled.to_csv(f"{CONSTRAINT_TOOLS_DATA}/khurana/labeled-enhancers.bed", sep='\t', index=False)
  return labeled

pd.set_option('display.max_rows', 10)

create_labeled_enhancers()

number of enhancers that are constrained (True) or not (False):
False    514
True     118
Name: constrained, dtype: int64


Unnamed: 0,chrom,start,end,constrained,tag,min negative_new_chen_score_window,count negative_new_chen_score_window
0,chr1,21346107,21348107,True,disease_enhancers,-3.432213,1
1,chr1,93909844,93911044,True,disease_enhancers,-2.645724,1
2,chr1,155293209,155293609,True,disease_enhancers,-4.190955,1
3,chr1,160032410,160033810,True,disease_enhancers,-2.719339,2
4,chr1,173915262,173915462,True,disease_enhancers,-1.790974,1
...,...,...,...,...,...,...,...
627,chr7,31361986,31364186,True,low_lof_tolerance,-1.247859,4
628,chr7,54568507,54569507,True,low_lof_tolerance,-1.270480,2
629,chr7,97011888,97013488,True,low_lof_tolerance,-0.365397,2
630,chr8,98415972,98417172,True,low_lof_tolerance,0.620860,1


## Intersect Khurana enhancers with homozygous deletions from TopMed 

In [152]:
import subprocess
import numpy as np 

def shell(cmd):
  """Run `cmd` through the system shell and return its standard output.

  Args:
    cmd: command line, passed to the shell as a single string.

  Returns:
    The command's stdout decoded as UTF-8, with leading and trailing
    whitespace stripped. This is returned even when the command fails.

  Warns:
    RuntimeWarning: when the command exits with a non-zero status. The
      warning text carries the exit status and the captured stderr, so a
      failed command is not silently mistaken for one that printed nothing.
  """
  import warnings  # local import keeps this block self-contained

  completed = subprocess.run(
    cmd,
    shell=True,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
  )
  if completed.returncode != 0:
    # stderr is only used for the diagnostic, so undecodable bytes are
    # replaced rather than allowed to raise.
    stderr_text = completed.stderr.decode("utf-8", errors="replace").strip()
    warnings.warn(
      f"command exited with status {completed.returncode}: {cmd}\n{stderr_text}",
      RuntimeWarning,
      stacklevel=2,
    )
  return completed.stdout.decode("utf-8").strip()

def collapse(ser):
  """Return the single distinct value held by `ser`.

  Raises:
    ValueError: from the unpacking, when `ser` does not contain exactly
      one distinct value.
  """
  [only_value] = set(ser)
  return only_value

def count(ser):
  """Return the number of entries in `ser`, or '.' for a lone '.' placeholder.

  A series consisting of exactly one '.' is passed through as '.'; any
  other series yields its length.
  """
  n_entries = len(ser)
  if n_entries != 1 or ser.iloc[0] != '.':
    return n_entries
  return '.'

def custom_max(ser):
  """Return the largest entry of `ser` as an integer, or '.' for a lone '.' placeholder.

  A series consisting of exactly one '.' is passed through as '.';
  otherwise every entry is converted with int() before the maximum is
  taken, so numeric strings are compared by value rather than as text.
  """
  if len(ser) != 1 or ser.iloc[0] != '.':
    return np.max(list(map(int, ser)))
  return '.'

def custom_list(ser):
  """Return the entries of `ser` as a list, or '.' for a lone '.' placeholder.

  A series consisting of exactly one '.' is passed through as '.'; any
  other series is returned as a plain Python list.
  """
  if len(ser) != 1 or ser.iloc[0] != '.':
    return list(ser)
  return '.'

def aggregate_over_deletions(df, group_columns, aggregation_functions):
  """Group `df` and aggregate it, returning a frame with flat column names.

  Args:
    df: DataFrame to aggregate.
    group_columns: column names to group on.
    aggregation_functions: mapping of column name to a list of aggregation
      functions, passed straight to the groupby's agg().

  Returns:
    DataFrame with the grouping columns followed by one column per
    (column, function) pair, named "<function> <column>".
  """
  flattened = df.groupby(group_columns).agg(aggregation_functions).reset_index()
  # Flatten the two-level header: (column, function) -> "function column";
  # grouping columns have an empty second level, which strip() removes.
  flattened.columns = [
    ' '.join(reversed(name_parts)).strip()
    for name_parts in flattened.columns.values
  ]
  return flattened

def read_enhancers_intersect_homozygous_deletions():
  """Intersect labeled enhancers with TopMed homozygous deletions and summarize per enhancer.

  Runs the intersect shell script (echoing the command and its output),
  reads khurana/labeled-enhancers-intersect-topmed-homozygous-deletions.bed
  from CONSTRAINT_TOOLS_DATA, and aggregates the deletion columns over each
  enhancer.

  Returns:
    DataFrame with one row per enhancer: the enhancer columns (coordinates
    renamed to chrom_enhancer / start_enhancer / end_enhancer) followed by
    one "<function> <column>" column per deletion aggregation.
  """
  script = f'bash {CONSTRAINT_TOOLS}/experiments/germline-model/chen-et-al-2022/intersect-labeled-enhancers-with-topmed-homozygous-deletions.sh'
  print(script)
  print(shell(script))

  # Columns that identify an enhancer; rows are grouped on these.
  enhancer_columns = [
    'chrom',
    'start',
    'end',
    'constrained',
    'tag',
    'min negative_new_chen_score_window',
    'count negative_new_chen_score_window',
  ]
  # Per-deletion columns and how each is summarized within an enhancer.
  deletion_aggregations = {
    'SVLEN': [custom_max],
    'SVTYPE': [collapse, count],
    'SV_ID': [custom_list],
    'HomAlt': [custom_max],
    'enhancer-deletion-overlap': [custom_max]
  }

  intersections = pd.read_csv(
    f"{CONSTRAINT_TOOLS_DATA}/khurana/labeled-enhancers-intersect-topmed-homozygous-deletions.bed",
    sep='\t',
  )
  # Keep only the columns taking part in the grouping or the aggregation.
  selected = intersections[enhancer_columns + list(deletion_aggregations)]
  summary = aggregate_over_deletions(selected, enhancer_columns, deletion_aggregations)

  # TODO:
  # add a column indicating whether enhancer overlaps a homozygous deletion
  return summary.rename(columns={
    'chrom': 'chrom_enhancer',
    'start': 'start_enhancer',
    'end': 'end_enhancer',
  })
  
read_enhancers_intersect_homozygous_deletions()

bash /scratch/ucgd/lustre-work/quinlan/u6018199/constraint-tools/experiments/germline-model/chen-et-al-2022/intersect-labeled-enhancers-with-topmed-homozygous-deletions.sh



Unnamed: 0,chrom_enhancer,start_enhancer,end_enhancer,constrained,tag,min negative_new_chen_score_window,count negative_new_chen_score_window,custom_max SVLEN,collapse SVTYPE,count SVTYPE,custom_list SV_ID,custom_max HomAlt,custom_max enhancer-deletion-overlap
0,chr1,831220,832820,False,lof_tolerant,-1.248459,1,2521,DEL,1,[DEL_1:831216-833736],99,1600
1,chr1,1508220,1509820,False,lof_tolerant,-1.731383,2,74900,DEL,3,"[DEL_1:1508186-1510388, DEL_1:1443801-1518700,...",262,1600
2,chr1,8124140,8124940,False,lof_tolerant,-0.841769,1,9600,DEL,2,"[DEL_1:8121301-8130900, DEL_1:8122347-8130816]",65,800
3,chr1,12684990,12685190,False,lof_tolerant,-1.403148,2,3222,DEL,1,[DEL_1:12683396-12686617],175,200
4,chr1,16104305,16105505,False,lof_tolerant,0.845290,1,.,.,.,.,.,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
627,chr9,126908121,126908321,False,lof_tolerant,-1.462233,1,.,.,.,.,.,0
628,chr9,129765721,129766721,False,lof_tolerant,-1.356569,1,5100,DEL,4,"[DEL_9:129763101-129768200, DEL_9:129764101-12...",116,1000
629,chr9,129766921,129767321,False,lof_tolerant,-1.356569,1,5100,DEL,4,"[DEL_9:129763101-129768200, DEL_9:129764101-12...",116,400
630,chr9,132748213,132748813,False,lof_tolerant,-2.111835,1,4608,DEL,1,[DEL_9:132747349-132751956],36,600


## Using the Chen z-score to predict whether an enhancer is critical, and improving that prediction using homozygous deletions

In [None]:
# TODO 
# 2. for each score threshold
#   i. predict class (constrained vs non-constrained), and compute contingency table
#   ii. create a new contingency table in which examples that Chen predicts to be constrained,
#       but which overlap homozygous deletions, are instead predicted to be unconstrained
# 3. repeat for all thresholds, and measure AUROC or AUPRC, both for original contingency tables and altered ones 

# Under the assumption that deletions can identify false-positive calls of constraint, one would expect performance to improve

## Avoiding data leakage by defining random enhancers to be unconstrained 

## Using the Chen z-score to predict whether an SNV is deleterious, and improving that prediction using homozygous deletions

In [None]:
# TODO 
# 0. resources: 
#    experiments/germline-model/chen-et-al-2022/pathogenic_variant_enrichment*.ipynb
#    experiments/germline-model/chen-et-al-2022/*clin*
#    download-process-data/noncoding-variants-associated-with-Mendelian-traits
#    download-process-data/compute-trustworthy-exon-adjacent-windows.sh
#    download-process-data/clinvar-noncoding-with-negative-controls
# 1. intersect clinvar variants with chen windows (with chen scores), 
#    instead of the other way around (as I previously did when testing for enrichment of clinvar variants in constrained windows)
# 2. this gives a single chen score to each clinvar variant 
# 3. do the same for a set of gnomad variants of equal number 
# 4. we now have a positive and negative set of SNVs, which we can use to draw a ROC curve, by thresholding the chen score
# 5. next, as above, we can declare those variants that overlap homozygous deletions as false positives, and recompute the ROC curve