## Get a set of enhancers known to be constrained and a set assumed to be unconstrained, together with their Chen scores 

In [1]:
# Root of the constraint-tools code checkout; used below to locate the shell script
# that intersects labeled enhancers with TopMed homozygous deletions.
# NOTE(review): absolute, user-specific cluster path -- the notebook only runs where this exists.
CONSTRAINT_TOOLS = '/scratch/ucgd/lustre-work/quinlan/u6018199/constraint-tools'
# Root of the shared data directory; every khurana/*.bed file read or written below lives under it.
CONSTRAINT_TOOLS_DATA = '/scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools'

In [2]:
import pandas as pd

def read_disease_enhancers_intersect_chen_windows():
  """Load disease enhancers intersected with Chen windows, labeled as truly constrained.

  Provenance of the input file: download-process-data/khurana/README.md

  Returns a DataFrame with one row per distinct input record, holding the enhancer
  coordinates (chrom, start, end), the sign-flipped window score
  (negative_new_chen_score_window), 'truly constrained' (always True) and
  tag (always 'disease_enhancers').
  """
  input_columns = [
    'chrom_enhancer', 'start_enhancer', 'end_enhancer',
    'disease', 'enhancer_predicted_LoF_tolerance_prob', 'enhancer_predicted_LoF_tolerance_status',
    'chrom_window', 'start_window', 'end_window',
    'new_chen_score_window'
  ]
  enhancer_coordinates, discarded_columns = input_columns[:3], input_columns[3:]
  records = pd.read_csv(
    f"{CONSTRAINT_TOOLS_DATA}/khurana/disease-enhancers-intersect-chen-windows.bed",
    sep='\t',
    names=input_columns,
  )
  return (
    records
    # de-duplicate on the full record, i.e. before any column is discarded
    .drop_duplicates()
    .assign(negative_new_chen_score_window=lambda frame: -frame['new_chen_score_window'])
    .drop(columns=discarded_columns)
    .rename(columns=dict(zip(enhancer_coordinates, ['chrom', 'start', 'end'])))
    .assign(**{'truly constrained': True, 'tag': 'disease_enhancers'})
  )

read_disease_enhancers_intersect_chen_windows()

Unnamed: 0,chrom,start,end,negative_new_chen_score_window,truly constrained,tag
0,chr1,21346107,21348107,-3.432213,True,disease_enhancers
1,chr1,93909844,93911044,-2.645724,True,disease_enhancers
2,chr1,155293209,155293609,-4.190955,True,disease_enhancers
3,chr1,160032410,160033810,-2.719339,True,disease_enhancers
4,chr1,160032410,160033810,-0.167476,True,disease_enhancers
...,...,...,...,...,...,...
116,chr20,46114761,46114961,0.241142,True,disease_enhancers
117,chr22,28787612,28789612,2.138137,True,disease_enhancers
118,chr22,28787612,28789612,0.488883,True,disease_enhancers
119,chr22,28787612,28789612,1.762075,True,disease_enhancers


In [3]:
# get low-lof-tolerance enhancers
def read_low_lof_tolerance_enhancers_intersect_chen_windows():
  """Load low-LoF-tolerance enhancers intersected with Chen windows, labeled as truly constrained.

  Provenance of the input file: download-process-data/khurana/README.md

  Returns a DataFrame with one row per distinct input record, holding the hg38
  enhancer coordinates (chrom, start, end), the sign-flipped window score
  (negative_new_chen_score_window), 'truly constrained' (always True) and
  tag (always 'low_lof_tolerance').
  """
  input_columns = [
    'chrom_enhancer_hg38', 'start_enhancer_hg38', 'end_enhancer_hg38',
    'enhancer_hg19', 'unknown',
    'chrom_window', 'start_window', 'end_window',
    'new_chen_score_window'
  ]
  enhancer_coordinates, discarded_columns = input_columns[:3], input_columns[3:]
  records = pd.read_csv(
    f"{CONSTRAINT_TOOLS_DATA}/khurana/low-lof-tolerance-enhancers-intersect-chen-windows.bed",
    sep='\t',
    names=input_columns,
  )
  return (
    records
    # de-duplicate on the full record, i.e. before any column is discarded
    .drop_duplicates()
    .assign(negative_new_chen_score_window=lambda frame: -frame['new_chen_score_window'])
    .drop(columns=discarded_columns)
    .rename(columns=dict(zip(enhancer_coordinates, ['chrom', 'start', 'end'])))
    .assign(**{'truly constrained': True, 'tag': 'low_lof_tolerance'})
  )

read_low_lof_tolerance_enhancers_intersect_chen_windows().iloc[:10]

Unnamed: 0,chrom,start,end,negative_new_chen_score_window,truly constrained,tag
0,chr1,61587728,61589928,-0.884984,True,low_lof_tolerance
1,chr1,61587728,61589928,-3.667483,True,low_lof_tolerance
2,chr1,61587728,61589928,-1.787719,True,low_lof_tolerance
3,chr1,87355917,87357117,-1.174601,True,low_lof_tolerance
4,chr1,87355917,87357117,-2.971624,True,low_lof_tolerance
5,chr1,87355917,87357117,-1.59784,True,low_lof_tolerance
9,chr1,169941659,169943659,-0.732262,True,low_lof_tolerance
10,chr1,169941659,169943659,-0.494991,True,low_lof_tolerance
11,chr2,66069666,66071266,-1.628703,True,low_lof_tolerance
12,chr2,66069666,66071266,-1.923346,True,low_lof_tolerance


In [10]:
def read_enhancers_intersect_chen_windows():
  """Load all enhancers intersected with Chen windows, labeled as not truly constrained.

  Provenance of the input file: download-process-data/khurana/README.md

  Returns a DataFrame with one row per distinct input record, holding the enhancer
  coordinates (chrom, start, end), the sign-flipped window score
  (negative_new_chen_score_window), 'truly constrained' (always False) and
  tag (always '.').
  """
  input_columns = [
    'chrom_enhancer', 'start_enhancer', 'end_enhancer',
    'enhancer_deletion_status', 'enhancer_predicted_LoF_tolerance_status', 'enhancer_predicted_LoF_tolerance_prob',
    'chrom_window', 'start_window', 'end_window',
    'new_chen_score_window'
  ]
  enhancer_coordinates, discarded_columns = input_columns[:3], input_columns[3:]
  records = pd.read_csv(
    f"{CONSTRAINT_TOOLS_DATA}/khurana/enhancers-intersect-chen-windows.bed",
    sep='\t',
    names=input_columns,
  )
  return (
    records
    # de-duplicate on the full record, i.e. before any column is discarded
    .drop_duplicates()
    .assign(negative_new_chen_score_window=lambda frame: -frame['new_chen_score_window'])
    .drop(columns=discarded_columns)
    .rename(columns=dict(zip(enhancer_coordinates, ['chrom', 'start', 'end'])))
    .assign(**{'truly constrained': False, 'tag': '.'})
  )

read_enhancers_intersect_chen_windows()

Unnamed: 0,chrom,start,end,negative_new_chen_score_window,truly constrained,tag
0,chr1,794220,794420,-4.034545,False,.
1,chr1,794620,795020,-4.034545,False,.
2,chr1,794620,795020,-4.072486,False,.
3,chr1,803820,804220,-3.779493,False,.
4,chr1,803820,804220,-0.767485,False,.
...,...,...,...,...,...,...
351771,chr22,50739172,50740572,-2.476451,False,.
351772,chr22,50748972,50749972,-2.543021,False,.
351773,chr22,50754572,50754772,0.087238,False,.
351774,chr22,50772572,50772972,-2.204285,False,.


In [11]:
def aggregate_over_windows(df):
  """Collapse per-window rows into one row per group of identical non-score columns.

  Rows are grouped on every column except 'negative_new_chen_score_window'. For each
  group the minimum score and the number of scores are reported in the columns
  'min negative_new_chen_score_window' and 'count negative_new_chen_score_window'.
  """
  score = 'negative_new_chen_score_window'
  keys = [name for name in df.columns.tolist() if name != score]
  summary = df.groupby(keys).agg({score: ['min', 'count']}).reset_index()
  # flatten the two-level header: (score, 'min') -> 'min <score>', ('chrom', '') -> 'chrom'
  summary.columns = [' '.join(reversed(pair)).strip() for pair in summary.columns.values]
  return summary

aggregate_over_windows(read_enhancers_intersect_chen_windows())

Unnamed: 0,chrom,start,end,truly constrained,tag,min negative_new_chen_score_window,count negative_new_chen_score_window
0,chr1,794220,794420,False,.,-4.034545,1
1,chr1,794620,795020,False,.,-4.072486,2
2,chr1,803820,804220,False,.,-3.779493,2
3,chr1,804820,805620,False,.,-3.761746,2
4,chr1,828420,830620,False,.,-2.366455,2
...,...,...,...,...,...,...,...
201406,chr9,138037348,138037548,False,.,-1.328952,1
201407,chr9,138037948,138038948,False,.,-2.355119,2
201408,chr9,138040548,138041148,False,.,-2.570888,2
201409,chr9,138052748,138054748,False,.,-5.088021,2


In [21]:
import subprocess

def shell(cmd):
  completed_process = subprocess.run(
    cmd,
    shell=True,
    executable='/usr/bin/bash',  # default shell is /bin/sh, but we need bash for <()
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
  ) 
  if completed_process.returncode == 0: 
    return completed_process.stdout.decode("utf-8").strip()
  else: 
    raise Exception(completed_process.stderr.decode("utf-8").strip())
  
def check_enhancers_non_overlapping():
  """Check that no two records in khurana/labeled-enhancers.2.bed intersect each other.

  The file (minus its first line, which is skipped with `tail -n +2`) is intersected
  with itself using bedtools. If the number of output lines equals the number of
  records, a confirmation is printed; otherwise an Exception is raised.
  The two counts are compared as the strings returned by `wc -l`.
  """
  bed_path = f'{CONSTRAINT_TOOLS_DATA}/khurana/labeled-enhancers.2.bed'
  records = f'tail -n +2 {bed_path}'
  number_self_intersections = shell(
    f'bedtools intersect -a <({records}) -b <({records}) -wao | wc -l'
  )
  number_lines = shell(f'{records} | wc -l')
  if number_self_intersections == number_lines:
    print(f'enhancers ({number_self_intersections}) are non-overlapping')
    return
  raise Exception(f'number of self-intersections ({number_self_intersections}) != number of lines ({number_lines})')


In [30]:
def create_labeled_enhancers(random_state=0):
  """Build, save and return a balanced set of labeled enhancers.

  Positive examples ('truly constrained' == True) are the disease enhancers and the
  low-LoF-tolerance enhancers. Negative examples are an equally sized random sample
  of all enhancers, re-tagged 'randomly sampled', under the assumption that most
  enhancers are not constrained.

  Parameters:
    random_state: seed forwarded to `DataFrame.sample`, so that re-running the cell
      writes the same negative examples to disk. Pass None for an unseeded draw, or
      a different seed if the non-overlap check below fails for the sampled set.

  Returns:
    DataFrame with one row per enhancer and a trailing 'enhancer_id' column (0..n-1).

  Side effects:
    Prints the label counts, writes the table to khurana/labeled-enhancers.2.bed
    (tab-separated, with header), then runs check_enhancers_non_overlapping(), which
    raises if the written enhancers intersect one another.
  """
  positive_examples = pd.concat([
    aggregate_over_windows(read_disease_enhancers_intersect_chen_windows()),
    aggregate_over_windows(read_low_lof_tolerance_enhancers_intersect_chen_windows()),
  ])

  # assume most enhancers are not constrained:
  all_enhancers = aggregate_over_windows(read_enhancers_intersect_chen_windows())
  negative_examples = (
    all_enhancers
    .sample(n=len(positive_examples), random_state=random_state)
    .assign(tag='randomly sampled')
  )

  df = pd.concat([positive_examples, negative_examples]).reset_index(drop=True)
  # the fresh 0..n-1 index doubles as the enhancer identifier (kept as the last column)
  df['enhancer_id'] = df.index

  print('number of enhancers that are truly constrained (True) or not (False):')
  print(df['truly constrained'].value_counts())

  df.to_csv(f"{CONSTRAINT_TOOLS_DATA}/khurana/labeled-enhancers.2.bed", sep='\t', index=False)
  check_enhancers_non_overlapping()

  return df

pd.set_option('display.max_rows', 10)

create_labeled_enhancers()

number of enhancers that are truly constrained (True) or not (False):
True     118
False    118
Name: truly constrained, dtype: int64
enhancers (236) are non-overlapping


Unnamed: 0,chrom,start,end,truly constrained,tag,min negative_new_chen_score_window,count negative_new_chen_score_window,enhancer_id
0,chr1,21346107,21348107,True,disease_enhancers,-3.432213,1,0
1,chr1,93909844,93911044,True,disease_enhancers,-2.645724,1,1
2,chr1,155293209,155293609,True,disease_enhancers,-4.190955,1,2
3,chr1,160032410,160033810,True,disease_enhancers,-2.719339,2,3
4,chr1,173915262,173915462,True,disease_enhancers,-1.790974,1,4
...,...,...,...,...,...,...,...,...
231,chr2,75157274,75158874,False,randomly sampled,1.461836,1,231
232,chr7,51219303,51219703,False,randomly sampled,-0.775471,1,232
233,chr19,14818188,14818588,False,randomly sampled,-1.515543,1,233
234,chr7,151991915,151993115,False,randomly sampled,-3.060102,2,234


## Intersect Khurana enhancers with homozygous deletions from TopMed 

In [None]:
import numpy as np 

def collapse(ser):
  """Return the one distinct value in `ser`.

  Raises ValueError (from the unpacking) unless `ser` holds exactly one distinct value.
  """
  distinct_values = set(ser)
  (only_value,) = distinct_values
  return only_value

def count(ser):
  """Return the number of entries in `ser`, or '.' if `ser` is just the single placeholder '.'."""
  is_placeholder = len(ser) == 1 and ser.iloc[0] == '.'
  return '.' if is_placeholder else len(ser)

def custom_max(ser):
  """Return the largest entry of `ser` after casting each to int, or '.' if `ser` is just the single placeholder '.'."""
  if len(ser) != 1 or ser.iloc[0] != '.':
    as_integers = list(map(int, list(ser)))
    return np.max(as_integers)
  return '.'

def custom_list(ser):
  """Return the entries of `ser` as a list, or '.' if `ser` is just the single placeholder '.'."""
  is_placeholder = len(ser) == 1 and ser.iloc[0] == '.'
  if is_placeholder:
    return '.'
  return [value for value in ser]

def aggregate_over_deletions(df, group_columns, aggregation_functions):
  """Group `df` on `group_columns` and apply `aggregation_functions`, returning flat column names.

  `aggregation_functions` maps a column name to a list of aggregators. Each aggregated
  column is named '<aggregator name> <column name>'; group columns keep their names.
  """
  summary = df.groupby(group_columns).agg(aggregation_functions).reset_index()
  # flatten the two-level header: (column, aggregator) -> 'aggregator column'
  summary.columns = [' '.join(reversed(pair)).strip() for pair in summary.columns.values]
  return summary

def read_enhancers_intersect_homozygous_deletions():
  """Intersect labeled enhancers with TopMed homozygous deletions and summarise per enhancer.

  Runs the intersection shell script, reads the resulting
  khurana/labeled-enhancers-1-intersect-topmed-homozygous-deletions.bed, and aggregates
  its rows per enhancer. The returned DataFrame has the enhancer columns (coordinates
  renamed to chrom_enhancer/start_enhancer/end_enhancer), the aggregated SVLEN, SVTYPE
  and HomAlt columns, and a boolean column 'enhancer is homozygously deleted in topmed'
  that is True where the collapsed SVTYPE equals 'DEL'. Also prints that column's counts.

  NOTE(review): the script and file are named 'labeled-enhancers-1', whereas
  create_labeled_enhancers writes 'labeled-enhancers.2.bed' -- confirm the intended input.
  """
  # regenerate the intersection file before reading it
  shell(f'bash {CONSTRAINT_TOOLS}/experiments/germline-model/chen-et-al-2022/intersect-labeled-enhancers-1-with-topmed-homozygous-deletions.sh')
  intersections = pd.read_csv(
    f"{CONSTRAINT_TOOLS_DATA}/khurana/labeled-enhancers-1-intersect-topmed-homozygous-deletions.bed",
    sep='\t',
  )

  enhancer_columns = [
    'chrom', 'start', 'end',
    'truly constrained', 'tag',
    'min negative_new_chen_score_window', 'count negative_new_chen_score_window',
    'enhancer_id'
  ]
  # currently disabled aggregations: 'SV_ID': [custom_list], 'enhancer-deletion-overlap': [custom_max]
  deletion_aggregations = {
    'SVLEN': [custom_max],
    'SVTYPE': [collapse, count],
    'HomAlt': [custom_max],
  }

  relevant = intersections[enhancer_columns + list(deletion_aggregations)]
  per_enhancer = aggregate_over_deletions(
    relevant, enhancer_columns, deletion_aggregations
  ).rename(columns={
    'chrom': 'chrom_enhancer',
    'start': 'start_enhancer',
    'end': 'end_enhancer',
  })

  deleted = 'enhancer is homozygously deleted in topmed'
  per_enhancer[deleted] = per_enhancer['collapse SVTYPE'] == 'DEL'
  print('number of enhancers that are homozygously deleted in TopMed (True) or not (False):')
  print(per_enhancer[deleted].value_counts())
  return per_enhancer
  
ENHANCERS = read_enhancers_intersect_homozygous_deletions()
ENHANCERS

## Using Chen zscore to predict whether an enhancer is critical or not, and improving that prediction using homozygous deletions

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def plot_zscore_distributions(df):
  """Plot histograms (with KDE) of 'min negative_new_chen_score_window', split by 'truly constrained'."""
  ax = sns.histplot(
    data=df,
    x='min negative_new_chen_score_window',
    hue='truly constrained',
    bins=25,
    kde=True,
  )
  ax.set(xlabel='Chen z-score of enhancer', ylabel='Number of enhancers')
  ax.get_legend().set_title('Enhancer truly constrained?')
  plt.gcf().set_size_inches(10, 5)

plot_zscore_distributions(ENHANCERS)

In [None]:
def predict_constraint_without_deletions(df, zscore_threshold):
  """Predict an enhancer to be constrained when its score is below `zscore_threshold`.

  Returns a new DataFrame with the score column, 'truly constrained', and the boolean
  'predicted to be constrained'; `df` itself is not modified.
  """
  score = 'min negative_new_chen_score_window'
  predictions = df.loc[:, [score, 'truly constrained']].copy()
  predictions['predicted to be constrained'] = predictions[score].lt(zscore_threshold)
  return predictions

predict_constraint_without_deletions(ENHANCERS, zscore_threshold=-2)

In [None]:
def predict_constraint_using_deletions(df, zscore_threshold):
  """Predict an enhancer to be constrained when its score is below `zscore_threshold`
  and it is not homozygously deleted in TopMed.

  Returns a new DataFrame with the score column, 'truly constrained',
  'enhancer is homozygously deleted in topmed', and the boolean
  'predicted to be constrained'; `df` itself is not modified.
  """
  score = 'min negative_new_chen_score_window'
  deleted = 'enhancer is homozygously deleted in topmed'
  predictions = df.loc[:, [score, 'truly constrained', deleted]].copy()
  below_threshold = predictions[score].lt(zscore_threshold)
  not_deleted = predictions[deleted].eq(False)
  predictions['predicted to be constrained'] = below_threshold & not_deleted
  return predictions

pd.set_option('display.max_rows', 10)

predict_constraint_using_deletions(ENHANCERS, zscore_threshold=-2)

In [None]:
def compute_precision_recall(df, zscore_threshold, predict_constraint, log=False):
  """Compute precision and recall of a constraint predictor at one threshold.

  Parameters:
    df: table passed through to `predict_constraint`.
    zscore_threshold: threshold passed through to `predict_constraint`.
    predict_constraint: callable (df, zscore_threshold) -> DataFrame that contains the
      boolean columns 'predicted to be constrained' and 'truly constrained'.
    log: if True, print the threshold and the contingency table.

  Returns:
    (precision, recall). A ratio whose denominator is zero (nothing predicted to be
    constrained, or nothing truly constrained) is returned as NaN.
  """
  df = predict_constraint(df, zscore_threshold)

  # Force a full 2x2 table: at extreme thresholds every enhancer gets the same
  # prediction, so crosstab alone would lack a row and the lookups below would fail.
  contingency_table = pd.crosstab(
    df['predicted to be constrained'],
    df['truly constrained']
  ).reindex(index=[False, True], columns=[False, True], fill_value=0)

  if log:
    print(zscore_threshold)
    print(contingency_table)

  # rows are predictions, columns are true labels
  tp = contingency_table.loc[True, True]
  fp = contingency_table.loc[True, False]
  fn = contingency_table.loc[False, True]
  precision = tp / (tp + fp) if (tp + fp) > 0 else float('nan')
  recall = tp / (tp + fn) if (tp + fn) > 0 else float('nan')

  return precision, recall
  
compute_precision_recall(ENHANCERS, zscore_threshold=-2, predict_constraint=predict_constraint_without_deletions, log=True)

In [None]:
def plot_precision_recall(df, predict_constraint, label):
  """Draw a precision-recall curve for `predict_constraint` on the current matplotlib axes.

  Precision and recall are computed at each threshold in np.arange(-4.5, 6.5, 0.1);
  recall is plotted on x and precision on y, labelled with `label`.
  """
  precisions, recalls = [], []
  for zscore_threshold in np.arange(-4.5, 6.5, 0.1):
    precision, recall = compute_precision_recall(df, zscore_threshold, predict_constraint)
    precisions.append(precision)
    recalls.append(recall)
  plt.plot(recalls, precisions, label=label)

def plot_precision_recall_wrapper(df):
  """Overlay the precision-recall curves of both predictors (with and without TopMed deletions)."""
  curves = [
    (predict_constraint_without_deletions, 'without using topmed deletions'),
    (predict_constraint_using_deletions, 'using topmed deletions'),
  ]
  for predictor, curve_label in curves:
    plot_precision_recall(df, predictor, label=curve_label)
  plt.xlabel('Recall')
  plt.ylabel('Precision')
  plt.legend()
  plt.gcf().set_size_inches(10, 5)
  plt.title('Predicting enhancer constraint')

plot_precision_recall_wrapper(ENHANCERS)

## This test of the approach is weak

Labels are leaking to predictors because most of the enhancers that are homozygously deleted in 1000G (label) are also homozygously deleted in TopMed (predictor): 

In [None]:
from matplotlib_venn import venn2
import matplotlib.pyplot as plt
import matplotlib as mpl

# Set the font size
mpl.rcParams['font.size'] = 20

def get_enhancers_deleted_in_topmed(df):
  """Return the enhancer_id values of rows flagged 'enhancer is homozygously deleted in topmed'."""
  is_deleted = df['enhancer is homozygously deleted in topmed']
  return df.loc[is_deleted, 'enhancer_id'].tolist()

def get_enhancers_deleted_in_1000_genomes(df):
  """Return the enhancer_id values of rows whose tag is 'lof_tolerant'.

  NOTE(review): none of the tags assigned elsewhere in this notebook is 'lof_tolerant';
  presumably it comes from the labeled-enhancers-1 input -- confirm.
  """
  has_tag = df['tag'].eq('lof_tolerant')
  return df.loc[has_tag, 'enhancer_id'].tolist()

def plot_venn_diagram():
  """Draw a two-set Venn diagram of the enhancers in ENHANCERS deleted in TopMed vs. in 1000G."""
  deleted_in_topmed = set(get_enhancers_deleted_in_topmed(ENHANCERS))
  deleted_in_1000_genomes = set(get_enhancers_deleted_in_1000_genomes(ENHANCERS))
  venn2(
    [deleted_in_topmed, deleted_in_1000_genomes],
    set_labels=('enhancers homozygously deleted in TopMed', 'enhancers homozygously deleted in 1000G')
  )
  figure = plt.gcf()
  figure.set_size_inches(10, 5)
  figure.set_facecolor('white')

plot_venn_diagram()

## Next steps 

Circumvent the problem of labels leaking to predictors by not relying upon homozygous deletions to label the enhancers.
Instead label a set of randomly chosen enhancers as not constrained, under the assumption that most enhancers are not constrained. 
Then use the Chen zscore, with and without homozygous deletions, to predict whether an enhancer is constrained or not.