In [21]:
# Root of the constraint-tools code checkout; helper scripts and the `utilities`
# package used below are addressed relative to it.
CONSTRAINT_TOOLS = '/scratch/ucgd/lustre-work/quinlan/u6018199/constraint-tools'
# Root of the shared data directory that the cells below read from and write to.
CONSTRAINT_TOOLS_DATA = '/scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools'

import sys
# Make modules under {CONSTRAINT_TOOLS}/utilities importable
# (presumably where the `shell` helper imported below lives -- confirm).
sys.path.append(f'{CONSTRAINT_TOOLS}/utilities')

# Window width handed to `bedtools makewindows -w` when the large windows are built.
LARGE_WINDOW_SIZE = 10000
# BED file that receives the generated large windows.
LARGE_WINDOW_FILENAME = f'{CONSTRAINT_TOOLS_DATA}/benchmark-genome-wide-predictions/chen-et-al-2022/large-windows.bed'

## Make large windows

In [5]:
from shell import shell 

def make_large_windows():
  """Tile the genome into fixed-width windows and write them to LARGE_WINDOW_FILENAME.

  Runs `bedtools makewindows` on the hg38 chromosome-sizes file with a window
  width of LARGE_WINDOW_SIZE and redirects the output to LARGE_WINDOW_FILENAME.
  """
  genome_file = f'{CONSTRAINT_TOOLS_DATA}/reference/grch38/chromosome-sizes/hg38.chrom.sizes.sorted'
  shell(
    f'bedtools makewindows'
    f' -g {genome_file}'
    f' -w {LARGE_WINDOW_SIZE}'
    f' > {LARGE_WINDOW_FILENAME}'
  )


make_large_windows()




## Compute characteristics of overlapping deletions per large window 

In [24]:
import pandas as pd 

# Intersect the large windows with TopMed deletions (het and homalt alike); the
# helper script also filters out suspect windows. Sex chromosomes are dropped here.
def read_large_windows_intersect_all_deletions():
  """Run the intersect-and-filter script, then load its output without chrX/chrY.

  Returns:
    DataFrame read from filtered-large-windows-with-deletions.bed, minus the rows
    whose `chrom_window` is 'chrX' or 'chrY'. The original row index is kept.
  """
  script = f'{CONSTRAINT_TOOLS}/experiments/germline-model/chen-et-al-2022/intersect-large-windows-with-topmed-deletions-and-filter.sh'
  print(shell(f'bash {script}'))

  bed_path = f"{CONSTRAINT_TOOLS_DATA}/benchmark-genome-wide-predictions/chen-et-al-2022/filtered-large-windows-with-deletions.bed"
  windows = pd.read_csv(bed_path, sep='\t')

  on_chrX = windows['chrom_window'] == 'chrX'
  on_chrY = windows['chrom_window'] == 'chrY'
  return windows[~(on_chrX | on_chrY)]


# Let pandas render up to 1000 rows before truncating the displayed frame.
pd.set_option('display.max_rows', 1000)

LARGE_WINDOWS_WITH_DELETIONS = read_large_windows_intersect_all_deletions()
LARGE_WINDOWS_WITH_DELETIONS

[0;36mWrote (filtered) large windows with intersecting topmed deletions to: [0m/scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools/benchmark-genome-wide-predictions/chen-et-al-2022/filtered-large-windows-with-deletions.bed



Unnamed: 0,chrom_window,start_window,end_window,number_of_overlapping_topmed_deletions,chrom_merged_deletion,start_merged_deletion,end_merged_deletion,window_merged_deletion_overlap
0,chr1,2300000,2310000,0,.,-1,-1,0
1,chr1,2310000,2320000,5,chr1,2311495,2335000,8505
2,chr1,2320000,2330000,4,chr1,2311495,2335000,10000
3,chr1,2330000,2340000,2,chr1,2311495,2335000,5000
4,chr1,2340000,2350000,3,chr1,2340700,2356411,9300
...,...,...,...,...,...,...,...,...
262886,chr22,49050000,49060000,5,chr22,39415437,49170200,10000
262887,chr22,49060000,49070000,7,chr22,39415437,49170200,10000
262888,chr22,49070000,49080000,4,chr22,39415437,49170200,10000
262889,chr22,49080000,49090000,2,chr22,39415437,49170200,10000


## Get small windows with constraint labels, and Chen variables 

In [None]:
# TODO 

def compute_N_mean_null_chen(row):
    """Recover the null-model mean count implied by a window's Chen z-score.

    Solves the quadratic

        N_mean**2 - (2*N_observed + z**2) * N_mean + N_observed**2 = 0

    which is what  z**2 * N_mean = (N_mean - N_observed)**2  expands to, and
    returns the larger root when z > 0 and the smaller root otherwise.
    NOTE(review): this presumes z = (N_mean - N_observed) / sqrt(N_mean) --
    confirm against the Chen et al. definition of the z-score.

    Args:
        row: mapping/Series with keys 'N_observed' and 'new chen zscore'.

    Returns:
        The selected root of the quadratic (equals N_observed when z == 0).
    """
    # Imported locally: the notebook only imports numpy in a later cell, so a
    # fresh top-to-bottom run would otherwise hit a NameError on `np` here.
    import numpy as np

    z = row['new chen zscore']
    n_observed = row['N_observed']

    a = 1
    b = -(2*n_observed + z**2)
    c = n_observed**2
    # Mathematically equal to z**2 * (4*N_observed + z**2), hence never negative
    # for a non-negative observed count.
    discriminant = b**2 - 4*a*c

    # z > 0 selects the root above N_observed, z < 0 the root below it.
    root_sign = 1 if z > 0 else -1
    return (-b + root_sign*np.sqrt(discriminant))/(2*a)
    

def get_noncoding_nonenhancer_windows():
  """Load the window-level enhancer-characteristics table.

  NOTE(review): the `return df` right after `pd.read_csv` exits the function, so
  every statement below it (N_mean_null_chen, the noncoding/nonenhancer filters,
  column selection/renaming, sampling and labelling) is currently unreachable and
  the raw table is returned unchanged. `create_and_save_labeled_regions` reads a
  'truly constrained' column from the combined frame, which only the unreachable
  part adds -- confirm whether the early return is a temporary debugging aid.
  """
  # this bed file has already been aggregated to the window level: 
  # experiments/germline-model/chen-et-al-2022/enhancer-characteristics-enrichment.ipynb 
  filename = f'{CONSTRAINT_TOOLS_DATA}/benchmark-genome-wide-predictions/chen-et-al-2022/enhancer-characteristics-enrichment.bed'
  df = pd.read_csv(filename, sep='\t')
  # NOTE(review): early return -- nothing below this line executes.
  return df 

  # TODO 
  # Adds the null-model mean per window; note the column is not among those kept
  # by the column selection further down.
  df['N_mean_null_chen'] = df.apply(compute_N_mean_null_chen, axis=1)

  df = df[df['window overlaps merged_exon'] == False] # noncoding
  df = df[df['window overlaps enhancer'] == False] # nonenhancer

  df = df.drop_duplicates() 
  df = df[[
    'chromosome', 'start', 'end',
    'negative new chen zscore'
  ]]
  # Rename to the column names used for the labeled regions downstream.
  df = df.rename(columns={
    'chromosome': 'chrom', 
    'negative new chen zscore': 'min negative_new_chen_score_window'
  }) # type: ignore
  # NUMBER_NEGATIVE_EXAMPLES is not defined in the visible cells -- presumably set
  # elsewhere in the notebook; verify. random_state=42 makes the sample repeatable.
  df = df.sample(NUMBER_NEGATIVE_EXAMPLES, random_state=42) # assume most such windows are nonconstrained
  df['truly constrained'] = False
  df['tag'] = 'random_noncoding_nonenhancer'
  # Each negative example is a single window, hence a window count of 1.
  df['count negative_new_chen_score_window'] = 1
  return df

# Let pandas render up to 100 columns before truncating the displayed frame.
pd.set_option('display.max_columns', 100)

# Display the returned table (currently the raw, unfiltered one -- see NOTE above).
get_noncoding_nonenhancer_windows()

In [None]:
def check_regions_non_overlapping():
  """Verify that the saved labeled regions do not overlap one another.

  Intersects the labeled-regions file (first line skipped) with itself via
  `bedtools intersect -wao` and compares the number of output lines with the
  number of regions; the counts are expected to agree only when every region
  intersects nothing but itself. The comparison is done on whatever `shell`
  returns for the two `wc -l` pipelines.

  Raises:
    Exception: if the two counts differ.
  """
  regions_file = f'{CONSTRAINT_TOOLS_DATA}/khurana/{LABELED_REGIONS}'
  without_header = f'tail -n +2 {regions_file}'

  number_self_intersections = shell(
    f'bedtools intersect -a <({without_header}) -b <({without_header}) -wao | wc -l'
  )
  number_lines = shell(f'{without_header} | wc -l')

  if number_self_intersections == number_lines:
    print(f'regions ({number_self_intersections}) are non-overlapping')
    return
  raise Exception(f'number of self-intersections ({number_self_intersections}) != number of lines ({number_lines})')

def create_and_save_labeled_regions():
  """Build the labeled-region table, write it to disk, and sanity-check it.

  Positive examples are the window-aggregated disease enhancers and
  low-LoF-tolerance enhancers; negative examples come from
  `get_noncoding_nonenhancer_windows`. A running `region_id` is appended as the
  last column, the table is written tab-separated to the khurana data directory,
  and `check_regions_non_overlapping` is run on the saved file.

  Returns:
    The combined DataFrame that was written to disk.
  """
  constrained_examples = pd.concat([
    aggregate_over_windows(read_disease_enhancers_intersect_chen_windows()),
    aggregate_over_windows(read_low_lof_tolerance_enhancers_intersect_chen_windows())
  ])
  unconstrained_examples = get_noncoding_nonenhancer_windows()

  regions = (
    pd.concat([constrained_examples, unconstrained_examples])
    .reset_index(drop=True)   # fresh 0..n-1 index, old index discarded
    .reset_index(drop=False)  # expose that index as a leading 'index' column
    .rename(columns={'index': 'region_id'})
  )
  # Move region_id from the first position to the last.
  id_last_order = regions.columns[1:].tolist() + ['region_id']
  regions = regions.reindex(columns=id_last_order)

  print('number of regions that are truly constrained (True) or not (False):')
  print(regions['truly constrained'].value_counts())

  regions.to_csv(f"{CONSTRAINT_TOOLS_DATA}/khurana/{LABELED_REGIONS}", sep='\t', index=False)

  check_regions_non_overlapping()

  return regions


# Show at most 10 rows when the resulting frame is displayed.
pd.set_option('display.max_rows', 10)

create_and_save_labeled_regions()

## Intersect labeled regions with ALL deletions from TopMed 

In [None]:
import numpy as np 

def collapse(ser): 
  """Return the single distinct value contained in `ser`.

  Raises:
    ValueError: if `ser` does not hold exactly one distinct value.
  """
  distinct_values = set(ser)
  [only_value] = distinct_values  # unpacking fails unless there is exactly one
  return only_value

def count(ser): 
  """Count the entries of `ser`, treating a lone '.' placeholder as zero entries."""
  n_entries = len(ser)
  if n_entries != 1:
    return n_entries
  # Exactly one entry: it counts unless it is the '.' placeholder.
  return 0 if ser.iloc[0] == '.' else 1

def custom_max(ser): 
  """Return the maximum of `ser` after casting each entry to int.

  A series consisting of a single '.' placeholder is passed through as '.'.
  """
  is_placeholder = len(ser) == 1 and ser.iloc[0] == '.'
  if is_placeholder:
    return '.'
  as_integers = [int(entry) for entry in ser]
  return np.max(as_integers)

def custom_list(ser): 
  """Return the entries of `ser` as a list, or '.' for a lone '.' placeholder."""
  is_placeholder = len(ser) == 1 and ser.iloc[0] == '.'
  return '.' if is_placeholder else list(ser)

def aggregate_over_deletions(df, group_columns, aggregation_functions): 
  """Group `df`, aggregate it, and flatten the resulting column labels.

  Args:
    df: table to aggregate.
    group_columns: columns to group by; they reappear as ordinary columns.
    aggregation_functions: mapping passed to `GroupBy.agg`
      (column -> list of aggregators).

  Returns:
    DataFrame whose two-level column labels (column, aggregator) are joined as
    '<aggregator> <column>'; group columns keep their plain names.
  """
  flattened = df.groupby(group_columns).agg(aggregation_functions).reset_index()
  flattened.columns = [
    ' '.join(reversed(label_parts)).strip()  # ('SVLEN', 'max') -> 'max SVLEN'
    for label_parts in flattened.columns.values
  ]
  return flattened

def read_labeled_regions_intersect_all_deletions():
  """Intersect the labeled regions with TopMed deletions and summarise per region.

  Runs the version-specific intersect script, loads the resulting BED file, and
  collapses it to one row per region with the aggregated SVLEN / SVTYPE / Het /
  HomAlt columns. A region is flagged 'region is deleted in topmed' when its
  collapsed SVTYPE equals 'DEL'.

  Returns:
    One-row-per-region DataFrame with `chrom_region`, `start_region`,
    `end_region`, the remaining grouping columns, the aggregated columns and the
    boolean 'region is deleted in topmed'.
  """
  script = f'{CONSTRAINT_TOOLS}/experiments/germline-model/chen-et-al-2022/intersect-labeled-regions-{VERSION}-with-topmed-deletions.sh'
  print(shell(f'bash {script} {VERSION}'))

  intersections = pd.read_csv(
    f"{CONSTRAINT_TOOLS_DATA}/khurana/labeled-regions-{VERSION}-intersect-topmed-deletions.bed",
    sep='\t',
  )

  # Columns that identify a region; rows sharing them are merged into one.
  region_keys = [
    'chrom',
    'start',
    'end',
    'truly constrained',
    'tag',
    'min negative_new_chen_score_window',
    'count negative_new_chen_score_window',
    'region_id'
  ]
  # Per-region summaries of the overlapping deletions.
  per_region_summaries = {
    'SVLEN': [custom_max],
    'SVTYPE': [collapse, count],
    # 'SV_ID': [custom_list],
    'Het': [custom_max],
    'HomAlt': [custom_max],
    # 'region-deletion-overlap': [custom_max]
  }

  needed_columns = region_keys + list(per_region_summaries)
  regions = aggregate_over_deletions(
    intersections[needed_columns], region_keys, per_region_summaries
  )
  regions = regions.rename(columns={
    'chrom': 'chrom_region',
    'start': 'start_region',
    'end': 'end_region',
  })

  regions['region is deleted in topmed'] = regions['collapse SVTYPE'] == 'DEL'
  print('number of regions that are deleted in TopMed (True) or not (False):')
  print(regions['region is deleted in topmed'].value_counts())
  return regions


# Let pandas render up to 1000 rows before truncating the displayed frame.
pd.set_option('display.max_rows', 1000)

REGIONS = read_labeled_regions_intersect_all_deletions()
REGIONS

## Using Chen zscore to predict whether a region is critical or not, and improving that prediction using TopMed deletions

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl

# Set the default matplotlib font size; rcParams is global, so this applies to
# every figure drawn after this cell runs.
mpl.rcParams['font.size'] = 20

def plot_score_distributions(df, score, xlabel, bins, xlim=None):
  """Plot per-class histograms (with KDE) of a score and print each series' bar area.

  Args:
    df: table holding the `score` column and 'truly constrained'.
    score: name of the column to histogram.
    xlabel: label for the x axis.
    bins: forwarded to `sns.histplot`.
    xlim: optional x-axis limits.
  """
  ax = sns.histplot(
    data=df,
    x=score,
    hue='truly constrained',
    bins=bins,
    kde=True,
    stat='count',  # alternative: stat='density'
  )
  ax.set_xlabel(xlabel)
  ax.set_ylabel('Number of regions')
  if xlim is not None:
    ax.set_xlim(xlim)
  ax.get_legend().set_title('region truly constrained?')
  plt.gcf().set_size_inches(10, 5)
  plt.show()

  # Report, per histogram series, the summed width*height of its bars.
  for container in ax.containers:
    print(container.get_label())
    area = sum(bar.get_width() * bar.get_height() for bar in container.patches)
    print(f"Area under the curve: {area:.2f}")


plot_score_distributions(
  REGIONS,
  score='min negative_new_chen_score_window',
  xlabel='Chen score of region',
  bins=40,
  xlim=(-6, 6),
)

In [None]:
from scipy.stats import poisson

def plot_deletion_overlap_distribution(df, regions_class): 
  """Scatter the empirical deletions-per-region distribution against a Poisson pmf.

  The Poisson rate is the mean of `df['count SVTYPE']`; the pmf is evaluated at
  the overlap counts that actually occur in the data.

  Args:
    df: regions table with a 'count SVTYPE' column.
    regions_class: text used in the plot title (e.g. 'Constrained').
  """
  overlap_counts = df['count SVTYPE']  # number of deletions overlapping each region
  mean_overlaps = np.mean(np.array(overlap_counts))  # doubles as the Poisson rate

  tally = overlap_counts.value_counts()
  observed_k = np.array(tally.index)  # distinct overlap counts seen in the data
  regions_per_k = np.array(tally)
  observed_probability = regions_per_k / np.sum(regions_per_k)
  poisson_probability = poisson.pmf(observed_k, mean_overlaps)

  plt.scatter(observed_k, observed_probability, label='data')
  plt.scatter(observed_k, poisson_probability, label='Poisson fit')
  plt.yscale('linear')  # switch to 'log' (with ylim [0.0001, 1]) to inspect the tail
  plt.xlim([0, 20])
  plt.ylim([0, 0.5])
  plt.xlabel('Number of deletions overlapping region')
  plt.ylabel('Probability')
  plt.legend()
  plt.title(
    f'{regions_class} regions\n'
    f'{mean_overlaps:.2f} deletions overlap each region on average'
  )
  plt.show()

def plot_deletion_overlap_distribution_wrapper():
  """Draw the deletion-overlap distribution for constrained, then unconstrained REGIONS."""
  is_constrained = REGIONS['truly constrained']
  for label_value, regions_class in ((True, 'Constrained'), (False, 'Unconstrained')):
    plot_deletion_overlap_distribution(
      REGIONS[is_constrained == label_value],
      regions_class=regions_class,
    )


plot_deletion_overlap_distribution_wrapper()

In [None]:
def predict_constraint_without_deletions(df, zscore_threshold): 
  """Predict constraint from the Chen score alone.

  Args:
    df: regions table with 'min negative_new_chen_score_window' and
      'truly constrained'.
    zscore_threshold: a region is predicted constrained when its score is
      strictly below this value.

  Returns:
    New DataFrame (the input is not modified) with the score, the truth label
    and a boolean 'predicted to be constrained' column.
  """
  score_column = 'min negative_new_chen_score_window'
  predictions = df[[score_column, 'truly constrained']].copy()
  is_below_threshold = predictions[score_column] < zscore_threshold
  predictions['predicted to be constrained'] = is_below_threshold
  return predictions

# Preview the score-only predictions for REGIONS at a z-score threshold of -2.
predict_constraint_without_deletions(REGIONS, zscore_threshold=-2)

In [None]:
def predict_constraint_using_deletions(df, zscore_threshold): 
  """Predict constraint from the Chen score, vetoed by observed TopMed deletions.

  A region is predicted constrained only when its score is strictly below
  `zscore_threshold` AND 'region is deleted in topmed' is False.

  Args:
    df: regions table with 'min negative_new_chen_score_window',
      'truly constrained' and 'region is deleted in topmed'.
    zscore_threshold: score cut-off.

  Returns:
    New DataFrame (the input is not modified) with those three columns plus a
    boolean 'predicted to be constrained' column.
  """
  score_column = 'min negative_new_chen_score_window'
  deleted_column = 'region is deleted in topmed'

  predictions = df[[score_column, 'truly constrained', deleted_column]].copy()
  is_below_threshold = predictions[score_column] < zscore_threshold
  is_never_deleted = predictions[deleted_column] == False
  predictions['predicted to be constrained'] = is_below_threshold & is_never_deleted
  return predictions

# Show at most 10 rows when the frame below is displayed.
pd.set_option('display.max_rows', 10)

# Preview the deletion-aware predictions for REGIONS at a z-score threshold of -2.
predict_constraint_using_deletions(REGIONS, zscore_threshold=-2)

In [None]:
def compute_precision_recall(df, zscore_threshold, predict_constraint, log=False): 
  """Compute precision and recall of a constraint predictor at one threshold.

  Args:
    df: table handed to `predict_constraint`.
    zscore_threshold: threshold forwarded to `predict_constraint`.
    predict_constraint: callable `(df, zscore_threshold) -> DataFrame`; the frame
      it returns must contain the boolean columns 'predicted to be constrained'
      and 'truly constrained'.
    log: when True, print the threshold and the 2x2 contingency table.

  Returns:
    Tuple `(precision, recall)` of floats. A ratio with a zero denominator (no
    region predicted constrained, or no truly constrained region) is NaN.
  """
  predictions = predict_constraint(df, zscore_threshold)

  contingency_table = pd.crosstab(
    predictions['predicted to be constrained'], 
    predictions['truly constrained']
  )
  # crosstab leaves out the row/column of a label that never occurs (e.g. nothing
  # is predicted constrained at a very strict threshold), which used to make the
  # label lookups raise KeyError. Force the full 2x2 layout, filling gaps with 0.
  contingency_table = contingency_table.reindex(
    index=[False, True], columns=[False, True], fill_value=0
  )

  if log:   
    print(zscore_threshold)
    print(contingency_table)

  # Rows are the prediction (False, True); columns are the truth (False, True).
  (_, fn), (fp, tp) = contingency_table.to_numpy().tolist()

  undefined = float('nan')
  precision = tp / (tp + fp) if (tp + fp) > 0 else undefined
  recall = tp / (tp + fn) if (tp + fn) > 0 else undefined

  return precision, recall
  
# Precision/recall of the score-only predictor at threshold -3; log=True also
# prints the threshold and the contingency table.
compute_precision_recall(REGIONS, zscore_threshold=-3, predict_constraint=predict_constraint_without_deletions, log=True)

In [None]:
# Same evaluation at threshold -3, now with the deletion-aware predictor, for a
# side-by-side comparison with the score-only result.
compute_precision_recall(REGIONS, zscore_threshold=-3, predict_constraint=predict_constraint_using_deletions, log=True)

In [None]:
def plot_precision_recall(df, predict_constraint, label): 
  """Add one precision-recall curve to the current matplotlib axes.

  Sweeps the z-score threshold over `np.arange(-6, 5.5, 0.1)` and plots
  precision (y) against recall (x) for the given predictor.

  Args:
    df: regions table passed through to `compute_precision_recall`.
    predict_constraint: predictor callable `(df, zscore_threshold) -> DataFrame`.
    label: legend label for the curve.
  """
  precisions, recalls = [], []
  for zscore_threshold in np.arange(-6, 5.5, 0.1):
    precision, recall = compute_precision_recall(
      df, zscore_threshold, predict_constraint, log=False
    )
    precisions.append(precision)
    recalls.append(recall)
  plt.plot(recalls, precisions, label=label)

def plot_precision_recall_wrapper(df): 
  """Overlay the precision-recall curves of both predictors on one figure.

  The title reports how many regions in `df` are unconstrained and constrained.
  """
  curves = (
    (predict_constraint_without_deletions, 'without using topmed deletions'),
    (predict_constraint_using_deletions, 'using topmed deletions'),
  )
  for predictor, curve_label in curves:
    plot_precision_recall(df, predictor, label=curve_label)

  plt.xlabel('Recall')
  plt.ylabel('Precision')
  plt.legend()
  plt.gcf().set_size_inches(10, 5)

  class_sizes = df['truly constrained'].value_counts()
  plt.title(
    'Predicting constraint on\n'
    f'{class_sizes.loc[False]} unconstrained regions\n'
    f'and {class_sizes.loc[True]} constrained regions'
  )


plot_precision_recall_wrapper(REGIONS)