In [2]:
# Imports are gathered here so that a freshly restarted kernel has every
# third-party name the later cells rely on (`np` and `pd` are both used below).
import sys

import numpy as np
import pandas as pd

# Locations of the constraint-tools checkout and of its shared data directory.
CONSTRAINT_TOOLS = '/scratch/ucgd/lustre-work/quinlan/u6018199/constraint-tools'
CONSTRAINT_TOOLS_DATA = '/scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools'

# Make the repo's utilities directory importable. The membership check keeps
# sys.path free of duplicates when this cell is re-run.
# NOTE(review): `shell` and `aggregate` are called later in this notebook but are
# not imported in any visible cell; presumably they live in this utilities
# directory -- confirm, and import them here so Restart & Run All succeeds.
_UTILITIES_DIR = f'{CONSTRAINT_TOOLS}/utilities'
if _UTILITIES_DIR not in sys.path:
  sys.path.append(_UTILITIES_DIR)

# Deletion filters; these three values are forwarded as command-line arguments
# to the intersect script (units presumably base pairs for the size limits --
# TODO confirm against the script).
LOWER_DELETION_SIZE_LIMIT = 4000
UPPER_DELETION_SIZE_LIMIT = 10000
DELETION_ALLELE_FREQ_THRESHOLD = 0.00001

# Which deletion set to use; the value is embedded in output file names and
# passed to the intersect script. The alternative setting is 'long'.
# DELETION_TYPE = 'long' 
DELETION_TYPE = 'all' 

# TODO: remove? (not referenced by any visible cell)
BALANCE_CLASSES = False

# Name and local checkout directory of the public data repository.
PUBLIC_REPO = 'constraint-tools-data'
PUBLIC_REPO_DIR = f'/scratch/ucgd/lustre-work/quinlan/u6018199/{PUBLIC_REPO}'

## Compute SV scores for a set of positive and negative enhancers, i.e., enhancers linked to genes known to be under negative selection (positives) or under no selection (negatives)


In [None]:
# TODO 
# 1. read the enhancers into a dataframe
# 2. write out a file in which the first three columns are the enhancer coordinates
# 3. assign that file to the "windows" variable below, and edit the following code to compute SV scores for each enhancer

In [3]:
import pandas as pd 

# Load the tab-separated supplementary table (Supplementary Data 6 of the
# Chen et al. 2023 published version, per the file path). It is left as the
# cell's final bare expression so the notebook renders the resulting frame.
pd.read_csv(
  '/'.join([
    CONSTRAINT_TOOLS_DATA,
    'chen-et-al-2023-published-version',
    '41586_2023_6045_MOESM4_ESM',
    'Supplementary_Data_6_ESM.txt',
  ]),
  delimiter='\t',
)

Unnamed: 0,gene,enhancer,enhancer_Gnocchi,Haploinsufficient,MGI essential,OMIM dominant,LOEUF constrained,Olfactory,LOEUF unconstrained,LOEUF underpowered
0,NOC2L,chr1-867020-867220,1.509945,False,False,False,False,False,False,False
1,SAMD11,chr1-923220-923420,1.579239,False,False,False,False,False,False,False
2,RNF223,chr1-1094220-1094420,2.725941,False,False,False,False,False,False,True
3,HES4,chr1-1031020-1031220,4.830528,False,False,False,False,False,False,True
4,AGRN,chr1-1003620-1003820,5.938154,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
15536,PNPLA7,chr9-137486748-137487348,6.012245,False,False,False,False,False,False,False
15537,ZMYND19,chr9-137532548-137532748,2.448615,False,False,False,False,False,False,False
15538,ARRDC1,chr9-137593948-137594348,1.190685,False,False,False,False,False,False,False
15539,EHMT1,chr9-137649348-137649548,3.038869,True,True,True,True,False,False,False


In [3]:
def assign_SV_scores_to_labeled_genes(
    gene_class,
    deletion_class,
    deletion_type=DELETION_TYPE,
):
  """Compute per-gene deletion (SV) scores for one class of labeled genes.

  Steps:
    1. Run the intersect script on the canonical-exon windows of `gene_class`,
       which writes a tab-separated file of exons with deletion counts/overlaps.
    2. Read that file and drop exons on chrX and chrY.
    3. Sum `deletion_overlap` per exon (an exon may intersect several deletions).
    4. Aggregate exons per (chromosome, gene_symbol) and derive two scores:
       deletions per bp, and the fraction of exonic bps that are deleted.

  Parameters:
    gene_class: label embedded in the exon-window and output file names
      (e.g. 'positive').
    deletion_class: label embedded in file names and forwarded to the
      intersect script (e.g. 'observed').
    deletion_type: label embedded in file names and forwarded to the
      intersect script; defaults to the module-level DELETION_TYPE.

  Returns:
    A DataFrame with one row per gene and the columns gene_symbol, exon_count,
    'deletion count over all exons', 'deletion overlap over all exons',
    'total length of all exons', 'gene coordinates', 'deletion count per bp'
    and 'fraction of bps that are deleted'.

  NOTE(review): depends on `shell`, `aggregate`, `np` and `pd` already being
  present in the kernel namespace, and on `aggregate` naming its output columns
  '<function> <column>' -- confirm against the utilities that define them.
  """
  exon_windows = f'{CONSTRAINT_TOOLS_DATA}/genes/grch38/canonical-exons.{gene_class}.sorted.bed'
  intersected_windows = f'{CONSTRAINT_TOOLS_DATA}/benchmark-genome-wide-predictions/chen-et-al-2022/{gene_class}-canonical-exons-intersect-{deletion_type}-{deletion_class}-deletions.bed'
  script = f'{CONSTRAINT_TOOLS}/experiments/germline-model/chen-et-al-2022/intersect-windows-with-deletions.sh'
  processed_deletions = f'{deletion_type}-{deletion_class}-deletions.bed'

  # Positional arguments of the intersect script, in the order it expects them.
  command_parts = [
    'bash',
    script,
    exon_windows,
    deletion_class,
    intersected_windows,
    PUBLIC_REPO_DIR,
    processed_deletions,
    deletion_type,
    LOWER_DELETION_SIZE_LIMIT,
    UPPER_DELETION_SIZE_LIMIT,
    DELETION_ALLELE_FREQ_THRESHOLD,
  ]
  print(shell(' '.join(str(part) for part in command_parts)))

  intersections = pd.read_csv(intersected_windows, sep='\t')

  # Keep autosomal records only.
  autosomal = intersections[~intersections['chromosome'].isin(['chrX', 'chrY'])]

  # Though unlikely, an exon may intersect multiple merged (hence non-overlapping)
  # deletions, so group on every column except the overlap and sum the overlap.
  exon_key_columns = list(autosomal.columns)
  exon_key_columns.remove('deletion_overlap')
  per_exon = aggregate(
    autosomal,
    group_columns = exon_key_columns,
    aggregation_functions = {'deletion_overlap': [np.sum]},
  )
  per_exon = per_exon.assign(
    exon_length = per_exon['exon_end'] - per_exon['exon_start']
  )

  # Roll exons up to their parent gene: gene span, exon count, and summed
  # deletion count, deletion overlap and exon length.
  per_gene = aggregate(
    per_exon,
    group_columns = ['chromosome', 'gene_symbol'],
    aggregation_functions = {
      'exon_start': [np.min],
      'exon_end': [np.max],
      'exon_rank': 'count',
      'deletion_count': [np.sum],
      'sum deletion_overlap': [np.sum],
      'exon_length': [np.sum],
    },
  )

  gene_coordinates = (
    per_gene['chromosome'] + ':'
    + per_gene['min exon_start'].astype(str) + '-'
    + per_gene['max exon_end'].astype(str)
  )
  total_exon_length = per_gene['sum exon_length']

  # Column order matters for display: coordinates first, then the two scores.
  gene_scores = (
    per_gene
    .assign(**{'gene coordinates': gene_coordinates})
    .drop(columns=['chromosome', 'min exon_start', 'max exon_end'])
    .assign(**{
      'deletion count per bp': per_gene['sum deletion_count'] / total_exon_length,
      'fraction of bps that are deleted': per_gene['sum sum deletion_overlap'] / total_exon_length,
    })
    .rename(columns={
      'count exon_rank': 'exon_count',
      'sum deletion_count': 'deletion count over all exons',
      'sum sum deletion_overlap': 'deletion overlap over all exons',
      'sum exon_length': 'total length of all exons',
    })
  )
  return gene_scores

# Score the 'positive' gene class against 'observed' deletions; the returned
# frame is the cell's final expression, so the notebook displays it.
assign_SV_scores_to_labeled_genes(gene_class='positive', deletion_class='observed')

[0;36mWe assume that the first line of the following is a header line: [0m/scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools/genes/grch38/canonical-exons.positive.sorted.bed
[0;36mWrote windows with deletion counts and overlaps to: [0m/scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools/benchmark-genome-wide-predictions/chen-et-al-2022/positive-canonical-exons-intersect-all-observed-deletions.bed
[0;36mWrote all observed deletions in UCSC-genome-browser format to: [0m/scratch/ucgd/lustre-work/quinlan/u6018199/constraint-tools-data/all-observed-deletions.bed
Everything up-to-date
[0;36mPushed /scratch/ucgd/lustre-work/quinlan/u6018199/constraint-tools-data/all-observed-deletions.bed to public repo[0m
On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean


Unnamed: 0,gene_symbol,exon_count,deletion count over all exons,deletion overlap over all exons,total length of all exons,gene coordinates,deletion count per bp,fraction of bps that are deleted
0,ACTN2,21,3,407,4851,chr1:236686499-236764631,0.000618,0.083900
1,ADAR,15,0,0,6595,chr1:154582057-154608186,0.000000,0.000000
2,AHDC1,7,0,0,6431,chr1:27534303-27603632,0.000000,0.000000
3,AMPD1,16,6,866,2359,chr1:114673098-114695586,0.002543,0.367105
4,APOA2,4,0,0,470,chr1:161222292-161223628,0.000000,0.000000
...,...,...,...,...,...,...,...,...
641,TGFBR1,9,4,869,5924,chr9:99105150-99153658,0.000675,0.146691
642,TOR1A,5,0,0,2075,chr9:129812942-129824136,0.000000,0.000000
643,TPM2,9,10,1318,1318,chr9:35682926-35690056,0.007587,1.000000
644,TSC1,23,6,3792,8593,chr9:132891348-132944633,0.000698,0.441289


In [None]:
# TODO 
# 1. assign a label to each enhancer
# 2. compute PR curves, cf. experiments/germline-model/chen-et-al-2022/SNV_plus_SV_model.3.ipynb