## Note on Khurana enhancer set as a positive set

In [1]:
# Khurana has a set of disease enhancers, and enhancers that she predicts to be intolerant to deletion, but the size of the set is small (100s): 
# /scratch/ucgd/lustre-labs/quinlan/u6018199/constraint-tools/download-process-data/khurana

## Preliminary Workflow for "labeled" enhancers

"Labeled" enhancers are defined at: https://docs.google.com/presentation/d/1qw3QiWVHSqYA2f4QoahE8FHZPisIftp-tCUWTODjnDY/edit#slide=id.p

In [2]:
# 1. computed GC-content for labeled enhancers: 
#    papers/neutral_models_are_biased/8.labeled-enhancers/compute-GC-content-for-all-window-sizes-based-on-labeled-enhancers.sh

## Assign BGS, gBGC, GC_content to "labeled" enhancers

In [3]:
CONSTRAINT_TOOLS = '/scratch/ucgd/lustre-labs/quinlan/u6018199/constraint-tools'
CONSTRAINT_TOOLS_DATA = '/scratch/ucgd/lustre-labs/quinlan/data-shared/constraint-tools'

import sys
sys.path.append(f'{CONSTRAINT_TOOLS}/utilities')

POP = 'EUR'
GC_WINDOW_SIZE = 1000

In [4]:
import polars as pl 

from shell import shell 
from colorize import print_string_as_info
from aggregate import aggregate_polars

def get_header(filename):
  with open(filename) as fh: 
    header = fh.readline().strip().split('\t')
  return header

def intersect(a_filename, b_filename, intersect_filename, b_class): 
  cmd = (
    f'bedtools intersect'
    f" -a <(tail -n +2 {a_filename})" # contains header
    f" -b <(tail -n +2 {b_filename})" # contains header
    f' -wao -f 0.5' 
    f' > {intersect_filename}'
  )
  shell(cmd) 
  print_string_as_info(f'Wrote {intersect_filename}')

  a_file_header = get_header(a_filename) # contains header
  b_file_header = get_header(b_filename) # contains header
  b_file_header = [f'{b_class}_{field}' for field in b_file_header[:3]] + b_file_header[3:]

  return a_file_header, b_file_header

def make_scores_numeric(df: pl.DataFrame, b_class: str, b_features: list) -> pl.DataFrame:
    # we don't want to include windows that don't overlap any windows with b_feature values
    df = df.filter(pl.col(f'{b_class}_chromosome') != '.')

    # having removed records where 'b_class_chromosome' is '.', 
    # we can now convert the 'b_features' columns to floats
    for b_feature in b_features:
        df = df.filter(pl.col(b_feature) != '.') # gBGC values are floats, but some are missing
        df = df.with_columns(pl.col(b_feature).cast(pl.Float64))

    return df

def intersect_and_aggregate(a_filename_stem, b_filename, b_class, b_features, b_class_aggregation_functions): 
  a_filename = f'{a_filename_stem}.bed'
  intersect_filename = f'{a_filename_stem}.intersect.{b_class}.bed'

  a_file_header, b_file_header = intersect(a_filename, b_filename, intersect_filename, b_class)

  df = pl.read_csv(
    intersect_filename, 
    separator='\t',
    new_columns=a_file_header + b_file_header + [f'labeledEnhancer_{b_class}Window_overlap'],
    infer_schema_length=1000000
  )

  df = make_scores_numeric(df, b_class, b_features) 

  # some labeled enhancers may intersect multiple b_feature-windows, 
  # so let's group by labeled enhancer, and aggregate scores over all b_feature-windows in the group
  df = aggregate_polars(
    df, 
    group_columns = a_file_header,
    aggregation_functions = b_class_aggregation_functions
  )

  df.write_csv(
    f'{a_filename_stem}.{b_class}.bed',
    separator='\t',
  )

  print_string_as_info(f'Wrote {a_filename_stem}.{b_class}.bed')

  return df 

In [5]:
# assign BGS values to labeled enhancers 
intersect_and_aggregate(
    a_filename_stem = f'{CONSTRAINT_TOOLS_DATA}/benchmark-genome-wide-predictions/chen-et-al-2022/labeled-enhancers.gnocchi.GC', 
    b_filename = f'{CONSTRAINT_TOOLS_DATA}/background-selection/CADD-B-map/bmap.hg38.header.bed', 
    b_class = 'BGS',
    b_features = ['B'], 
    b_class_aggregation_functions = [pl.col('B').min()] 
)   




[36mWrote /scratch/ucgd/lustre-labs/quinlan/data-shared/constraint-tools/benchmark-genome-wide-predictions/chen-et-al-2022/labeled-enhancers.gnocchi.GC.intersect.BGS.bed[0m
[36mWrote /scratch/ucgd/lustre-labs/quinlan/data-shared/constraint-tools/benchmark-genome-wide-predictions/chen-et-al-2022/labeled-enhancers.gnocchi.GC.BGS.bed[0m


enhancer_chrom,enhancer_start,enhancer_end,gene,enhancer,Haploinsufficient,MGI essential,OMIM dominant,LOEUF constrained,Olfactory,LOEUF unconstrained,max chen_score,max corrected_chen_score,max filtered_chen_score,mean window_GC_content,truly constrained,B
str,i64,i64,str,str,bool,bool,bool,bool,bool,bool,f64,f64,f64,f64,bool,f64
"""chr1""",1536220,1537020,"""TMEM240""","""chr1-1536220-1537020""",false,false,true,false,false,false,5.401829,1.029468,3.238727,0.570806,true,0.651
"""chr1""",1554620,1555020,"""ATAD3A""","""chr1-1554620-1555020""",false,false,true,false,false,false,4.059724,0.304463,1.584306,0.569781,true,0.652
"""chr1""",2128961,2129161,"""GABRD""","""chr1-2128961-2129161""",false,true,true,true,false,false,3.499737,0.045254,3.531299,0.565593,true,0.841
"""chr1""",2268561,2268761,"""SKI""","""chr1-2268561-2268761""",false,true,true,true,false,false,5.007183,1.077828,2.657948,0.547951,true,0.847
"""chr1""",2545161,2545361,"""PANK4""","""chr1-2545161-2545361""",false,false,true,false,false,false,2.775673,-0.264937,3.898165,0.557988,true,0.84
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chr9""",122572921,122573121,"""OR1Q1""","""chr9-122572921-122573121""",false,false,false,false,true,false,2.41359,1.139482,1.717825,0.419903,false,0.904
"""chr9""",122807921,122808121,"""OR1L4""","""chr9-122807921-122808121""",false,false,false,false,true,false,1.049799,0.530994,1.290332,0.407189,false,0.672
"""chr9""",122807921,122808721,"""OR1L3""","""chr9-122807921-122808721""",false,false,false,false,true,false,1.049799,0.530994,1.290332,0.407189,false,0.672
"""chr9""",134922954,134923154,"""FCN1""","""chr9-134922954-134923154""",false,false,false,false,false,true,1.5286,-0.70674,1.196153,0.536773,false,0.973


In [6]:
# assign gBGC coefficients to labeled enhancers
intersect_and_aggregate(
    a_filename_stem = f'{CONSTRAINT_TOOLS_DATA}/benchmark-genome-wide-predictions/chen-et-al-2022/labeled-enhancers.gnocchi.GC.BGS', 
    b_filename = f'{CONSTRAINT_TOOLS_DATA}/GC-biased-gene-conversion/gBGC-coefficient.hg38.{POP}.header.bed', 
    b_class = 'gBGC',
    b_features = [f'B_M1star.{POP}'], 
    b_class_aggregation_functions = [pl.col(f'B_M1star.{POP}').mean()]
)   




[36mWrote /scratch/ucgd/lustre-labs/quinlan/data-shared/constraint-tools/benchmark-genome-wide-predictions/chen-et-al-2022/labeled-enhancers.gnocchi.GC.BGS.intersect.gBGC.bed[0m
[36mWrote /scratch/ucgd/lustre-labs/quinlan/data-shared/constraint-tools/benchmark-genome-wide-predictions/chen-et-al-2022/labeled-enhancers.gnocchi.GC.BGS.gBGC.bed[0m


enhancer_chrom,enhancer_start,enhancer_end,gene,enhancer,Haploinsufficient,MGI essential,OMIM dominant,LOEUF constrained,Olfactory,LOEUF unconstrained,max chen_score,max corrected_chen_score,max filtered_chen_score,mean window_GC_content,truly constrained,B,B_M1star.EUR
str,i64,i64,str,str,bool,bool,bool,bool,bool,bool,f64,f64,f64,f64,bool,f64,f64
"""chr1""",1554620,1555020,"""ATAD3A""","""chr1-1554620-1555020""",false,false,true,false,false,false,4.059724,0.304463,1.584306,0.569781,true,0.652,0.108103
"""chr1""",2128961,2129161,"""GABRD""","""chr1-2128961-2129161""",false,true,true,true,false,false,3.499737,0.045254,3.531299,0.565593,true,0.841,0.347981
"""chr1""",2268561,2268761,"""SKI""","""chr1-2268561-2268761""",false,true,true,true,false,false,5.007183,1.077828,2.657948,0.547951,true,0.847,0.347981
"""chr1""",2545161,2545361,"""PANK4""","""chr1-2545161-2545361""",false,false,true,false,false,false,2.775673,-0.264937,3.898165,0.557988,true,0.84,0.347981
"""chr1""",3208836,3209036,"""PRDM16""","""chr1-3208836-3209036""",false,true,true,true,false,false,6.07048,1.56976,4.124261,0.555916,true,0.966,0.788536
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chr9""",122562521,122562921,"""OR1B1""","""chr9-122562521-122562921""",false,false,false,false,true,false,2.900355,1.400654,1.928357,0.420672,false,0.864,0.033174
"""chr9""",122572921,122573121,"""OR1Q1""","""chr9-122572921-122573121""",false,false,false,false,true,false,2.41359,1.139482,1.717825,0.419903,false,0.904,0.033174
"""chr9""",122807921,122808121,"""OR1L4""","""chr9-122807921-122808121""",false,false,false,false,true,false,1.049799,0.530994,1.290332,0.407189,false,0.672,0.033174
"""chr9""",122807921,122808721,"""OR1L3""","""chr9-122807921-122808721""",false,false,false,false,true,false,1.049799,0.530994,1.290332,0.407189,false,0.672,0.033174


In [7]:
def read(gc_window_size): 
    df = pl.read_csv(        
        f'{CONSTRAINT_TOOLS_DATA}/labeled-enhancers/labeled-enhancers.GC_content_{gc_window_size}.bed',
        separator='\t',
        infer_schema_length=1000000
    )
    df = df.select(pl.col(
        "labeled_enhancer_chrom",
        "labeled_enhancer_start",
        "labeled_enhancer_end", 
        "GC_window__GC_content"
    ))
    df = df.rename({"GC_window__GC_content": f'GC_content_{gc_window_size}bp'})
    return df

def assign_GC_to_labeled_enhancers():
    df = pl.read_csv(
        f'{CONSTRAINT_TOOLS_DATA}/benchmark-genome-wide-predictions/chen-et-al-2022/labeled-enhancers.gnocchi.GC.BGS.gBGC.bed', 
        separator='\t',
        infer_schema_length=1000000
    )

    for df_gc in [read(gc_window_size) for gc_window_size in [1000]]: 
        df = df.join(
            df_gc, 
            left_on=['enhancer_chrom', 'enhancer_start', 'enhancer_end'],
            right_on=["labeled_enhancer_chrom", "labeled_enhancer_start", "labeled_enhancer_end"], 
            how='inner'
        )

    df = df.unique(subset=['enhancer_chrom', 'enhancer_start', 'enhancer_end'], keep='first', maintain_order=True)

    df = df.drop(["max chen_score", "max corrected_chen_score", "max filtered_chen_score", "mean window_GC_content"])
    
    output_filename = f'{CONSTRAINT_TOOLS_DATA}/labeled-enhancers/labeled-enhancers.BGS.gBGC.GC_content.bed'
    df.write_csv(output_filename, separator='\t')
    print_string_as_info(f'Wrote {output_filename}')
    
    return df 

assign_GC_to_labeled_enhancers()

[36mWrote /scratch/ucgd/lustre-labs/quinlan/data-shared/constraint-tools/labeled-enhancers/labeled-enhancers.BGS.gBGC.GC_content.bed[0m


enhancer_chrom,enhancer_start,enhancer_end,gene,enhancer,Haploinsufficient,MGI essential,OMIM dominant,LOEUF constrained,Olfactory,LOEUF unconstrained,truly constrained,B,B_M1star.EUR,GC_content_1000bp
str,i64,i64,str,str,bool,bool,bool,bool,bool,bool,bool,f64,f64,f64
"""chr1""",1554620,1555020,"""ATAD3A""","""chr1-1554620-1555020""",false,false,true,false,false,false,true,0.652,0.108103,0.606394
"""chr1""",2128961,2129161,"""GABRD""","""chr1-2128961-2129161""",false,true,true,true,false,false,true,0.841,0.347981,0.585415
"""chr1""",2268561,2268761,"""SKI""","""chr1-2268561-2268761""",false,true,true,true,false,false,true,0.847,0.347981,0.602398
"""chr1""",2545161,2545361,"""PANK4""","""chr1-2545161-2545361""",false,false,true,false,false,false,true,0.84,0.347981,0.64036
"""chr1""",3208836,3209036,"""PRDM16""","""chr1-3208836-3209036""",false,true,true,true,false,false,true,0.966,0.788536,0.525475
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chr9""",122562521,122562921,"""OR1B1""","""chr9-122562521-122562921""",false,false,false,false,true,false,false,0.864,0.033174,0.460539
"""chr9""",122572921,122573121,"""OR1Q1""","""chr9-122572921-122573121""",false,false,false,false,true,false,false,0.904,0.033174,0.431568
"""chr9""",122807921,122808121,"""OR1L4""","""chr9-122807921-122808121""",false,false,false,false,true,false,false,0.672,0.033174,0.466533
"""chr9""",122807921,122808721,"""OR1L3""","""chr9-122807921-122808721""",false,false,false,false,true,false,false,0.672,0.033174,0.478521
