In [4]:
# Absolute path to the constraint-tools checkout on the cluster filesystem.
CONSTRAINT_TOOLS = '/scratch/ucgd/lustre-labs/quinlan/u6018199/constraint-tools'
# Absolute path to the shared data directory that every cell below reads from / writes to.
CONSTRAINT_TOOLS_DATA = '/scratch/ucgd/lustre-labs/quinlan/data-shared/constraint-tools'

import sys
# Put the checkout's `utilities` directory on the module search path.
# NOTE(review): presumably this is where the `shell`, `colorize` and `aggregate`
# modules imported further down live -- confirm.
sys.path.append(f'{CONSTRAINT_TOOLS}/utilities')

## Assign Gnocchi to "labeled" enhancers 

In [21]:
import pandas as pd 

def assign_gnocchi_to_labeled_enhancers():
    """Attach the published Gnocchi score to each labeled enhancer.

    Reads the labeled-enhancer feature table and the Chen et al. 2023
    supplementary table (both tab-separated), keeps only the coordinate and
    `enhancer_Gnocchi` columns of the latter, and inner-joins the two on the
    enhancer coordinates.

    Returns:
        pandas.DataFrame with every column of the feature table plus
        `enhancer_Gnocchi`, with exact duplicate rows removed.
    """
    coordinate_columns = ['enhancer_chrom', 'enhancer_start', 'enhancer_end']

    enhancer_features = pd.read_csv(
        f'{CONSTRAINT_TOOLS_DATA}/labeled-enhancers/labeled-enhancers.BGS.gBGC.GC_content.bed',
        sep='\t',
    )
    gnocchi_scores = pd.read_csv(
        f'{CONSTRAINT_TOOLS_DATA}/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM/Supplementary_Data_6_ESM.bed',
        sep='\t',
    )[coordinate_columns + ['enhancer_Gnocchi']]

    # inner join: enhancers without a Gnocchi record are dropped
    return (
        enhancer_features
        .merge(gnocchi_scores, on=coordinate_columns, how='inner')
        .drop_duplicates()
    )

# Run the merge; as the last expression of the cell, the returned DataFrame is displayed.
assign_gnocchi_to_labeled_enhancers()

Unnamed: 0,enhancer_chrom,enhancer_start,enhancer_end,gene,enhancer,Haploinsufficient,MGI essential,OMIM dominant,LOEUF constrained,Olfactory,LOEUF unconstrained,truly constrained,B,B_M1star.EUR,GC_content_1000bp,enhancer_Gnocchi
0,chr1,1554620,1555020,ATAD3A,chr1-1554620-1555020,False,False,True,False,False,False,True,0.652,0.108103,0.606394,4.059724
1,chr1,2128961,2129161,GABRD,chr1-2128961-2129161,False,True,True,True,False,False,True,0.841,0.347981,0.585415,6.530123
2,chr1,2268561,2268761,SKI,chr1-2268561-2268761,False,True,True,True,False,False,True,0.847,0.347981,0.602398,5.007183
3,chr1,2545161,2545361,PANK4,chr1-2545161-2545361,False,False,True,False,False,False,True,0.840,0.347981,0.640360,2.775673
4,chr1,3208836,3209036,PRDM16,chr1-3208836-3209036,False,True,True,True,False,False,True,0.966,0.788536,0.525475,6.070480
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4024,chr9,122562521,122562921,OR1B1,chr9-122562521-122562921,False,False,False,False,True,False,False,0.864,0.033174,0.460539,2.900355
4025,chr9,122572921,122573121,OR1Q1,chr9-122572921-122573121,False,False,False,False,True,False,False,0.904,0.033174,0.431568,2.413590
4026,chr9,122807921,122808121,OR1L4,chr9-122807921-122808121,False,False,False,False,True,False,False,0.672,0.033174,0.466533,3.383762
4027,chr9,122807921,122808721,OR1L3,chr9-122807921-122808721,False,False,False,False,True,False,False,0.672,0.033174,0.478521,3.383762


## Assign lambda_s to "labeled" enhancers

In [25]:
import polars as pl 

def create_bed_file_of_lambda_s_scores():
    """Convert the raw lambda_s results CSV into a filtered, tab-separated BED-like file.

    Provenance of the input:
      https://mail.google.com/mail/u/0/#inbox/QgrcJHrjCsBTxVdFdTZvkMTlDfGKRnDvZxl
      http://compgen.cshl.edu/extrainsight/description.php

    Side effect:
        Writes `Results_26July2024.filtered.bed` (with a header line, no index
        column) into the `lambda_s` data directory.

    Returns:
        pandas.DataFrame with columns `chromosome`, `start`, `end`, `lambda_s`.
    """
    raw_scores = pl.read_csv(
        f'{CONSTRAINT_TOOLS_DATA}/lambda_s/Results_26July2024.csv',
        infer_schema_length=1000000,
    )

    # force integer coordinates before handing the table over to pandas
    windows = raw_scores.with_columns(
        pl.col("start").cast(pl.Int64),
        pl.col("end").cast(pl.Int64),
    ).to_pandas()

    # Nurdan: "I've included the windows for which ExtRaINSIGHT does not report results,
    # as they do not pass the filtering steps.
    # In these cases, all values from columns 4 to 9 are 0."
    # => keep only the windows that actually have results
    has_results = windows['num_possible_mutations'] > 0

    lambda_s_scores = (
        windows
        .loc[has_results, ['chr', 'start', 'end', 'strong_selection']]
        .rename(columns={
            'chr': 'chromosome',
            'strong_selection': 'lambda_s',
        })
    )

    lambda_s_scores.to_csv(
        f'{CONSTRAINT_TOOLS_DATA}/lambda_s/Results_26July2024.filtered.bed',
        sep='\t',
        index=False,
    )

    return lambda_s_scores
    
# Write the filtered lambda_s file to disk; the returned DataFrame is displayed as the cell output.
create_bed_file_of_lambda_s_scores()

Unnamed: 0,chromosome,start,end,lambda_s
0,chr1,1432000,1433000,0.087061
1,chr1,1451000,1452000,0.061982
2,chr1,1453000,1454000,0.103610
3,chr1,1458000,1459000,-0.019612
4,chr1,1463000,1464000,0.255973
...,...,...,...,...
1003222,chr9,137262000,137263000,-0.299242
1003223,chr9,137268000,137269000,0.142611
1003224,chr9,137269000,137270000,0.244719
1003225,chr9,137275000,137276000,-0.003339


In [31]:
from shell import shell 
from colorize import print_string_as_info
from aggregate import aggregate_polars

def get_header(filename):
  """Return the column names found on the first line of a tab-separated file.

  The first line is stripped of surrounding whitespace and split on tabs.
  """
  with open(filename) as handle:
    first_line = handle.readline()
  return first_line.strip().split('\t')

def intersect(a_filename, b_filename, intersect_filename, b_class): 
  """Run `bedtools intersect` on two header-bearing files and report their column names.

  Both inputs carry a header line, which is skipped with `tail -n +2` before
  the records are handed to bedtools. The result is redirected to
  `intersect_filename`.

  Args:
    a_filename: path of the "A" file (with header).
    b_filename: path of the "B" file (with header).
    intersect_filename: path the bedtools output is written to.
    b_class: prefix applied to the first three B column names, so that they
      do not collide with the A column names.

  Returns:
    (a_file_header, b_file_header): the A column names as-is, and the B column
    names with the first three prefixed by `{b_class}_`.
  """
  # NOTE(review): `<( ... )` is process substitution, so the `shell` helper
  # must run the command in a shell that supports it (e.g. bash) -- confirm.
  command_pieces = [
    f'bedtools intersect',
    f" -a <(tail -n +2 {a_filename})", # contains header
    f" -b <(tail -n +2 {b_filename})", # contains header
    f' -wao -f 0.5',
    f' > {intersect_filename}',
  ]
  shell(''.join(command_pieces))
  print_string_as_info(f'Wrote {intersect_filename}')

  a_columns = get_header(a_filename) # contains header
  b_columns = [
    f'{b_class}_{field}' if position < 3 else field
    for position, field in enumerate(get_header(b_filename)) # contains header
  ]

  return a_columns, b_columns

def make_scores_numeric(df: pl.DataFrame, b_class: str, b_features: list) -> pl.DataFrame:
    """Drop records without usable B-feature values and cast those features to floats.

    Args:
        df: intersect table containing a `{b_class}_chromosome` column and one
            column per entry of `b_features`.
        b_class: prefix used for the B-file coordinate columns.
        b_features: names of the score columns to convert to `Float64`.

    Returns:
        The filtered table, with every column in `b_features` cast to `Float64`.
    """
    # a '.' chromosome marks a record that does not overlap any window with
    # b_feature values, and we don't want to include those
    overlapping = df.filter(pl.col(f'{b_class}_chromosome') != '.')

    # with those records gone, each score column can be cleaned and converted in turn
    for feature_name in b_features:
        feature = pl.col(feature_name)
        overlapping = (
            overlapping
            .filter(feature != '.') # gBGC values are floats, but some are missing
            .with_columns(feature.cast(pl.Float64))
        )

    return overlapping

def intersect_and_aggregate(a_filename_stem, b_filename, b_class, b_features, b_class_aggregation_functions): 
  """Intersect `{a_filename_stem}.bed` with a B file and aggregate B scores per A record.

  Args:
    a_filename_stem: path of the A file without its `.bed` extension; also
      used to name the intermediate `.intersect.{b_class}.bed` file.
    b_filename: path of the B file (with header).
    b_class: label for the B file, used in file and column names.
    b_features: B score columns to convert to floats before aggregating.
    b_class_aggregation_functions: passed through to `aggregate_polars` as
      `aggregation_functions`.

  Returns:
    Whatever `aggregate_polars` returns for the cleaned intersect table
    grouped on the A-file columns.
    NOTE(review): presumably one row per A record -- confirm against `aggregate_polars`.
  """
  a_file_header, b_file_header = intersect(
    f'{a_filename_stem}.bed',
    b_filename,
    f'{a_filename_stem}.intersect.{b_class}.bed',
    b_class,
  )

  # the intersect file has no header of its own: A columns, then B columns,
  # then the overlap column appended by bedtools
  column_names = a_file_header + b_file_header + [f'labeledEnhancer_{b_class}Window_overlap']

  intersected = pl.read_csv(
    f'{a_filename_stem}.intersect.{b_class}.bed',
    separator='\t',
    new_columns=column_names,
    infer_schema_length=1000000
  )

  # some labeled enhancers may intersect multiple b_feature-windows,
  # so group by labeled enhancer, and aggregate scores over all b_feature-windows in the group
  return aggregate_polars(
    make_scores_numeric(intersected, b_class, b_features),
    group_columns = a_file_header,
    aggregation_functions = b_class_aggregation_functions
  )

In [32]:
def assign_lambda_s_to_labeled_enhancers():
    """Assign a lambda_s score to each labeled enhancer.

    Intersects the labeled-enhancer file with the filtered lambda_s windows
    and keeps, per enhancer, the maximum `lambda_s` over the windows it was
    matched with.

    Returns:
        The table produced by `intersect_and_aggregate`.
    """
    # TODO:
    # once Nurdan computes lambda_s for all labeled enhancers,
    # we can change the implementation of this function to perform a df-merge operation instead of bedtools-intersect
    # c.f., assign_gnocchi_to_labeled_enhancers (above)
    labeled_enhancers_stem = f'{CONSTRAINT_TOOLS_DATA}/labeled-enhancers/labeled-enhancers.BGS.gBGC.GC_content'
    lambda_s_windows = f'{CONSTRAINT_TOOLS_DATA}/lambda_s/Results_26July2024.filtered.bed'
    max_lambda_s = pl.col('lambda_s').max()

    return intersect_and_aggregate(
        a_filename_stem = labeled_enhancers_stem,
        b_filename = lambda_s_windows,
        b_class = 'lambda_s',
        b_features = ['lambda_s'],
        b_class_aggregation_functions = [max_lambda_s]
    )

# Run the intersect + aggregation; the returned table is displayed as the cell output.
assign_lambda_s_to_labeled_enhancers()




[36mWrote /scratch/ucgd/lustre-labs/quinlan/data-shared/constraint-tools/labeled-enhancers/labeled-enhancers.BGS.gBGC.GC_content.intersect.lambda_s.bed[0m


enhancer_chrom,enhancer_start,enhancer_end,gene,enhancer,Haploinsufficient,MGI essential,OMIM dominant,LOEUF constrained,Olfactory,LOEUF unconstrained,truly constrained,B,B_M1star.EUR,GC_content_1000bp,lambda_s
str,i64,i64,str,str,bool,bool,bool,bool,bool,bool,bool,f64,f64,f64,f64
"""chr1""",2128961,2129161,"""GABRD""","""chr1-2128961-2129161""",false,true,true,true,false,false,true,0.841,0.347981,0.585415,0.117883
"""chr1""",2268561,2268761,"""SKI""","""chr1-2268561-2268761""",false,true,true,true,false,false,true,0.847,0.347981,0.602398,0.115906
"""chr1""",6240740,6241540,"""ICMT""","""chr1-6240740-6241540""",false,true,false,false,false,false,true,0.872,0.014875,0.548452,0.090827
"""chr1""",6483340,6483540,"""ESPN""","""chr1-6483340-6483540""",false,false,true,false,false,false,true,0.837,0.014875,0.572428,0.088351
"""chr1""",6697340,6697540,"""DNAJC11""","""chr1-6697340-6697540""",false,false,false,true,false,false,true,0.708,0.014875,0.461538,-0.008011
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chr9""",122464321,122465121,"""OR1J1""","""chr9-122464321-122465121""",false,false,false,false,true,false,false,0.878,0.033174,0.462537,0.006783
"""chr9""",122561921,122562921,"""OR1N2""","""chr9-122561921-122562921""",false,false,false,false,true,false,false,0.864,0.033174,0.53047,0.193952
"""chr9""",122562521,122562921,"""OR1B1""","""chr9-122562521-122562921""",false,false,false,false,true,false,false,0.864,0.033174,0.460539,0.193952
"""chr9""",122572921,122573121,"""OR1Q1""","""chr9-122572921-122573121""",false,false,false,false,true,false,false,0.904,0.033174,0.431568,0.193267


## Assign Depletion Rank to "labeled" enhancers

In [None]:
# TODO 
# use papers/neutral_models_are_biased/7.CDTS/main.2.ipynb to assign DR scores to labeled enhancers

## Assign CDTS to "labeled" enhancers

In [None]:
# TODO 
# use papers/neutral_models_are_biased/7.CDTS/main.2.ipynb to assign CDTS scores to labeled enhancers

## Concatenate essential enhancers with noncoding windows not overlapping any GeneHancer enhancer

In [33]:
# TODO: 
# should this be done for each constraint score, when it is being assigned to labeled enhancers? 

In [None]:
# TODO
# 3. pull out "essential" enhancers from list of "labeled" enhancers 
# 4. for each constraint_score, 
#    use papers/neutral_models_are_biased/7.CDTS/main.2.ipynb to read in df containing overlap status with GeneHancer enhancers, 
#    and pull out a random sample of non-enhancer windows, together with constraint_score, and features, and append to the essential-enhancer df for that constraint score


## Precision-recall (PR) curves

In [None]:
# TODO 
# 5. use papers/neutral_models_are_biased/7.CDTS/main.2.ipynb to compute PR curves using the newly constructed dfs (one for each constraint score), as a function of BGS, gBGC, GC
#    (note that I've already done this for Gnocchi and BGS: experiments/germline-model/chen-et-al-2022/assess-impact-of-BGS-on-Gnocchi-predictions-at-labeled-enhancers.ipynb)  
