In [1]:
CONSTRAINT_TOOLS = '/scratch/ucgd/lustre-labs/quinlan/u6018199/constraint-tools'
CONSTRAINT_TOOLS_DATA = '/scratch/ucgd/lustre-labs/quinlan/data-shared/constraint-tools'

import sys
sys.path.append(f'{CONSTRAINT_TOOLS}/utilities')

## Truth set with Gnocchi scores

In [2]:
import pandas as pd

def get_positive_examples_with_Gnocchi(): 
    labeled_enhancers_with_features = pd.read_csv(f'{CONSTRAINT_TOOLS_DATA}/labeled-enhancers/labeled-enhancers.BGS.gBGC.GC_content.bed', sep='\t')    
    labeled_enhancers_with_Gnocchi = pd.read_csv(f'{CONSTRAINT_TOOLS_DATA}/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM/Supplementary_Data_6_ESM.bed', sep='\t')
    labeled_enhancers_with_Gnocchi = labeled_enhancers_with_Gnocchi[['enhancer_chrom', 'enhancer_start', 'enhancer_end', 'enhancer_Gnocchi']]
    df = pd.merge(labeled_enhancers_with_features, labeled_enhancers_with_Gnocchi, on=['enhancer_chrom', 'enhancer_start', 'enhancer_end'], how='inner')
    df = df.drop_duplicates()
    df = df[df['truly constrained']] # "essential" enhancers
    df = df[['enhancer_chrom', 'enhancer_start', 'enhancer_end', 'enhancer_Gnocchi', 'truly constrained', 'B', 'B_M1star.EUR', 'GC_content_1000bp']]
    df = df.rename(columns={
        'enhancer_chrom': 'chromosome',
        'enhancer_start': 'start',
        'enhancer_end': 'end',
        'enhancer_Gnocchi': 'gnocchi',
    })
    return df 

get_positive_examples_with_Gnocchi()

Unnamed: 0,chromosome,start,end,gnocchi,truly constrained,B,B_M1star.EUR,GC_content_1000bp
0,chr1,1554620,1555020,4.059724,True,0.652,0.108103,0.606394
1,chr1,2128961,2129161,6.530123,True,0.841,0.347981,0.585415
2,chr1,2268561,2268761,5.007183,True,0.847,0.347981,0.602398
3,chr1,2545161,2545361,2.775673,True,0.840,0.347981,0.640360
4,chr1,3208836,3209036,6.070480,True,0.966,0.788536,0.525475
...,...,...,...,...,...,...,...,...
3651,chr9,136914548,136914748,2.409869,True,0.581,0.869820,0.508492
3652,chr9,136985948,136986348,1.520249,True,0.567,0.165515,0.618382
3653,chr9,137138948,137139148,3.286597,True,0.556,0.165515,0.717283
3654,chr9,137446948,137447348,3.799187,True,0.536,0.165515,0.532468


In [3]:
def get_negative_examples_with_Gnocchi(sample_size):
    df = pd.read_csv(
        f'{CONSTRAINT_TOOLS_DATA}/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM/Supplementary_Data_2.features.constraint_scores.bed',
        sep='\t',
    ) # windows don't overlap exons 
    df = df[df['window overlaps enhancer'] == False] # windows don't overlap enhancers
    df = df.sample(sample_size)
    df = df[['chrom', 'start', 'end', 'gnocchi', 'B', 'B_M1star.EUR', 'GC_content_1000bp']]
    df = df.rename(columns={'chrom': 'chromosome'})
    df['truly constrained'] = False
    return df

get_negative_examples_with_Gnocchi(sample_size=100)

Unnamed: 0,chromosome,start,end,gnocchi,B,B_M1star.EUR,GC_content_1000bp,truly constrained
392584,chr18,65618000,65619000,-2.138835,0.879,0.049794,0.332667,False
680336,chr4,132284000,132285000,-2.475938,0.838,0.044665,0.384615,False
848335,chr7,12326000,12327000,-0.656614,0.821,0.423095,0.313686,False
472803,chr2,158050000,158051000,-0.790551,0.835,0.666322,0.344655,False
254436,chr13,96597000,96598000,-1.076755,0.747,0.092393,0.335664,False
...,...,...,...,...,...,...,...,...
850033,chr7,16560000,16561000,-0.922760,0.929,0.620350,0.310689,False
38647,chr1,108546000,108547000,-1.819044,0.764,-0.256439,0.379620,False
61452,chr1,198466000,198467000,-0.235967,0.735,0.050336,0.363636,False
941144,chr8,95945000,95946000,3.796547,0.825,0.276146,0.567433,False


In [4]:
def get_truth_set_with_Gnocchi(): 
    positive_examples = get_positive_examples_with_Gnocchi()
    negative_examples = get_negative_examples_with_Gnocchi(sample_size=len(positive_examples))
    df = pd.concat([positive_examples, negative_examples])
    df.to_csv(f'{CONSTRAINT_TOOLS_DATA}/stringent_truth_set/truth-set.gnocchi.bed', sep='\t', index=False)
    return df

get_truth_set_with_Gnocchi()

Unnamed: 0,chromosome,start,end,gnocchi,truly constrained,B,B_M1star.EUR,GC_content_1000bp
0,chr1,1554620,1555020,4.059724,True,0.652,0.108103,0.606394
1,chr1,2128961,2129161,6.530123,True,0.841,0.347981,0.585415
2,chr1,2268561,2268761,5.007183,True,0.847,0.347981,0.602398
3,chr1,2545161,2545361,2.775673,True,0.840,0.347981,0.640360
4,chr1,3208836,3209036,6.070480,True,0.966,0.788536,0.525475
...,...,...,...,...,...,...,...,...
794242,chr6,46411000,46412000,0.025313,False,0.842,0.076253,0.363636
807762,chr6,82997000,82998000,-0.291330,False,0.732,0.213906,0.387612
9195,chr1,30222000,30223000,0.489599,False,0.953,0.151525,0.446553
976752,chr9,38203000,38204000,2.216012,False,0.782,0.681746,0.441558


## Assign lambda_s to truth set

In [5]:
import polars as pl 

from shell import shell 
from colorize import print_string_as_info
from aggregate import aggregate_polars

def get_header(filename):
  with open(filename) as fh: 
    header = fh.readline().strip().split('\t')
  return header

def intersect(a_filename, b_filename, intersect_filename, b_class): 
  cmd = (
    f'bedtools intersect'
    f" -a <(tail -n +2 {a_filename})" # contains header
    f" -b <(tail -n +2 {b_filename})" # contains header
    f' -wao -f 0.5' 
    f' > {intersect_filename}'
  )
  shell(cmd) 
  print_string_as_info(f'Wrote {intersect_filename}')

  a_file_header = get_header(a_filename) # contains header
  b_file_header = get_header(b_filename) # contains header
  b_file_header = [f'{b_class}_{field}' for field in b_file_header[:3]] + b_file_header[3:]

  return a_file_header, b_file_header

def make_scores_numeric(df: pl.DataFrame, b_class: str, b_features: list) -> pl.DataFrame:
    # we don't want to include windows that don't overlap any windows with b_feature values
    df = df.filter(pl.col(f'{b_class}_chromosome') != '.')

    # having removed records where 'b_class_chromosome' is '.', 
    # we can now convert the 'b_features' columns to floats
    for b_feature in b_features:
        df = df.with_columns(pl.col(b_feature).cast(pl.Float64))

    return df

def intersect_and_aggregate(a_filename_stem, b_filename, b_class, b_features, b_class_aggregation_functions): 
  a_filename = f'{a_filename_stem}.bed'
  intersect_filename = f'{a_filename_stem}.intersect.{b_class}.bed'

  a_file_header, b_file_header = intersect(a_filename, b_filename, intersect_filename, b_class)

  df = pl.read_csv(
    intersect_filename, 
    separator='\t',
    new_columns=a_file_header + b_file_header + [f'truthSetWindow_{b_class}Window_overlap'],
    infer_schema_length=1000000
  )

  df = make_scores_numeric(df, b_class, b_features) 

  # some truth-set windows may intersect multiple b_feature-windows, 
  # so let's group by truth-set window, and aggregate scores over all b_feature-windows in the group
  df = aggregate_polars(
    df, 
    group_columns = a_file_header,
    aggregation_functions = b_class_aggregation_functions
  )

  return df 

In [6]:
def create_bed_file_of_lambda_s_scores(): 
    # https://mail.google.com/mail/u/0/#inbox/QgrcJHrjCsBTxVdFdTZvkMTlDfGKRnDvZxl
    # http://compgen.cshl.edu/extrainsight/description.php
    df = pl.read_csv(
        f'{CONSTRAINT_TOOLS_DATA}/lambda_s/Results_26July2024.csv',
        infer_schema_length=1000000,
    )
    df = df.with_columns(
        df["start"].cast(pl.Int64),
        df["end"].cast(pl.Int64),
    )
    df = df.to_pandas()

    # Nurdan: "I've included the windows for which ExtRaINSIGHT does not report results, 
    # as they do not pass the filtering steps. 
    # In these cases, all values from columns 4 to 9 are 0."
    df = df[df['num_possible_mutations'] > 0]

    df = df[['chr', 'start', 'end', 'strong_selection']]

    df = df.rename(columns={
        'chr': 'chromosome', 
        'strong_selection': 'lambda_s'
    })

    df.to_csv(f'{CONSTRAINT_TOOLS_DATA}/lambda_s/Results_26July2024.filtered.bed', sep='\t', index=False)
    
    return df 
    
create_bed_file_of_lambda_s_scores()

Unnamed: 0,chromosome,start,end,lambda_s
0,chr1,1432000,1433000,0.087061
1,chr1,1451000,1452000,0.061982
2,chr1,1453000,1454000,0.103610
3,chr1,1458000,1459000,-0.019612
4,chr1,1463000,1464000,0.255973
...,...,...,...,...
1003222,chr9,137262000,137263000,-0.299242
1003223,chr9,137268000,137269000,0.142611
1003224,chr9,137269000,137270000,0.244719
1003225,chr9,137275000,137276000,-0.003339


In [7]:
def assign_lambda_s_to_truth_set(): 
    # TODO: 
    # once Nurdan computes lambda_s for truth-set windows,
    # we can change the implementation of this function to perform a df-merge operation instead of bedtools-intersect
    # c.f., get_positive_examples_with_Gnocchi (above)
    df = intersect_and_aggregate(
        a_filename_stem = f'{CONSTRAINT_TOOLS_DATA}/stringent_truth_set/truth-set.gnocchi', 
        b_filename = f'{CONSTRAINT_TOOLS_DATA}/lambda_s/Results_26July2024.filtered.bed', 
        b_class = 'lambda_s',
        b_features = ['lambda_s'], 
        b_class_aggregation_functions = [pl.col('lambda_s').max()] 
    ) 
    df = df.to_pandas()
    df.to_csv(f'{CONSTRAINT_TOOLS_DATA}/stringent_truth_set/truth-set.gnocchi.lambda_s.bed', sep='\t', index=False)
    return df 

assign_lambda_s_to_truth_set()




[36mWrote /scratch/ucgd/lustre-labs/quinlan/data-shared/constraint-tools/stringent_truth_set/truth-set.gnocchi.intersect.lambda_s.bed[0m


Unnamed: 0,chromosome,start,end,gnocchi,truly constrained,B,B_M1star.EUR,GC_content_1000bp,lambda_s
0,chr1,2128961,2129161,6.530123,True,0.841,0.347981,0.585415,0.117883
1,chr1,2268561,2268761,5.007183,True,0.847,0.347981,0.602398,0.115906
2,chr1,6240740,6241540,3.309271,True,0.872,0.014875,0.548452,0.090827
3,chr1,6483340,6483540,3.687737,True,0.837,0.014875,0.572428,0.088351
4,chr1,6697340,6697540,1.642701,True,0.708,0.014875,0.461538,-0.008011
...,...,...,...,...,...,...,...,...,...
4970,chr6,46411000,46412000,0.025313,False,0.842,0.076253,0.363636,0.034791
4971,chr6,82997000,82998000,-0.291330,False,0.732,0.213906,0.387612,-0.033328
4972,chr1,30222000,30223000,0.489599,False,0.953,0.151525,0.446553,0.031316
4973,chr9,38203000,38204000,2.216012,False,0.782,0.681746,0.441558,0.254321


## Assign Depletion Rank to truth set

In [8]:
def create_bed_file_of_depletion_rank_scores(): 
    df = pl.read_csv(
        f'{CONSTRAINT_TOOLS_DATA}/depletion_rank_scores/41586_2022_4965_MOESM3_ESM', 
        separator='\t',
        # n_rows=1000000  # TODO: testing
    )
    df = df.with_columns((1-pl.col('rank')).alias('depletion_rank_constraint_score_complement'))
    df = df.to_pandas()
    df = df.rename(columns={
        'Chr': 'chromosome',
        'Fromx': 'start',
        'To': 'end',
    })
    df = df.drop(columns=['rank'])
    df.to_csv(f'{CONSTRAINT_TOOLS_DATA}/depletion_rank_scores/41586_2022_4965_MOESM3_ESM.reformatted.bed', sep='\t', index=False)
    return df

create_bed_file_of_depletion_rank_scores()

Unnamed: 0,chromosome,start,end,depletion_rank_constraint_score_complement
0,chr1,777500,778000,0.401760
1,chr1,777550,778050,0.350457
2,chr1,777600,778100,0.317118
3,chr1,777650,778150,0.427053
4,chr1,777700,778200,0.497556
...,...,...,...,...
49104021,chr9,138171800,138172300,0.690354
49104022,chr9,138171850,138172350,0.491224
49104023,chr9,138171900,138172400,0.327472
49104024,chr9,138171950,138172450,0.525170


In [9]:
def assign_depletion_rank_to_truth_set(): 
    df = intersect_and_aggregate(
        a_filename_stem = f'{CONSTRAINT_TOOLS_DATA}/stringent_truth_set/truth-set.gnocchi.lambda_s', 
        b_filename = f'{CONSTRAINT_TOOLS_DATA}/depletion_rank_scores/41586_2022_4965_MOESM3_ESM.reformatted.bed', 
        b_class = 'depletion_rank',
        b_features = ['depletion_rank_constraint_score_complement'], 
        b_class_aggregation_functions = [pl.col('depletion_rank_constraint_score_complement').max()] # expect that this pulls down the window that maximally overlaps enhancer
    ) 
    df = df.to_pandas()
    df.to_csv(f'{CONSTRAINT_TOOLS_DATA}/stringent_truth_set/truth-set.gnocchi.lambda_s.depletion_rank.bed', sep='\t', index=False)
    return df 

assign_depletion_rank_to_truth_set()




[36mWrote /scratch/ucgd/lustre-labs/quinlan/data-shared/constraint-tools/stringent_truth_set/truth-set.gnocchi.lambda_s.intersect.depletion_rank.bed[0m


Unnamed: 0,chromosome,start,end,gnocchi,truly constrained,B,B_M1star.EUR,GC_content_1000bp,lambda_s,depletion_rank_constraint_score_complement
0,chr1,2128961,2129161,6.530123,True,0.841,0.347981,0.585415,0.117883,0.899933
1,chr1,2268561,2268761,5.007183,True,0.847,0.347981,0.602398,0.115906,0.600846
2,chr1,6240740,6241540,3.309271,True,0.872,0.014875,0.548452,0.090827,0.879515
3,chr1,6483340,6483540,3.687737,True,0.837,0.014875,0.572428,0.088351,0.832178
4,chr1,6697340,6697540,1.642701,True,0.708,0.014875,0.461538,-0.008011,0.925050
...,...,...,...,...,...,...,...,...,...,...
4970,chr6,46411000,46412000,0.025313,False,0.842,0.076253,0.363636,0.034791,0.726152
4971,chr6,82997000,82998000,-0.291330,False,0.732,0.213906,0.387612,-0.033328,0.374379
4972,chr1,30222000,30223000,0.489599,False,0.953,0.151525,0.446553,0.031316,0.782622
4973,chr9,38203000,38204000,2.216012,False,0.782,0.681746,0.441558,0.254321,0.990101


## Assign CDTS to truth set

In [10]:
def create_bed_file_of_CDTS_scores(): 
    df = pl.read_csv(
        f'{CONSTRAINT_TOOLS_DATA}/CDTS/CDTS.gnomAD.hg38.noncoding.enhancer.bed', # this contains the correct coordinates of windows
        separator='\t',
        # n_rows=10000000  # TODO: testing
    )
    df = df.with_columns((100-pl.col('percentile_rank_of_observed_minus_expected')).alias('percentile_rank_of_observed_minus_expected_complement'))
    df = df.to_pandas()
    df = df[['chromosome', 'start', 'end', 'percentile_rank_of_observed_minus_expected_complement']]
    df.to_csv(f'{CONSTRAINT_TOOLS_DATA}/CDTS/CDTS.gnomAD.hg38.noncoding.reformatted.bed', sep='\t', index=False)
    return df

create_bed_file_of_CDTS_scores()

Unnamed: 0,chromosome,start,end,percentile_rank_of_observed_minus_expected_complement
0,chr1,47939,48490,98.547897
1,chr1,47949,48500,98.567632
2,chr1,47959,48510,98.567632
3,chr1,47999,48550,98.552431
4,chr1,59349,59900,98.078534
...,...,...,...,...
207098315,chr17,58762584,58763135,54.576893
207098316,chr17,58762594,58763145,54.223632
207098317,chr17,58762604,58763155,54.177060
207098318,chr17,58762614,58763165,53.862348


In [11]:
def assign_CDTS_to_truth_set(): 
    df = intersect_and_aggregate(
        a_filename_stem = f'{CONSTRAINT_TOOLS_DATA}/stringent_truth_set/truth-set.gnocchi.lambda_s.depletion_rank', 
        b_filename = f'{CONSTRAINT_TOOLS_DATA}/CDTS/CDTS.gnomAD.hg38.noncoding.reformatted.bed', 
        b_class = 'CDTS',
        b_features = ['percentile_rank_of_observed_minus_expected_complement'], 
        b_class_aggregation_functions = [pl.col('percentile_rank_of_observed_minus_expected_complement').max()] # expect that this pulls down the window that maximally overlaps enhancer
    ) 
    df = df.to_pandas()
    df.to_csv(f'{CONSTRAINT_TOOLS_DATA}/stringent_truth_set/truth-set.gnocchi.lambda_s.depletion_rank.CDTS.bed', sep='\t', index=False)
    return df 

assign_CDTS_to_truth_set()




[36mWrote /scratch/ucgd/lustre-labs/quinlan/data-shared/constraint-tools/stringent_truth_set/truth-set.gnocchi.lambda_s.depletion_rank.intersect.CDTS.bed[0m


Unnamed: 0,chromosome,start,end,gnocchi,truly constrained,B,B_M1star.EUR,GC_content_1000bp,lambda_s,depletion_rank_constraint_score_complement,percentile_rank_of_observed_minus_expected_complement
0,chr1,2128961,2129161,6.530123,True,0.841,0.347981,0.585415,0.117883,0.899933,53.544438
1,chr1,2268561,2268761,5.007183,True,0.847,0.347981,0.602398,0.115906,0.600846,47.909374
2,chr1,6240740,6241540,3.309271,True,0.872,0.014875,0.548452,0.090827,0.879515,99.420742
3,chr1,6483340,6483540,3.687737,True,0.837,0.014875,0.572428,0.088351,0.832178,92.653453
4,chr1,6697340,6697540,1.642701,True,0.708,0.014875,0.461538,-0.008011,0.925050,99.234114
...,...,...,...,...,...,...,...,...,...,...,...
4928,chr6,46411000,46412000,0.025313,False,0.842,0.076253,0.363636,0.034791,0.726152,92.971132
4929,chr6,82997000,82998000,-0.291330,False,0.732,0.213906,0.387612,-0.033328,0.374379,56.933344
4930,chr1,30222000,30223000,0.489599,False,0.953,0.151525,0.446553,0.031316,0.782622,96.955803
4931,chr9,38203000,38204000,2.216012,False,0.782,0.681746,0.441558,0.254321,0.990101,66.855437
