In [1]:
# Location of the constraint-tools directory; only its `utilities` subdirectory is used here (added to sys.path below).
CONSTRAINT_TOOLS = '/scratch/ucgd/lustre-labs/quinlan/u6018199/constraint-tools'
# Base directory under which the functions in this notebook read their inputs and write their outputs.
CONSTRAINT_TOOLS_DATA = '/scratch/ucgd/lustre-labs/quinlan/data-shared/constraint-tools'

import sys
# Put the `utilities` directory on the module search path
# (presumably where the `shell`, `colorize` and `aggregate` modules imported later live — TODO confirm).
sys.path.append(f'{CONSTRAINT_TOOLS}/utilities')

## Gnocchi truth set

In [2]:
import pandas as pd 

def get_gnocchi_for_essential_enhancers():
    """
    Return the 'truly constrained' labeled enhancers together with their Gnocchi scores.

    Reads the labeled-enhancer feature table and the Chen et al. 2023
    Supplementary_Data_6 table, inner-joins them on the enhancer coordinates,
    removes duplicated rows, and keeps only rows whose 'truly constrained'
    value is truthy.

    Returns:
        pandas.DataFrame with columns
        chromosome, start, end, gnocchi, truly constrained, B, B_M1star.EUR, GC_content_1000bp.
    """
    coordinate_columns = ['enhancer_chrom', 'enhancer_start', 'enhancer_end']

    feature_table = pd.read_csv(
        f'{CONSTRAINT_TOOLS_DATA}/labeled-enhancers/labeled-enhancers.BGS.gBGC.GC_content.bed',
        sep='\t',
    )
    gnocchi_table = pd.read_csv(
        f'{CONSTRAINT_TOOLS_DATA}/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM/Supplementary_Data_6_ESM.bed',
        sep='\t',
    )
    # only the coordinates (join key) and the score are needed from the Gnocchi table
    gnocchi_table = gnocchi_table[coordinate_columns + ['enhancer_Gnocchi']]

    merged = (
        feature_table
        .merge(gnocchi_table, on=coordinate_columns, how='inner')
        .drop_duplicates()
    )
    constrained = merged[merged['truly constrained']]

    output_columns = coordinate_columns + [
        'enhancer_Gnocchi', 'truly constrained', 'B', 'B_M1star.EUR', 'GC_content_1000bp',
    ]
    return constrained[output_columns].rename(columns={
        'enhancer_chrom': 'chromosome',
        'enhancer_start': 'start',
        'enhancer_end': 'end',
        'enhancer_Gnocchi': 'gnocchi',
    })

get_gnocchi_for_essential_enhancers()

Unnamed: 0,chromosome,start,end,gnocchi,truly constrained,B,B_M1star.EUR,GC_content_1000bp
0,chr1,1554620,1555020,4.059724,True,0.652,0.108103,0.606394
1,chr1,2128961,2129161,6.530123,True,0.841,0.347981,0.585415
2,chr1,2268561,2268761,5.007183,True,0.847,0.347981,0.602398
3,chr1,2545161,2545361,2.775673,True,0.840,0.347981,0.640360
4,chr1,3208836,3209036,6.070480,True,0.966,0.788536,0.525475
...,...,...,...,...,...,...,...,...
3651,chr9,136914548,136914748,2.409869,True,0.581,0.869820,0.508492
3652,chr9,136985948,136986348,1.520249,True,0.567,0.165515,0.618382
3653,chr9,137138948,137139148,3.286597,True,0.556,0.165515,0.717283
3654,chr9,137446948,137447348,3.799187,True,0.536,0.165515,0.532468


In [3]:
def get_gnocchi_for_unconstrained_windows(sample_size, random_state=None):
    """
    Draw a random sample of windows that do not overlap enhancers, with their Gnocchi scores.

    Args:
        sample_size: number of windows to draw (without replacement, pandas default).
        random_state: optional seed / RandomState forwarded to `DataFrame.sample`.
            The default (None) keeps the original, unseeded behaviour; pass an
            integer to make the sampled negatives reproducible across runs.

    Returns:
        pandas.DataFrame with columns
        chromosome, start, end, gnocchi, B, B_M1star.EUR, GC_content_1000bp, truly constrained
        where 'truly constrained' is False for every row.

    Raises:
        ValueError: (from pandas) if `sample_size` exceeds the number of eligible windows.
    """
    # per the original author's note, the windows in this file don't overlap exons
    windows = pd.read_csv(
        f'{CONSTRAINT_TOOLS_DATA}/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM/Supplementary_Data_2.features.constraint_scores.bed',
        sep='\t',
    )
    # keep only windows that don't overlap enhancers
    eligible = windows[windows['window overlaps enhancer'] == False]
    sampled = eligible.sample(sample_size, random_state=random_state)

    output_columns = ['chrom', 'start', 'end', 'gnocchi', 'B', 'B_M1star.EUR', 'GC_content_1000bp']
    return (
        sampled[output_columns]
        .rename(columns={'chrom': 'chromosome'})
        # these windows serve as the negative class of the truth set
        .assign(**{'truly constrained': False})
    )

get_gnocchi_for_unconstrained_windows(sample_size=100)

Unnamed: 0,chromosome,start,end,gnocchi,B,B_M1star.EUR,GC_content_1000bp,truly constrained
668567,chr4,101542000,101543000,-2.834965,0.913,0.711888,0.462537,False
185146,chr12,25573000,25574000,0.587890,0.920,0.252258,0.425574,False
923003,chr8,53280000,53281000,-1.351145,0.883,0.057438,0.467532,False
611726,chr3,159037000,159038000,2.199287,0.837,0.647116,0.306693,False
512673,chr20,15158000,15159000,1.190214,0.878,0.648430,0.362637,False
...,...,...,...,...,...,...,...,...
557908,chr3,17295000,17296000,-1.770893,0.541,-0.295711,0.357642,False
716612,chr5,24969000,24970000,0.321561,0.870,0.125001,0.280719,False
52059,chr1,175031000,175032000,3.013601,0.548,0.359957,0.508492,False
180777,chr12,14332000,14333000,-0.643628,0.764,0.084526,0.469530,False


In [4]:
def get_gnocchi_truth_set():
    """
    Build the Gnocchi truth set and write it to disk.

    Stacks the constrained enhancers on top of an equally sized sample of
    windows that don't overlap enhancers, writes the result (tab-separated,
    without the index) to stringent_truth_sets/gnocchi-truth-set.bed, and
    returns it.
    """
    positives = get_gnocchi_for_essential_enhancers()
    # request exactly as many negatives as there are positives
    negatives = get_gnocchi_for_unconstrained_windows(sample_size=len(positives))

    truth_set = pd.concat([positives, negatives])
    output_path = f'{CONSTRAINT_TOOLS_DATA}/stringent_truth_sets/gnocchi-truth-set.bed'
    truth_set.to_csv(output_path, sep='\t', index=False)
    return truth_set

get_gnocchi_truth_set()

Unnamed: 0,chromosome,start,end,gnocchi,truly constrained,B,B_M1star.EUR,GC_content_1000bp
0,chr1,1554620,1555020,4.059724,True,0.652,0.108103,0.606394
1,chr1,2128961,2129161,6.530123,True,0.841,0.347981,0.585415
2,chr1,2268561,2268761,5.007183,True,0.847,0.347981,0.602398
3,chr1,2545161,2545361,2.775673,True,0.840,0.347981,0.640360
4,chr1,3208836,3209036,6.070480,True,0.966,0.788536,0.525475
...,...,...,...,...,...,...,...,...
113536,chr10,90560000,90561000,1.536373,False,0.803,0.136005,0.375624
131541,chr11,9337000,9338000,-1.182434,False,0.793,0.394411,0.432567
991343,chr9,103067000,103068000,0.887998,False,0.936,0.116710,0.395604
857706,chr7,34770000,34771000,2.594133,False,0.866,0.524225,0.390609


## lambda_s truth set

In [5]:
import polars as pl 

def create_bed_file_of_lambda_s_scores():
    """
    Convert the lambda_s results CSV into a filtered, tab-separated file and return it.

    Provenance (links kept from the original notebook; the first is a private mail thread):
        https://mail.google.com/mail/u/0/#inbox/QgrcJHrjCsBTxVdFdTZvkMTlDfGKRnDvZxl
        http://compgen.cshl.edu/extrainsight/description.php

    Returns:
        pandas.DataFrame with columns chromosome, start, end, lambda_s;
        the same table is written to lambda_s/Results_26July2024.filtered.bed.
    """
    raw_scores = pl.read_csv(
        f'{CONSTRAINT_TOOLS_DATA}/lambda_s/Results_26July2024.csv',
        infer_schema_length=1000000,
    )
    # force the coordinates to 64-bit integers before handing over to pandas
    typed_scores = raw_scores.with_columns(
        pl.col("start").cast(pl.Int64),
        pl.col("end").cast(pl.Int64),
    )
    scores = typed_scores.to_pandas()

    # Nurdan: "I've included the windows for which ExtRaINSIGHT does not report results,
    # as they do not pass the filtering steps.
    # In these cases, all values from columns 4 to 9 are 0."
    has_results = scores['num_possible_mutations'] > 0

    scores = scores.loc[has_results, ['chr', 'start', 'end', 'strong_selection']].rename(columns={
        'chr': 'chromosome',
        'strong_selection': 'lambda_s',
    })

    output_path = f'{CONSTRAINT_TOOLS_DATA}/lambda_s/Results_26July2024.filtered.bed'
    scores.to_csv(output_path, sep='\t', index=False)

    return scores
    
create_bed_file_of_lambda_s_scores()

Unnamed: 0,chromosome,start,end,lambda_s
0,chr1,1432000,1433000,0.087061
1,chr1,1451000,1452000,0.061982
2,chr1,1453000,1454000,0.103610
3,chr1,1458000,1459000,-0.019612
4,chr1,1463000,1464000,0.255973
...,...,...,...,...
1003222,chr9,137262000,137263000,-0.299242
1003223,chr9,137268000,137269000,0.142611
1003224,chr9,137269000,137270000,0.244719
1003225,chr9,137275000,137276000,-0.003339


In [6]:
from shell import shell 
from colorize import print_string_as_info
from aggregate import aggregate_polars

def get_header(filename):
  """Return the first line of `filename`, stripped of surrounding whitespace and split on tabs."""
  with open(filename) as handle:
    first_line = handle.readline()
  return first_line.strip().split('\t')

def intersect(a_filename, b_filename, intersect_filename, b_class): 
  """
  Run `bedtools intersect -wao -f 0.5` on two headered, tab-separated files.

  The header line of each input is skipped (`tail -n +2`) before the records
  are handed to bedtools, and the result is redirected to `intersect_filename`.
  File names are shell-quoted so that paths containing spaces or shell
  metacharacters do not break (or alter) the command.

  Args:
      a_filename: path of the "A" file (first line is a header).
      b_filename: path of the "B" file (first line is a header).
      intersect_filename: path the bedtools output is written to.
      b_class: label prepended (as `{b_class}_`) to the first three B column names,
          so they can be told apart from the A columns.

  Returns:
      (a_file_header, b_file_header): the column names of A, and the column
      names of B with its first three fields prefixed.
  """
  import shlex  # local import: only needed to quote the paths below

  quoted_a = shlex.quote(str(a_filename))
  quoted_b = shlex.quote(str(b_filename))
  quoted_out = shlex.quote(str(intersect_filename))

  # NOTE(review): `<( ... )` is process substitution — presumably `shell` runs the command under bash; confirm
  cmd = (
    f'bedtools intersect'
    f" -a <(tail -n +2 {quoted_a})" # contains header
    f" -b <(tail -n +2 {quoted_b})" # contains header
    f' -wao -f 0.5' 
    f' > {quoted_out}'
  )
  shell(cmd) 
  print_string_as_info(f'Wrote {intersect_filename}')

  a_file_header = get_header(a_filename) # contains header
  b_file_header = get_header(b_filename) # contains header
  # prefix only the first three fields of B; remaining column names are kept as-is
  b_file_header = [f'{b_class}_{field}' for field in b_file_header[:3]] + b_file_header[3:]

  return a_file_header, b_file_header

def make_scores_numeric(df: pl.DataFrame, b_class: str, b_features: list) -> pl.DataFrame:
    """
    Drop records without a B-side match or score and cast the score columns to floats.

    Args:
        df: intersect table; must contain a `{b_class}_chromosome` column and
            every column named in `b_features`.
        b_class: prefix used for the B-side coordinate columns.
        b_features: names of the B-side score columns to make numeric.

    Returns:
        The filtered frame with each `b_features` column cast to Float64.
    """
    # we don't want to include windows that don't overlap any windows with b_feature values
    df = df.filter(pl.col(f'{b_class}_chromosome') != '.')

    # having removed records where 'b_class_chromosome' is '.', 
    # we can now convert the 'b_features' columns to floats
    for b_feature in b_features:
        # some values are missing and show up as '.' (e.g. gBGC).
        # Compare on a string view of the column: when no '.' placeholder is present the
        # column may already have been read as a numeric type, and comparing a numeric
        # column directly against the string '.' is not a valid comparison.
        df = df.filter(pl.col(b_feature).cast(pl.Utf8) != '.')
        df = df.with_columns(pl.col(b_feature).cast(pl.Float64))

    return df

def intersect_and_aggregate(a_filename_stem, b_filename, b_class, b_features, b_class_aggregation_functions): 
  """
  Intersect `{a_filename_stem}.bed` with `b_filename` and aggregate B scores per A record.

  Args:
      a_filename_stem: path of the A file without its `.bed` extension; the
          intersect output is written to `{a_filename_stem}.intersect.{b_class}.bed`.
      b_filename: path of the B file (headered, tab-separated).
      b_class: label used to prefix the B coordinate columns and name the output file.
      b_features: B score columns that must be converted to floats.
      b_class_aggregation_functions: polars expressions passed to `aggregate_polars`.

  Returns:
      The result of `aggregate_polars`, grouped by all columns of the A file.
  """
  a_filename = f'{a_filename_stem}.bed'
  intersect_filename = f'{a_filename_stem}.intersect.{b_class}.bed'

  a_file_header, b_file_header = intersect(a_filename, b_filename, intersect_filename, b_class)

  df = pl.read_csv(
    intersect_filename, 
    separator='\t',
    # the intersect output has no header line (headers are stripped from both inputs
    # before bedtools sees them); without this, the first record would be consumed
    # as a header and silently dropped
    has_header=False,
    new_columns=a_file_header + b_file_header + [f'labeledEnhancer_{b_class}Window_overlap'],
    infer_schema_length=1000000
  )

  df = make_scores_numeric(df, b_class, b_features) 

  # some labeled enhancers may intersect multiple b_feature-windows, 
  # so let's group by labeled enhancer, and aggregate scores over all b_feature-windows in the group
  df = aggregate_polars(
    df, 
    group_columns = a_file_header,
    aggregation_functions = b_class_aggregation_functions
  )

  return df 

In [7]:
def get_lambda_s_for_essential_enhancers():
    """
    Return the 'truly constrained' labeled enhancers with a lambda_s score.

    Each enhancer is intersected with the lambda_s windows; when several
    windows match, the largest lambda_s among them is kept.

    Returns:
        pandas.DataFrame with columns
        chromosome, start, end, truly constrained, B, B_M1star.EUR, GC_content_1000bp, lambda_s.
    """
    # TODO: 
    # once Nurdan computes lambda_s for all essential enhancers,
    # we can change the implementation of this function to perform a df-merge operation instead of bedtools-intersect
    # c.f., get_gnocchi_for_essential_enhancers (above)
    enhancers_with_scores = intersect_and_aggregate(
        a_filename_stem=f'{CONSTRAINT_TOOLS_DATA}/labeled-enhancers/labeled-enhancers.BGS.gBGC.GC_content',
        b_filename=f'{CONSTRAINT_TOOLS_DATA}/lambda_s/Results_26July2024.filtered.bed',
        b_class='lambda_s',
        b_features=['lambda_s'],
        b_class_aggregation_functions=[pl.col('lambda_s').max()],
    )

    output_columns = [
        'enhancer_chrom', 'enhancer_start', 'enhancer_end',
        'truly constrained', 'B', 'B_M1star.EUR', 'GC_content_1000bp', 'lambda_s',
    ]
    new_names = {
        'enhancer_chrom': 'chromosome',
        'enhancer_start': 'start',
        'enhancer_end': 'end',
    }
    return (
        enhancers_with_scores
        .filter(pl.col('truly constrained'))
        .select(output_columns)
        .rename(new_names)
        .to_pandas()
    )

get_lambda_s_for_essential_enhancers()




[36mWrote /scratch/ucgd/lustre-labs/quinlan/data-shared/constraint-tools/labeled-enhancers/labeled-enhancers.BGS.gBGC.GC_content.intersect.lambda_s.bed[0m


Unnamed: 0,chromosome,start,end,truly constrained,B,B_M1star.EUR,GC_content_1000bp,lambda_s
0,chr1,2128961,2129161,True,0.841,0.347981,0.585415,0.117883
1,chr1,2268561,2268761,True,0.847,0.347981,0.602398,0.115906
2,chr1,6240740,6241540,True,0.872,0.014875,0.548452,0.090827
3,chr1,6483340,6483540,True,0.837,0.014875,0.572428,0.088351
4,chr1,6697340,6697540,True,0.708,0.014875,0.461538,-0.008011
...,...,...,...,...,...,...,...,...
1670,chr9,135910954,135911154,True,0.881,0.349248,0.489510,0.012404
1671,chr9,136538548,136538748,True,0.899,0.869820,0.629371,-0.026733
1672,chr9,136626148,136626748,True,0.893,0.869820,0.657343,-0.026376
1673,chr9,136811348,136811548,True,0.592,0.869820,0.489510,-0.199961


In [8]:
def get_lambda_s_for_unconstrained_windows(sample_size, random_state=None):
    """
    Draw a random sample of windows that do not overlap enhancers, with their lambda_s scores.

    Provenance (links kept from the original notebook; the first is a private mail thread):
        https://mail.google.com/mail/u/0/#inbox/QgrcJHrjCsBTxVdFdTZvkMTlDfGKRnDvZxl
        http://compgen.cshl.edu/extrainsight/description.php

    Args:
        sample_size: number of windows to draw.
        random_state: optional seed / RandomState forwarded to `DataFrame.sample`.
            The default (None) keeps the original, unseeded behaviour; pass an
            integer to make the sampled negatives reproducible across runs.

    Returns:
        pandas.DataFrame with columns
        chromosome, start, end, lambda_s, B, B_M1star.EUR, GC_content_1000bp, truly constrained
        where 'truly constrained' is False for every row.

    Raises:
        ValueError: (from pandas) if `sample_size` exceeds the number of eligible windows.
    """
    windows_with_lambda_s = pl.read_csv(
        f'{CONSTRAINT_TOOLS_DATA}/lambda_s/Results_26July2024.filtered.bed',
        separator='\t',
        infer_schema_length=1000000,
    ).to_pandas()

    # per the original author's note, the windows in this file don't overlap exons
    windows_with_features_and_enhancer_status = pd.read_csv(
        f'{CONSTRAINT_TOOLS_DATA}/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM/Supplementary_Data_2.features.constraint_scores.bed',
        sep='\t',
    )[
        ['chrom', 'start', 'end', 'window overlaps enhancer', 'B', 'B_M1star.EUR', 'GC_content_1000bp']
    ].rename(columns={'chrom': 'chromosome'})

    # only windows present in both tables (identical coordinates) are kept
    merged = windows_with_features_and_enhancer_status.merge(
        windows_with_lambda_s,
        on=['chromosome', 'start', 'end'],
        how='inner',
    )

    # keep only windows that don't overlap enhancers
    eligible = merged[merged['window overlaps enhancer'] == False]
    sampled = eligible.sample(sample_size, random_state=random_state)

    output_columns = ['chromosome', 'start', 'end', 'lambda_s', 'B', 'B_M1star.EUR', 'GC_content_1000bp']
    # these windows serve as the negative class of the truth set
    return sampled[output_columns].assign(**{'truly constrained': False})

get_lambda_s_for_unconstrained_windows(sample_size=100)

Unnamed: 0,chromosome,start,end,lambda_s,B,B_M1star.EUR,GC_content_1000bp,truly constrained
519603,chr3,103702000,103703000,0.018649,0.890,0.036429,0.334665,False
328717,chr18,28559000,28560000,-0.030686,0.763,-0.071301,0.397602,False
516751,chr3,96588000,96589000,0.036268,0.761,-0.070425,0.284715,False
666677,chr5,82155000,82156000,0.167673,0.592,0.593630,0.339660,False
819381,chr7,120533000,120534000,0.172400,0.661,0.195483,0.339660,False
...,...,...,...,...,...,...,...,...
863269,chr8,77258000,77259000,0.068297,0.659,-0.120370,0.319680,False
550183,chr3,178367000,178368000,0.025368,0.876,0.121621,0.367632,False
890670,chr8,142127000,142128000,-0.184212,0.984,0.352085,0.576424,False
446803,chr2,212699000,212700000,0.096395,0.912,0.627981,0.314685,False


In [9]:
def get_lambda_s_truth_set():
    """
    Build the lambda_s truth set, write it to disk, and return it.

    The table is the constrained enhancers followed by an equally sized
    sample of windows that don't overlap enhancers; it is saved tab-separated
    (index omitted) as stringent_truth_sets/lambda_s-truth-set.bed.
    """
    constrained = get_lambda_s_for_essential_enhancers()
    n_constrained = len(constrained)
    unconstrained = get_lambda_s_for_unconstrained_windows(sample_size=n_constrained)

    combined = pd.concat([constrained, unconstrained])
    combined.to_csv(
        f'{CONSTRAINT_TOOLS_DATA}/stringent_truth_sets/lambda_s-truth-set.bed',
        sep='\t',
        index=False,
    )
    return combined

get_lambda_s_truth_set()




[36mWrote /scratch/ucgd/lustre-labs/quinlan/data-shared/constraint-tools/labeled-enhancers/labeled-enhancers.BGS.gBGC.GC_content.intersect.lambda_s.bed[0m


Unnamed: 0,chromosome,start,end,truly constrained,B,B_M1star.EUR,GC_content_1000bp,lambda_s
0,chr1,2128961,2129161,True,0.841,0.347981,0.585415,0.117883
1,chr1,2268561,2268761,True,0.847,0.347981,0.602398,0.115906
2,chr1,6240740,6241540,True,0.872,0.014875,0.548452,0.090827
3,chr1,6483340,6483540,True,0.837,0.014875,0.572428,0.088351
4,chr1,6697340,6697540,True,0.708,0.014875,0.461538,-0.008011
...,...,...,...,...,...,...,...,...
413248,chr2,133387000,133388000,False,0.835,0.281558,0.397602,-0.302237
434616,chr2,182977000,182978000,False,0.664,0.273451,0.383616,0.016217
215209,chr14,28374000,28375000,False,0.838,0.208852,0.323676,0.080208
151374,chr12,67961000,67962000,False,0.925,0.399876,0.369630,-0.026623


## Depletion Rank truth set

In [10]:
def create_bed_file_of_depletion_rank_scores():
    """
    Reformat the depletion-rank table and write it as a tab-separated file.

    Adds `depletion_rank_constraint_score_complement` (= 1 - rank), renames the
    coordinate columns to chromosome/start/end, and drops the original `rank`
    column. The result is written to
    depletion_rank_scores/41586_2022_4965_MOESM3_ESM.reformatted.bed and returned
    as a pandas.DataFrame.
    """
    source_path = f'{CONSTRAINT_TOOLS_DATA}/depletion_rank_scores/41586_2022_4965_MOESM3_ESM'
    # for quick testing, an `n_rows=1000000` argument can be added to this read
    raw_scores = pl.read_csv(source_path, separator='\t')

    complement = (1 - pl.col('rank')).alias('depletion_rank_constraint_score_complement')
    scores = (
        raw_scores
        .with_columns(complement)
        .to_pandas()
        .rename(columns={
            'Chr': 'chromosome',
            'Fromx': 'start',
            'To': 'end',
        })
        .drop(columns=['rank'])
    )

    scores.to_csv(
        f'{CONSTRAINT_TOOLS_DATA}/depletion_rank_scores/41586_2022_4965_MOESM3_ESM.reformatted.bed',
        sep='\t',
        index=False,
    )
    return scores

create_bed_file_of_depletion_rank_scores()

Unnamed: 0,chromosome,start,end,depletion_rank_constraint_score_complement
0,chr1,777500,778000,0.401760
1,chr1,777550,778050,0.350457
2,chr1,777600,778100,0.317118
3,chr1,777650,778150,0.427053
4,chr1,777700,778200,0.497556
...,...,...,...,...
49104021,chr9,138171800,138172300,0.690354
49104022,chr9,138171850,138172350,0.491224
49104023,chr9,138171900,138172400,0.327472
49104024,chr9,138171950,138172450,0.525170


In [11]:
def get_depletion_rank_for_essential_enhancers():
    """
    Return the 'truly constrained' labeled enhancers with a depletion-rank-based score.

    Each enhancer is intersected with the reformatted depletion-rank windows.
    When several windows match an enhancer, the largest
    `depletion_rank_constraint_score_complement` among them is kept. (The
    original author expected this to correspond to the window that maximally
    overlaps the enhancer; the code itself selects by score, not by overlap.)

    Returns:
        pandas.DataFrame with columns chromosome, start, end, truly constrained,
        B, B_M1star.EUR, GC_content_1000bp, depletion_rank_constraint_score_complement.
    """
    enhancers_with_scores = intersect_and_aggregate(
        a_filename_stem=f'{CONSTRAINT_TOOLS_DATA}/labeled-enhancers/labeled-enhancers.BGS.gBGC.GC_content',
        b_filename=f'{CONSTRAINT_TOOLS_DATA}/depletion_rank_scores/41586_2022_4965_MOESM3_ESM.reformatted.bed',
        b_class='depletion_rank',
        b_features=['depletion_rank_constraint_score_complement'],
        b_class_aggregation_functions=[pl.col('depletion_rank_constraint_score_complement').max()],
    )

    output_columns = [
        'enhancer_chrom', 'enhancer_start', 'enhancer_end',
        'truly constrained', 'B', 'B_M1star.EUR', 'GC_content_1000bp',
        'depletion_rank_constraint_score_complement',
    ]
    constrained = enhancers_with_scores.filter(pl.col('truly constrained')).select(output_columns)
    renamed = constrained.rename({
        'enhancer_chrom': 'chromosome',
        'enhancer_start': 'start',
        'enhancer_end': 'end',
    })
    return renamed.to_pandas()

get_depletion_rank_for_essential_enhancers()




[36mWrote /scratch/ucgd/lustre-labs/quinlan/data-shared/constraint-tools/labeled-enhancers/labeled-enhancers.BGS.gBGC.GC_content.intersect.depletion_rank.bed[0m


Unnamed: 0,chromosome,start,end,truly constrained,B,B_M1star.EUR,GC_content_1000bp,depletion_rank_constraint_score_complement
0,chr1,1554620,1555020,True,0.652,0.108103,0.606394,0.368898
1,chr1,2128961,2129161,True,0.841,0.347981,0.585415,0.899933
2,chr1,2268561,2268761,True,0.847,0.347981,0.602398,0.600846
3,chr1,2545161,2545361,True,0.840,0.347981,0.640360,0.650301
4,chr1,3208836,3209036,True,0.966,0.788536,0.525475,0.982930
...,...,...,...,...,...,...,...,...
3537,chr9,136914548,136914748,True,0.581,0.869820,0.508492,0.427218
3538,chr9,136985948,136986348,True,0.567,0.165515,0.618382,0.887174
3539,chr9,137138948,137139148,True,0.556,0.165515,0.717283,0.999730
3540,chr9,137446948,137447348,True,0.536,0.165515,0.532468,0.285545


In [12]:
def get_depletion_rank_for_unconstrained_windows(sample_size, random_state=None):
    """
    Draw a random sample of windows that do not overlap enhancers, with their depletion-rank-based scores.

    Args:
        sample_size: number of windows to draw.
        random_state: optional seed / RandomState forwarded to `DataFrame.sample`.
            The default (None) keeps the original, unseeded behaviour; pass an
            integer to make the sampled negatives reproducible across runs.

    Returns:
        pandas.DataFrame with columns chromosome, start, end,
        depletion_rank_constraint_score_complement (= 1 - depletion_rank),
        B, B_M1star.EUR, GC_content_1000bp, truly constrained
        where 'truly constrained' is False for every row.

    Raises:
        ValueError: (from pandas) if `sample_size` exceeds the number of eligible windows.
    """
    # per the original author's note, the windows in this file don't overlap exons
    # (for quick testing, an `n_rows=1000000` argument can be added to this read)
    windows = pl.read_csv(
        f'{CONSTRAINT_TOOLS_DATA}/depletion_rank_scores/41586_2022_4965_MOESM3_ESM.noncoding.enhancer.BGS.gBGC.GC_content.bed',
        separator='\t',
        infer_schema_length=1000000,
    ).to_pandas()

    # keep only windows that don't overlap enhancers
    eligible = windows[windows['window overlaps enhancer'] == False]
    sampled = eligible.sample(sample_size, random_state=random_state)

    # the complement is computed after sampling, so only the sampled rows are touched
    sampled = sampled.assign(
        depletion_rank_constraint_score_complement=1 - sampled['depletion_rank'],
    )

    output_columns = [
        'chromosome', 'start', 'end', 'depletion_rank_constraint_score_complement',
        'B', 'B_M1star.EUR', 'GC_content_1000bp',
    ]
    # these windows serve as the negative class of the truth set
    return sampled[output_columns].assign(**{'truly constrained': False})

get_depletion_rank_for_unconstrained_windows(sample_size=100)

Unnamed: 0,chromosome,start,end,depletion_rank_constraint_score_complement,B,B_M1star.EUR,GC_content_1000bp,truly constrained
26948900,chr4,158645500,158646000,0.372838,0.750,0.257686,0.338661,False
1509981,chr1,107107150,107107650,0.036144,0.754,0.154187,0.401598,False
27309201,chr4,179810950,179811450,0.773860,0.924,0.283588,0.291708,False
22071863,chr3,34545400,34545900,0.464828,0.778,0.284311,0.330669,False
6999876,chr12,13932400,13932900,0.516001,0.864,0.396959,0.421578,False
...,...,...,...,...,...,...,...,...
33619925,chr7,71407950,71408450,0.192579,0.879,0.079256,0.474525,False
11215197,chr14,87915200,87915700,0.647143,0.875,0.461816,0.390609,False
24369788,chr3,189417900,189418400,0.218419,0.917,0.299414,0.405594,False
23482988,chr3,132106150,132106650,0.916060,0.938,0.075179,0.333666,False


In [13]:
def get_depletion_rank_truth_set():
    """
    Build the depletion-rank truth set, save it, and return it.

    Concatenates the constrained enhancers with the same number of sampled
    windows that don't overlap enhancers, then writes the table tab-separated
    (no index) to stringent_truth_sets/depletion_rank-truth-set.bed.
    """
    enhancer_rows = get_depletion_rank_for_essential_enhancers()
    window_rows = get_depletion_rank_for_unconstrained_windows(
        sample_size=len(enhancer_rows),  # one sampled window per enhancer
    )
    truth_set = pd.concat([enhancer_rows, window_rows])

    destination = f'{CONSTRAINT_TOOLS_DATA}/stringent_truth_sets/depletion_rank-truth-set.bed'
    truth_set.to_csv(destination, sep='\t', index=False)
    return truth_set

get_depletion_rank_truth_set()




[36mWrote /scratch/ucgd/lustre-labs/quinlan/data-shared/constraint-tools/labeled-enhancers/labeled-enhancers.BGS.gBGC.GC_content.intersect.depletion_rank.bed[0m


Unnamed: 0,chromosome,start,end,truly constrained,B,B_M1star.EUR,GC_content_1000bp,depletion_rank_constraint_score_complement
0,chr1,1554620,1555020,True,0.652,0.108103,0.606394,0.368898
1,chr1,2128961,2129161,True,0.841,0.347981,0.585415,0.899933
2,chr1,2268561,2268761,True,0.847,0.347981,0.602398,0.600846
3,chr1,2545161,2545361,True,0.840,0.347981,0.640360,0.650301
4,chr1,3208836,3209036,True,0.966,0.788536,0.525475,0.982930
...,...,...,...,...,...,...,...,...
2234645,chr1,187935800,187936300,False,0.944,0.167226,0.381618,0.382341
25573299,chr4,72266800,72267300,False,0.904,0.121811,0.328671,0.106622
34891954,chr7,158780450,158780950,False,0.617,0.109041,0.436563,0.036848
34809165,chr7,153212750,153213250,False,0.947,0.173963,0.359640,0.128205


## CDTS truth set

In [14]:
def create_bed_file_of_CDTS_scores():
    """
    Reformat the CDTS table into a four-column, tab-separated file and return it.

    Adds `percentile_rank_of_observed_minus_expected_complement`
    (= 100 - percentile_rank_of_observed_minus_expected) and keeps only the
    coordinates plus that column. The result is written to
    CDTS/CDTS.gnomAD.hg38.noncoding.reformatted.bed and returned as a pandas.DataFrame.
    """
    # per the original author's note, this input holds the correct window coordinates
    # (for quick testing, an `n_rows=10000000` argument can be added to this read)
    cdts_windows = pl.read_csv(
        f'{CONSTRAINT_TOOLS_DATA}/CDTS/CDTS.gnomAD.hg38.noncoding.enhancer.bed',
        separator='\t',
    )

    complement = (
        100 - pl.col('percentile_rank_of_observed_minus_expected')
    ).alias('percentile_rank_of_observed_minus_expected_complement')

    scores = cdts_windows.with_columns(complement).to_pandas()
    scores = scores[['chromosome', 'start', 'end', 'percentile_rank_of_observed_minus_expected_complement']]

    scores.to_csv(
        f'{CONSTRAINT_TOOLS_DATA}/CDTS/CDTS.gnomAD.hg38.noncoding.reformatted.bed',
        sep='\t',
        index=False,
    )
    return scores

create_bed_file_of_CDTS_scores()

Unnamed: 0,chromosome,start,end,percentile_rank_of_observed_minus_expected_complement
0,chr1,47939,48490,98.547897
1,chr1,47949,48500,98.567632
2,chr1,47959,48510,98.567632
3,chr1,47999,48550,98.552431
4,chr1,59349,59900,98.078534
...,...,...,...,...
207098315,chr17,58762584,58763135,54.576893
207098316,chr17,58762594,58763145,54.223632
207098317,chr17,58762604,58763155,54.177060
207098318,chr17,58762614,58763165,53.862348


In [15]:
def get_CDTS_for_essential_enhancers():
    """
    Return the 'truly constrained' labeled enhancers with a CDTS-based score.

    Each enhancer is intersected with the reformatted CDTS windows. When several
    windows match an enhancer, the largest
    `percentile_rank_of_observed_minus_expected_complement` among them is kept.
    (The original author expected this to correspond to the window that
    maximally overlaps the enhancer; the code itself selects by score, not by overlap.)

    Returns:
        pandas.DataFrame with columns chromosome, start, end, truly constrained,
        B, B_M1star.EUR, GC_content_1000bp,
        percentile_rank_of_observed_minus_expected_complement.
    """
    aggregated = intersect_and_aggregate(
        a_filename_stem=f'{CONSTRAINT_TOOLS_DATA}/labeled-enhancers/labeled-enhancers.BGS.gBGC.GC_content',
        b_filename=f'{CONSTRAINT_TOOLS_DATA}/CDTS/CDTS.gnomAD.hg38.noncoding.reformatted.bed',
        b_class='CDTS',
        b_features=['percentile_rank_of_observed_minus_expected_complement'],
        b_class_aggregation_functions=[
            pl.col('percentile_rank_of_observed_minus_expected_complement').max(),
        ],
    )

    is_constrained = pl.col('truly constrained')
    kept_columns = [
        'enhancer_chrom', 'enhancer_start', 'enhancer_end',
        'truly constrained', 'B', 'B_M1star.EUR', 'GC_content_1000bp',
        'percentile_rank_of_observed_minus_expected_complement',
    ]
    coordinate_names = {
        'enhancer_chrom': 'chromosome',
        'enhancer_start': 'start',
        'enhancer_end': 'end',
    }
    return (
        aggregated
        .filter(is_constrained)
        .select(kept_columns)
        .rename(coordinate_names)
        .to_pandas()
    )

get_CDTS_for_essential_enhancers()




[36mWrote /scratch/ucgd/lustre-labs/quinlan/data-shared/constraint-tools/labeled-enhancers/labeled-enhancers.BGS.gBGC.GC_content.intersect.CDTS.bed[0m


Unnamed: 0,chromosome,start,end,truly constrained,B,B_M1star.EUR,GC_content_1000bp,percentile_rank_of_observed_minus_expected_complement
0,chr1,1554620,1555020,True,0.652,0.108103,0.606394,72.416125
1,chr1,2128961,2129161,True,0.841,0.347981,0.585415,53.544438
2,chr1,2268561,2268761,True,0.847,0.347981,0.602398,47.909374
3,chr1,2545161,2545361,True,0.840,0.347981,0.640360,98.359012
4,chr1,3208836,3209036,True,0.966,0.788536,0.525475,97.164713
...,...,...,...,...,...,...,...,...
3123,chr9,136626148,136626748,True,0.893,0.869820,0.657343,93.548655
3124,chr9,136811348,136811548,True,0.592,0.869820,0.489510,96.106831
3125,chr9,136914548,136914748,True,0.581,0.869820,0.508492,88.702648
3126,chr9,137446948,137447348,True,0.536,0.165515,0.532468,99.002440


In [16]:
def get_CDTS_for_unconstrained_windows(sample_size, random_state=None):
    """
    Draw a random sample of windows that do not overlap enhancers, with their CDTS-based scores.

    Args:
        sample_size: number of windows to draw.
        random_state: optional seed / RandomState forwarded to `DataFrame.sample`.
            The default (None) keeps the original, unseeded behaviour; pass an
            integer to make the sampled negatives reproducible across runs.

    Returns:
        pandas.DataFrame with columns chromosome, start, end,
        percentile_rank_of_observed_minus_expected_complement
        (= 100 - percentile_rank_of_observed_minus_expected),
        B, B_M1star.EUR, GC_content_1000bp, truly constrained
        where 'truly constrained' is False for every row.

    Raises:
        ValueError: (from pandas) if `sample_size` exceeds the number of eligible windows.
    """
    # per the original author's note, the windows in this file don't overlap exons
    # (for quick testing, an `n_rows=1000000` argument can be added to this read)
    windows = pl.read_csv(
        f'{CONSTRAINT_TOOLS_DATA}/CDTS/CDTS.gnomAD.hg38.noncoding.enhancer.BGS.gBGC.GC_content.bed',
        separator='\t',
        infer_schema_length=1000000,
    ).to_pandas()

    # keep only windows that don't overlap enhancers
    eligible = windows[windows['window overlaps enhancer'] == False]
    sampled = eligible.sample(sample_size, random_state=random_state)

    # the complement is computed after sampling, so only the sampled rows are touched
    sampled = sampled.assign(
        percentile_rank_of_observed_minus_expected_complement=(
            100 - sampled['percentile_rank_of_observed_minus_expected']
        ),
    )

    output_columns = [
        'chromosome', 'start', 'end', 'percentile_rank_of_observed_minus_expected_complement',
        'B', 'B_M1star.EUR', 'GC_content_1000bp',
    ]
    # these windows serve as the negative class of the truth set
    return sampled[output_columns].assign(**{'truly constrained': False})

get_CDTS_for_unconstrained_windows(sample_size=100)

Unnamed: 0,chromosome,start,end,percentile_rank_of_observed_minus_expected_complement,B,B_M1star.EUR,GC_content_1000bp,truly constrained
144045911,chr5,90758202,90758753,71.271876,0.622,-0.177061,0.373626,False
18485767,chr10,37353971,37354522,18.566458,0.683,0.137428,0.430569,False
132857843,chr4,132264724,132265275,70.927498,0.835,0.044665,0.267732,False
53407259,chr14,52980421,52980972,54.263480,0.855,-0.025658,0.374625,False
175986390,chr8,16993790,16994341,11.317714,0.924,0.449785,0.390609,False
...,...,...,...,...,...,...,...,...
59677425,chr15,58385050,58385601,94.674166,0.961,0.964553,0.376623,False
176474412,chr8,23697506,23698057,6.595606,0.954,0.406638,0.411588,False
42309113,chr12,116488704,116489255,71.513260,0.891,0.026776,0.433566,False
68770324,chr17,16782065,16782616,91.076902,0.875,0.473468,0.385614,False


In [17]:
def get_CDTS_truth_set():
    """
    Build the CDTS truth set, save it, and return it.

    The constrained enhancers are followed by an equal number of sampled
    windows that don't overlap enhancers; the combined table is written
    tab-separated (no index) to stringent_truth_sets/CDTS-truth-set.bed.
    """
    labeled_positive = get_CDTS_for_essential_enhancers()
    labeled_negative = get_CDTS_for_unconstrained_windows(sample_size=len(labeled_positive))

    frames = [labeled_positive, labeled_negative]
    truth_set = pd.concat(frames)
    truth_set.to_csv(f'{CONSTRAINT_TOOLS_DATA}/stringent_truth_sets/CDTS-truth-set.bed', sep='\t', index=False)
    return truth_set

get_CDTS_truth_set()




[36mWrote /scratch/ucgd/lustre-labs/quinlan/data-shared/constraint-tools/labeled-enhancers/labeled-enhancers.BGS.gBGC.GC_content.intersect.CDTS.bed[0m


Unnamed: 0,chromosome,start,end,truly constrained,B,B_M1star.EUR,GC_content_1000bp,percentile_rank_of_observed_minus_expected_complement
0,chr1,1554620,1555020,True,0.652,0.108103,0.606394,72.416125
1,chr1,2128961,2129161,True,0.841,0.347981,0.585415,53.544438
2,chr1,2268561,2268761,True,0.847,0.347981,0.602398,47.909374
3,chr1,2545161,2545361,True,0.840,0.347981,0.640360,98.359012
4,chr1,3208836,3209036,True,0.966,0.788536,0.525475,97.164713
...,...,...,...,...,...,...,...,...
31475557,chr11,98922329,98922880,False,0.944,0.246060,0.325674,34.542219
65599754,chr16,60926505,60927056,False,0.783,0.517240,0.463536,8.388564
57023100,chr14,101636322,101636873,False,0.857,0.200026,0.505495,70.191058
86918155,chr2,71485009,71485560,False,0.956,0.522355,0.476523,39.155480
