## Preliminary Workflow 



In [1]:
# 1. intersect CDTS windows with enhancers and exons: 
#    papers/neutral_models_are_biased/7.CDTS/add-overlapAmounts.sh
# 2. find noncoding CDTS windows and determine whether they significantly overlap enhancers or not: 
#    papers/neutral_models_are_biased/7.CDTS/assign_enhancer_and_exon_status.ipynb
# 3. compute GC-content for CDTS windows: 
#    papers/neutral_models_are_biased/7.CDTS/compute-GC-content-for-all-window-sizes-based-on-CDTS-windows.sh

# Assign BGS, gBGC, GC_content to CDTS windows 

In [1]:
import sys

# Directory whose `utilities/` subfolder holds the helper modules imported below.
CONSTRAINT_TOOLS = '/scratch/ucgd/lustre-labs/quinlan/u6018199/constraint-tools'
# Base directory for the data files this notebook reads and writes.
CONSTRAINT_TOOLS_DATA = '/scratch/ucgd/lustre-labs/quinlan/data-shared/constraint-tools'

# Population label interpolated into the gBGC file name and column name.
POP = 'EUR'
# GC-content window size (presumably in bp: the GC column is named GC_content_<size>bp).
GC_WINDOW_SIZE = 1000

# Make the helper modules (shell, colorize, aggregate) importable.
sys.path.append(CONSTRAINT_TOOLS + '/utilities')

In [2]:
import polars as pl 

from shell import shell 
from colorize import print_string_as_info
from aggregate import aggregate_polars

def get_header(filename):
  """Return the column names found on the first line of a tab-separated file.

  The first line is stripped of surrounding whitespace and split on tabs.
  An empty file yields [''].
  """
  with open(filename) as handle:
    first_line = next(handle, '')
  return first_line.strip().split('\t')

def intersect(a_filename, b_filename, intersect_filename, b_class): 
  """Intersect two header-bearing BED-like files with bedtools and return their headers.

  Runs `bedtools intersect -wao -f 0.5` on the two files with their header lines
  removed (`tail -n +2`), writing the header-less result to `intersect_filename`.

  Args:
    a_filename: path of the "A" file; its first line is a tab-separated header.
    b_filename: path of the "B" file; its first line is a tab-separated header.
    intersect_filename: path the bedtools output is redirected to.
    b_class: label prepended (as `{b_class}_`) to the first three B column names
      so they do not collide with the A coordinate columns.

  Returns:
    (a_file_header, b_file_header): lists of column names for the A part and the
    B part of each output record.
  """
  # Local import so this cell does not depend on the notebook's import cell.
  import shlex

  # Quote the paths so that whitespace or shell metacharacters cannot break (or
  # inject into) the command; paths made of ordinary characters are left as-is.
  a_path = shlex.quote(str(a_filename))
  b_path = shlex.quote(str(b_filename))
  out_path = shlex.quote(str(intersect_filename))

  # NOTE(review): `<(...)` is process substitution, which needs bash — this relies
  # on the `shell` helper running the command under bash; confirm.
  cmd = (
    f'bedtools intersect'
    f" -a <(tail -n +2 {a_path})" # contains header
    f" -b <(tail -n +2 {b_path})" # contains header
    f' -wao -f 0.5' 
    f' > {out_path}'
  )
  shell(cmd) 
  print_string_as_info(f'Wrote {intersect_filename}')

  a_file_header = get_header(a_filename) # contains header
  b_file_header = get_header(b_filename) # contains header
  # only the three leading (coordinate) columns of B are renamed; feature columns keep their names
  b_file_header = [f'{b_class}_{field}' for field in b_file_header[:3]] + b_file_header[3:]

  return a_file_header, b_file_header

def make_scores_numeric(df: pl.DataFrame, b_class: str, b_features: list) -> pl.DataFrame:
    """Drop records lacking B-feature values and cast the feature columns to Float64.

    Args:
        df: intersect output; must contain a `{b_class}_chromosome` column and
            every column named in `b_features`.
        b_class: label used as prefix of the B coordinate columns.
        b_features: names of the B score columns to convert to floats.

    Returns:
        The filtered frame, with each column in `b_features` cast to Float64.
    """
    # The '.' placeholder is compared against a *string view* of each column:
    # if a file happens to contain no '.' at all, polars infers the column as
    # numeric, and comparing a numeric column with '.' directly raises an error.

    # we don't want to include windows that don't overlap any windows with b_feature values
    df = df.filter(pl.col(f'{b_class}_chromosome').cast(pl.Utf8) != '.')

    # having removed records where 'b_class_chromosome' is '.', 
    # we can now convert the 'b_features' columns to floats
    for b_feature in b_features:
        # gBGC values are floats, but some are missing
        df = df.filter(pl.col(b_feature).cast(pl.Utf8) != '.')
        df = df.with_columns(pl.col(b_feature).cast(pl.Float64))

    return df

def intersect_and_aggregate(a_filename_stem, b_filename, b_class, b_features, b_class_aggregation_functions): 
  """Annotate each A window with B-feature scores aggregated over the B windows it overlaps.

  Reads `{a_filename_stem}.bed`, intersects it with `b_filename`, converts the
  B score columns to floats, aggregates them per A window, and writes the result
  to `{a_filename_stem}.{b_class}.bed` (tab-separated, with header).

  Args:
    a_filename_stem: path of the A file without its `.bed` extension.
    b_filename: path of the B file (with header line).
    b_class: label for the B data set; used in file names and column names.
    b_features: names of the B score columns to make numeric.
    b_class_aggregation_functions: polars expressions passed to `aggregate_polars`.

  Returns:
    The aggregated DataFrame that was written to disk.
  """
  a_filename = f'{a_filename_stem}.bed'
  intersect_filename = f'{a_filename_stem}.intersect.{b_class}.bed'
  output_filename = f'{a_filename_stem}.{b_class}.bed'

  a_file_header, b_file_header = intersect(a_filename, b_filename, intersect_filename, b_class)

  df = pl.read_csv(
    intersect_filename, 
    separator='\t',
    # the intersect file has NO header line (headers were stripped before running
    # bedtools); without has_header=False the first record would be consumed as
    # the header and silently dropped from the analysis
    has_header=False,
    new_columns=a_file_header + b_file_header + [f'CDTSWindow_{b_class}Window_overlap'],
    infer_schema_length=1000000
  )

  df = make_scores_numeric(df, b_class, b_features) 

  # some CDTS-windows may intersect multiple b_feature-windows, 
  # so let's group by CDTS-window, and aggregate scores over all b_feature-windows in the group
  df = aggregate_polars(
    df, 
    group_columns = a_file_header,
    aggregation_functions = b_class_aggregation_functions
  )

  df.write_csv(
    output_filename,
    separator='\t',
  )

  print_string_as_info(f'Wrote {output_filename}')

  return df 

In [3]:
# # assign BGS values to CDTS windows 
# intersect_and_aggregate(
#     a_filename_stem = f'{CONSTRAINT_TOOLS_DATA}/CDTS/CDTS.gnomAD.hg38.noncoding.enhancer', 
#     b_filename = f'{CONSTRAINT_TOOLS_DATA}/background-selection/CADD-B-map/bmap.hg38.header.bed', 
#     b_class = 'BGS',
#     b_features = ['B'], 
#     b_class_aggregation_functions = [pl.col('B').min()] 
# )   




[36mWrote /scratch/ucgd/lustre-labs/quinlan/data-shared/constraint-tools/CDTS/CDTS.gnomAD.hg38.noncoding.enhancer.intersect.BGS.bed[0m
[36mWrote /scratch/ucgd/lustre-labs/quinlan/data-shared/constraint-tools/CDTS/CDTS.gnomAD.hg38.noncoding.enhancer.BGS.bed[0m


chromosome,start,end,observed_counts,expected_counts,observed_minus_expected,percentile_rank_of_observed_minus_expected,enhancer_overlap,window overlaps enhancer,B
str,i64,i64,i64,f64,f64,f64,i64,bool,f64
"""chr1""",1382849,1383400,10,12.513945,-2.51394,27.27855,551,true,0.653
"""chr1""",1382859,1383410,10,12.150556,-2.15056,30.844058,551,true,0.653
"""chr1""",1382869,1383420,10,12.163989,-2.16399,30.708951,551,true,0.653
"""chr1""",1382879,1383430,10,12.142615,-2.14261,30.923845,551,true,0.653
"""chr1""",1382889,1383440,9,12.162707,-3.16271,21.38556,551,true,0.653
…,…,…,…,…,…,…,…,…,…
"""chr17""",58762584,58763135,16,16.767538,-0.767538,45.423107,,false,0.536
"""chr17""",58762594,58763145,16,16.734729,-0.734729,45.776368,,false,0.536
"""chr17""",58762604,58763155,16,16.730413,-0.730413,45.82294,,false,0.536
"""chr17""",58762614,58763165,16,16.701082,-0.701082,46.137652,,false,0.536


In [4]:
# # assign gBGC coefficients to CDTS windows 
# intersect_and_aggregate(
#     a_filename_stem = f'{CONSTRAINT_TOOLS_DATA}/CDTS/CDTS.gnomAD.hg38.noncoding.enhancer.BGS', 
#     b_filename = f'{CONSTRAINT_TOOLS_DATA}/GC-biased-gene-conversion/gBGC-coefficient.hg38.{POP}.header.bed', 
#     b_class = 'gBGC',
#     b_features = [f'B_M1star.{POP}'], 
#     b_class_aggregation_functions = [pl.col(f'B_M1star.{POP}').mean()]
# )   




[36mWrote /scratch/ucgd/lustre-labs/quinlan/data-shared/constraint-tools/CDTS/CDTS.gnomAD.hg38.noncoding.enhancer.BGS.intersect.gBGC.bed[0m
[36mWrote /scratch/ucgd/lustre-labs/quinlan/data-shared/constraint-tools/CDTS/CDTS.gnomAD.hg38.noncoding.enhancer.BGS.gBGC.bed[0m


chromosome,start,end,observed_counts,expected_counts,observed_minus_expected,percentile_rank_of_observed_minus_expected,enhancer_overlap,window overlaps enhancer,B,B_M1star.EUR
str,i64,i64,i64,f64,f64,f64,i64,bool,f64,f64
"""chr1""",1382859,1383410,10,12.150556,-2.15056,30.844058,551,true,0.653,0.108103
"""chr1""",1382869,1383420,10,12.163989,-2.16399,30.708951,551,true,0.653,0.108103
"""chr1""",1382879,1383430,10,12.142615,-2.14261,30.923845,551,true,0.653,0.108103
"""chr1""",1382889,1383440,9,12.162707,-3.16271,21.38556,551,true,0.653,0.108103
"""chr1""",1382899,1383450,9,11.994043,-2.99404,22.8547,551,true,0.653,0.108103
…,…,…,…,…,…,…,…,…,…,…
"""chr17""",58762584,58763135,16,16.767538,-0.767538,45.423107,,false,0.536,0.226818
"""chr17""",58762594,58763145,16,16.734729,-0.734729,45.776368,,false,0.536,0.226818
"""chr17""",58762604,58763155,16,16.730413,-0.730413,45.82294,,false,0.536,0.226818
"""chr17""",58762614,58763165,16,16.701082,-0.701082,46.137652,,false,0.536,0.226818


In [6]:
def read(gc_window_size): 
    """Load the GC-content table for one window size, keyed by CDTS window.

    Returns a DataFrame holding the three CDTS window coordinate columns and the
    GC content, the latter renamed to `GC_content_{gc_window_size}bp`.
    """
    gc_column = f'GC_content_{gc_window_size}bp'
    path = f'{CONSTRAINT_TOOLS_DATA}/CDTS/CDTS.gnomAD.hg38.noncoding.GC_content_{gc_window_size}.bed'

    gc_table = pl.read_csv(path, separator='\t', infer_schema_length=1000000)

    # keep only the join keys and the GC value
    wanted = [
        "CDTS_window_chrom",
        "CDTS_window_start",
        "CDTS_window_end", 
        "GC_window__GC_content",
    ]
    return gc_table.select(wanted).rename({"GC_window__GC_content": gc_column})

def assign_GC_to_CDTS_windows(gc_window_sizes=None):
    """Add GC-content columns to the BGS/gBGC-annotated CDTS windows and write the result.

    Args:
        gc_window_sizes: iterable of GC window sizes; one `GC_content_{size}bp`
            column is added per size. Defaults to `[GC_WINDOW_SIZE]`, the
            notebook's configured window size.

    Returns:
        The annotated DataFrame that was written to disk.
    """
    if gc_window_sizes is None:
        # use the config-cell constant rather than a duplicated literal
        gc_window_sizes = [GC_WINDOW_SIZE]

    df = pl.read_csv(
        f'{CONSTRAINT_TOOLS_DATA}/CDTS/CDTS.gnomAD.hg38.noncoding.enhancer.BGS.gBGC.bed', 
        separator='\t',
        infer_schema_length=1000000
    )

    for gc_window_size in gc_window_sizes: 
        # inner join: CDTS windows without a GC-content record are dropped
        df = df.join(
            read(gc_window_size), 
            left_on=['chromosome', 'start', 'end'],
            right_on=["CDTS_window_chrom", "CDTS_window_start", "CDTS_window_end"], 
            how='inner'
        )

    # keep a single record per CDTS window (the first one encountered)
    df = df.unique(subset=['chromosome', 'start', 'end'], keep='first', maintain_order=True)

    output_filename = f'{CONSTRAINT_TOOLS_DATA}/CDTS/CDTS.gnomAD.hg38.noncoding.enhancer.BGS.gBGC.GC_content.bed'
    df.write_csv(output_filename, separator='\t')
    print_string_as_info(f'Wrote {output_filename}')
    
    return df 

# Run the GC-content assignment; the returned DataFrame is the cell's last
# expression, so it is displayed below.
assign_GC_to_CDTS_windows()

[36mWrote /scratch/ucgd/lustre-labs/quinlan/data-shared/constraint-tools/CDTS/CDTS.gnomAD.hg38.noncoding.enhancer.BGS.gBGC.GC_content.bed[0m


chromosome,start,end,observed_counts,expected_counts,observed_minus_expected,percentile_rank_of_observed_minus_expected,enhancer_overlap,window overlaps enhancer,B,B_M1star.EUR,GC_content_1000bp
str,i64,i64,i64,f64,f64,f64,i64,bool,f64,f64,f64
"""chr1""",1382859,1383410,10,12.150556,-2.15056,30.844058,551,true,0.653,0.108103,0.499501
"""chr1""",1382869,1383420,10,12.163989,-2.16399,30.708951,551,true,0.653,0.108103,0.498502
"""chr1""",1382879,1383430,10,12.142615,-2.14261,30.923845,551,true,0.653,0.108103,0.502497
"""chr1""",1382889,1383440,9,12.162707,-3.16271,21.38556,551,true,0.653,0.108103,0.504496
"""chr1""",1382899,1383450,9,11.994043,-2.99404,22.8547,551,true,0.653,0.108103,0.504496
…,…,…,…,…,…,…,…,…,…,…,…
"""chr17""",58762584,58763135,16,16.767538,-0.767538,45.423107,,false,0.536,0.226818,0.421578
"""chr17""",58762594,58763145,16,16.734729,-0.734729,45.776368,,false,0.536,0.226818,0.420579
"""chr17""",58762604,58763155,16,16.730413,-0.730413,45.82294,,false,0.536,0.226818,0.417582
"""chr17""",58762614,58763165,16,16.701082,-0.701082,46.137652,,false,0.536,0.226818,0.414585
