## Preliminary Workflow 



In [1]:
# 1. intersect Halldorsson windows with enhancers and exons: 
#    papers/neutral_models_are_biased/6.Halldorsson/add-overlapAmounts.sh
# 2. find noncoding Halldorsson windows and determine whether they significantly overlap enhancers or not: 
#    papers/neutral_models_are_biased/6.Halldorsson/assign_enhancer_and_exon_status.ipynb
# 3. compute GC-content for Halldorsson windows: 
#    papers/neutral_models_are_biased/6.Halldorsson/compute-GC-content-for-all-window-sizes-based-on-Halldorsson-windows.sh

# Assign BGS, gBGC, GC_content to Halldorsson windows 

In [2]:
CONSTRAINT_TOOLS = '/scratch/ucgd/lustre-labs/quinlan/u6018199/constraint-tools'
CONSTRAINT_TOOLS_DATA = '/scratch/ucgd/lustre-labs/quinlan/data-shared/constraint-tools'

import sys
sys.path.append(f'{CONSTRAINT_TOOLS}/utilities')

POP = 'EUR'
GC_WINDOW_SIZE = 1000

In [3]:
import polars as pl 

from shell import shell 
from colorize import print_string_as_info
from aggregate import aggregate_polars

def get_header(filename):
  with open(filename) as fh: 
    header = fh.readline().strip().split('\t')
  return header

def intersect(a_filename, b_filename, intersect_filename, b_class): 
  cmd = (
    f'bedtools intersect'
    f" -a <(tail -n +2 {a_filename})" # contains header
    f" -b <(tail -n +2 {b_filename})" # contains header
    f' -wao -f 0.5' 
    f' > {intersect_filename}'
  )
  shell(cmd) 
  print_string_as_info(f'Wrote {intersect_filename}')

  a_file_header = get_header(a_filename) # contains header
  b_file_header = get_header(b_filename) # contains header
  b_file_header = [f'{b_class}_{field}' for field in b_file_header[:3]] + b_file_header[3:]

  return a_file_header, b_file_header

def make_scores_numeric(df: pl.DataFrame, b_class: str, b_features: list) -> pl.DataFrame:
    # we don't want to include windows that don't overlap any windows with b_feature values
    df = df.filter(pl.col(f'{b_class}_chromosome') != '.')

    # having removed records where 'b_class_chromosome' is '.', 
    # we can now convert the 'b_features' columns to floats
    for b_feature in b_features:
        df = df.filter(pl.col(b_feature) != '.') # gBGC values are floats, but some are missing
        df = df.with_columns(pl.col(b_feature).cast(pl.Float64))

    return df

def intersect_and_aggregate(a_filename_stem, b_filename, b_class, b_features, b_class_aggregation_functions): 
  a_filename = f'{a_filename_stem}.bed'
  intersect_filename = f'{a_filename_stem}.intersect.{b_class}.bed'

  a_file_header, b_file_header = intersect(a_filename, b_filename, intersect_filename, b_class)

  df = pl.read_csv(
    intersect_filename, 
    separator='\t',
    new_columns=a_file_header + b_file_header + [f'halldorssonWindow_{b_class}Window_overlap'],
    infer_schema_length=1000000
  )

  df = make_scores_numeric(df, b_class, b_features) 

  # some Halldorsson-windows may intersect multiple b_feature-windows, 
  # so let's group by Halldorsson-window, and aggregate scores over all b_feature-windows in the group
  df = aggregate_polars(
    df, 
    group_columns = a_file_header,
    aggregation_functions = b_class_aggregation_functions
  )

  df.write_csv(
    f'{a_filename_stem}.{b_class}.bed',
    separator='\t',
  )

  print_string_as_info(f'Wrote {a_filename_stem}.{b_class}.bed')

  return df 

In [4]:
# assign BGS values to Halldorsson windows 
intersect_and_aggregate(
    a_filename_stem = f'{CONSTRAINT_TOOLS_DATA}/depletion_rank_scores/41586_2022_4965_MOESM3_ESM.noncoding.enhancer', 
    b_filename = f'{CONSTRAINT_TOOLS_DATA}/background-selection/CADD-B-map/bmap.hg38.header.bed', 
    b_class = 'BGS',
    b_features = ['B'], 
    b_class_aggregation_functions = [pl.col('B').min()] 
)   




[36mWrote /scratch/ucgd/lustre-labs/quinlan/data-shared/constraint-tools/depletion_rank_scores/41586_2022_4965_MOESM3_ESM.noncoding.enhancer.intersect.BGS.bed[0m
[36mWrote /scratch/ucgd/lustre-labs/quinlan/data-shared/constraint-tools/depletion_rank_scores/41586_2022_4965_MOESM3_ESM.noncoding.enhancer.BGS.bed[0m


chromosome,start,end,depletion_rank,enhancer_overlap,window overlaps enhancer,B
str,i64,i64,f64,i64,bool,f64
"""chr1""",1382900,1383400,0.673671,500,true,0.653
"""chr1""",1382950,1383450,0.637265,500,true,0.653
"""chr1""",1383000,1383500,0.696719,500,true,0.653
"""chr1""",1383050,1383550,0.856514,500,true,0.653
"""chr1""",1383100,1383600,0.848593,499,true,0.653
…,…,…,…,…,…,…
"""chr9""",138171800,138172300,0.309646,,false,0.536
"""chr9""",138171850,138172350,0.508776,,false,0.536
"""chr9""",138171900,138172400,0.672528,,false,0.536
"""chr9""",138171950,138172450,0.47483,,false,0.536


In [5]:
# assign gBGC coefficients to Halldorsson windows 
intersect_and_aggregate(
    a_filename_stem = f'{CONSTRAINT_TOOLS_DATA}/depletion_rank_scores/41586_2022_4965_MOESM3_ESM.noncoding.enhancer.BGS', 
    b_filename = f'{CONSTRAINT_TOOLS_DATA}/GC-biased-gene-conversion/gBGC-coefficient.hg38.{POP}.header.bed', 
    b_class = 'gBGC',
    b_features = [f'B_M1star.{POP}'], 
    b_class_aggregation_functions = [pl.col(f'B_M1star.{POP}').mean()]
)   




[36mWrote /scratch/ucgd/lustre-labs/quinlan/data-shared/constraint-tools/depletion_rank_scores/41586_2022_4965_MOESM3_ESM.noncoding.enhancer.BGS.intersect.gBGC.bed[0m
[36mWrote /scratch/ucgd/lustre-labs/quinlan/data-shared/constraint-tools/depletion_rank_scores/41586_2022_4965_MOESM3_ESM.noncoding.enhancer.BGS.gBGC.bed[0m


chromosome,start,end,depletion_rank,enhancer_overlap,window overlaps enhancer,B,B_M1star.EUR
str,i64,i64,f64,i64,bool,f64,f64
"""chr1""",1382950,1383450,0.637265,500,true,0.653,0.108103
"""chr1""",1383000,1383500,0.696719,500,true,0.653,0.108103
"""chr1""",1383050,1383550,0.856514,500,true,0.653,0.108103
"""chr1""",1383100,1383600,0.848593,499,true,0.653,0.108103
"""chr1""",1383150,1383650,0.94027,449,true,0.653,0.108103
…,…,…,…,…,…,…,…
"""chr9""",137985250,137985750,0.16218,,false,0.536,0.165515
"""chr9""",137985300,137985800,0.103548,,false,0.536,0.165515
"""chr9""",137985350,137985850,0.137103,,false,0.536,0.165515
"""chr9""",137985400,137985900,0.192227,,false,0.536,0.165515


In [6]:
def read(gc_window_size): 
    df = pl.read_csv(
        f'{CONSTRAINT_TOOLS_DATA}/depletion_rank_scores/41586_2022_4965_MOESM3_ESM.GC_content_{gc_window_size}.bed',
        separator='\t',
        infer_schema_length=1000000
    )
    df = df.select(pl.col(
        "halldorsson_chrom",
        "halldorsson_start",
        "halldorson_end", # typo 
        "halldorsson_score",
        "window_GC_content"
    ))
    df = df.rename({"window_GC_content": f'GC_content_{gc_window_size}bp'})
    return df

def assign_GC_to_halldorsson_windows():
    df = pl.read_csv(
        f'{CONSTRAINT_TOOLS_DATA}/depletion_rank_scores/41586_2022_4965_MOESM3_ESM.noncoding.enhancer.BGS.gBGC.bed', 
        separator='\t',
        infer_schema_length=1000000
    )

    for df_gc in [read(gc_window_size) for gc_window_size in [1000, 10000, 100000]]: 
        df = df.join(
            df_gc, 
            left_on=['chromosome', 'start', 'end', 'depletion_rank'],
            right_on=["halldorsson_chrom", "halldorsson_start", "halldorson_end", "halldorsson_score"], 
            how='inner'
        )

    output_filename = f'{CONSTRAINT_TOOLS_DATA}/depletion_rank_scores/41586_2022_4965_MOESM3_ESM.noncoding.enhancer.BGS.gBGC.GC_content.bed'
    df.write_csv(output_filename, separator='\t')
    print_string_as_info(f'Wrote {output_filename}')
    
    return df 

assign_GC_to_halldorsson_windows()

[36mWrote /scratch/ucgd/lustre-labs/quinlan/data-shared/constraint-tools/depletion_rank_scores/41586_2022_4965_MOESM3_ESM.noncoding.enhancer.BGS.gBGC.GC_content.bed[0m


chromosome,start,end,depletion_rank,enhancer_overlap,window overlaps enhancer,B,B_M1star.EUR,GC_content_1000bp,GC_content_10000bp,GC_content_100000bp
str,i64,i64,f64,i64,bool,f64,f64,f64,f64,f64
"""chr1""",1382950,1383450,0.637265,500,true,0.653,0.108103,0.507492,0.549345,0.580424
"""chr1""",1383000,1383500,0.696719,500,true,0.653,0.108103,0.51049,0.548945,0.580354
"""chr1""",1383050,1383550,0.856514,500,true,0.653,0.108103,0.508492,0.550245,0.580274
"""chr1""",1383100,1383600,0.848593,499,true,0.653,0.108103,0.518481,0.549245,0.580314
"""chr1""",1383150,1383650,0.94027,449,true,0.653,0.108103,0.512488,0.549545,0.580284
…,…,…,…,…,…,…,…,…,…,…
"""chr9""",137985250,137985750,0.16218,,false,0.536,0.165515,0.578422,0.518648,0.489205
"""chr9""",137985300,137985800,0.103548,,false,0.536,0.165515,0.575425,0.519848,0.489175
"""chr9""",137985350,137985850,0.137103,,false,0.536,0.165515,0.578422,0.520048,0.489265
"""chr9""",137985400,137985900,0.192227,,false,0.536,0.165515,0.581419,0.520548,0.489265
