## TODO: grand plan 
 

In [None]:

# 1. Assign GC content, gBGC, BGS, exon status, enhancer status to Halldorsson windows using same infra I used for Chen windows 
# 2. Copy papers/neutral_models_are_biased/2.ipynb to new notebook, ingest Halldorsson dataframe, and compute PR curves (but not auPRC bar charts) for non-exonic Halldorsson windows 
# 3. Include PR curves for Halldorsson windows as supp info, showing that trends are same as when using Chen windows 


## Workflow 



In [1]:
# 1. intersect Halldorsson windows with enhancers and exons: 
#    papers/neutral_models_are_biased/6.Halldorsson/add-overlapAmounts.sh
# 2. find noncoding Halldorsson windows and determine whether they significantly overlap enhancers or not: 
#    papers/neutral_models_are_biased/6.Halldorsson/assign_enhancer_and_exon_status.ipynb
# 3. compute GC-content for Halldorsson windows: 
#    papers/neutral_models_are_biased/6.Halldorsson/compute-GC-content-for-all-window-sizes-based-on-Halldorsson-windows.sh

## TODO 

In [None]:
# TODO: delete 1, 2, 3, 4: 
#   1. assign BGS  experiments/germline-model/chen-et-al-2022/Chen_models_SNV_counts_best_for_most_frequent_windows.1.1.ipynb
#   2. assign gBGC experiments/germline-model/chen-et-al-2022/Chen_models_SNV_counts_best_for_most_frequent_windows.1.2.ipynb
#   3. assign GC content experiments/germline-model/chen-et-al-2022/Chen_models_SNV_counts_best_for_most_frequent_windows.7.ipynb
#   4. also used: papers/neutral_models_are_biased/assign-depletion-rank-scores-to-chen-windows.ipynb



# Assign BGS, gBGC to Halldorsson windows 

In [1]:
CONSTRAINT_TOOLS = '/scratch/ucgd/lustre-labs/quinlan/u6018199/constraint-tools'
CONSTRAINT_TOOLS_DATA = '/scratch/ucgd/lustre-labs/quinlan/data-shared/constraint-tools'

import sys
sys.path.append(f'{CONSTRAINT_TOOLS}/utilities')

POP = 'EUR'
GC_WINDOW_SIZE = 1000

In [None]:
import polars as pl # TODO 
import pandas as pd
import numpy as np 

from shell import shell 
from colorize import print_string_as_info
from aggregate import aggregate

def get_header(filename):
  with open(filename) as fh: 
    header = fh.readline().strip().split('\t')
  return header

def intersect(a_filename, b_filename, intersect_filename, b_class): 
  cmd = (
    f'bedtools intersect'
    f" -a <(tail -n +2 {a_filename})" # contains header
    f" -b <(tail -n +2 {b_filename})" # contains header
    f' -wao -f 0.5 -r' 
    f' > {intersect_filename}'
  )
  shell(cmd)
  print_string_as_info(f'Wrote {intersect_filename}')

  a_file_header = get_header(a_filename) # contains header
  b_file_header = get_header(b_filename) # contains header
  b_file_header = [f'{b_class}_{field}' for field in b_file_header[:3]] + b_file_header[3:]

  return a_file_header, b_file_header

def set_column_dtypes(df, column_dtypes, log=False):
  df = df.replace('.', np.nan)

  for col, dtype in column_dtypes.items():
    df[col] = df[col].astype(dtype)

  if log: print(dict(df.dtypes))
  return df 

def make_scores_numeric(df, b_class, b_features): 
  # we don't want to include windows that don't overlap any windows with b_feature values
  df = df[df[f'{b_class}_chromosome'] != '.'] 

  # having removed records where 'b_class_chromosome' is '.', we can now convert the 'b_features' columns to floats
  column_dtypes = {b_feature: float for b_feature in b_features}
  df = set_column_dtypes(df, column_dtypes)

  return df 
  
def intersect_and_aggregate(a_filename_stem, b_filename, b_class, b_features, b_class_aggregation): 
  a_filename = f'{a_filename_stem}.bed'
  intersect_filename = f'{a_filename_stem}.intersect.{b_class}.bed'

  a_file_header, b_file_header = intersect(a_filename, b_filename, intersect_filename, b_class)

  df = pd.read_csv(
    intersect_filename, 
    sep='\t',
    names=a_file_header + b_file_header + [f'halldorssonWindow_{b_class}Window_overlap'],
  )

  df = make_scores_numeric(df, b_class, b_features)

  # some Halldorsson-windows may intersect multiple b_feature-windows, 
  # so let's group by Halldorsson-window, and aggregate scores over all b_feature-windows in the group
  df = aggregate(
    df, 
    group_columns = a_file_header,
    aggregation_functions = {b_feature: [b_class_aggregation] for b_feature in b_features}
  )
  df = df.rename(columns={f'{b_class_aggregation} {b_feature}': b_feature for b_feature in b_features})

  # TODO: 
  df = df.rename(columns={'rank': b_class})
  
  df.to_csv(
    f'{a_filename_stem}.{b_class}.bed',
    sep='\t',
    index=False,
  )

  print_string_as_info(f'Wrote {a_filename_stem}.{b_class}.bed')

  return df 

# TODO: continue here: 
def assign_BGS_and_gBGC_to_halldorsson_windows():
  # assign BGS values to Halldorsson windows 
  intersect_and_aggregate(
    a_filename_stem = f'{CONSTRAINT_TOOLS_DATA}/depletion_rank_scores/41586_2022_4965_MOESM3_ESM.noncoding.enhancer', 
    b_filename = f'{CONSTRAINT_TOOLS_DATA}/background-selection/CADD-B-map/bmap.hg38.header.bed', 
    b_class = 'XXX',
    b_features = ['YYY'], 
    b_class_aggregation = 'min' 
  )   

  return 

  # assign gBGC coefficients to Halldorsson windows 
  df = intersect_and_aggregate(
    a_filename_stem = f'{CONSTRAINT_TOOLS_DATA}/depletion_rank_scores/41586_2022_4965_MOESM3_ESM.noncoding.enhancer.B', 
    b_filename = f'{CONSTRAINT_TOOLS_DATA}/GC-biased-gene-conversion/gBGC-coefficient.hg38.{POP}.bed', # TODO: does this bed file have a header, as the code assumes? 
    b_class = 'XXX',
    b_features = ['YYY'], 
    b_class_aggregation = 'mean'
  )   

  return df

pd.set_option('display.max_columns', 100)

WINDOWS = assign_BGS_and_gBGC_to_halldorsson_windows()
WINDOWS




[36mWrote /scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM/Supplementary_Data_2.gnocchi.N_expected.N_observed.intersect.B-map.bed[0m
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(dtype)





[36mWrote /scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM/Supplementary_Data_2.gnocchi.N_expected.N_observed.B.intersect.paternal_recombination_rate-map.bed[0m





[36mWrote /scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM/Supplementary_Data_2.gnocchi.N_expected.N_observed.B.paternal_recombination_rate.intersect.maternal_recombination_rate-map.bed[0m





[36mWrote /scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM/Supplementary_Data_2.gnocchi.N_expected.N_observed.B.paternal_recombination_rate.maternal_recombination_rate.gBGC-tract-counts.bed[0m


Unnamed: 0,chrom,start,end,gnocchi,N_expected,N_observed,window overlaps enhancer,window overlaps merged_exon,B,paternal_recombination_rate,maternal_recombination_rate,gBGC-tract_overlap_count,log10_paternal_recombination_rate,log10_maternal_recombination_rate
0,chr1,1432000,1433000,4.299894,338.059552,259,True,False,0.653,0.039046,0.025852,0,-1.408422,-1.587499
4,chr1,1451000,1452000,0.666316,302.590657,291,False,False,0.652,0.039046,0.025852,0,-1.408422,-1.587499
5,chr1,1453000,1454000,0.828398,269.601925,256,False,False,0.651,0.039046,0.025852,0,-1.408422,-1.587499
7,chr1,1458000,1459000,-0.086128,270.583243,272,False,False,0.651,0.039046,0.025852,0,-1.408422,-1.587499
11,chr1,1463000,1464000,2.948188,236.321790,191,True,False,0.651,0.039046,0.025852,0,-1.408422,-1.587499
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1556241,chr9,137262000,137263000,-2.396149,263.131293,302,True,False,0.536,0.057053,0.063434,0,-1.243724,-1.197675
1556242,chr9,137268000,137269000,3.640544,272.046527,212,True,False,0.536,0.057053,0.063434,0,-1.243724,-1.197675
1556243,chr9,137269000,137270000,5.276351,300.458958,209,True,False,0.536,0.057053,0.063434,0,-1.243724,-1.197675
1556244,chr9,137275000,137276000,2.687348,339.517045,290,True,False,0.536,1.564889,0.024254,0,0.194484,-1.615221


# TODO: Assign GC content to Halldorsson windows 

In [None]:
import pandas as pd
from functools import reduce

def get_windows_with_GC_content_and_cpg_islands(): 
  df1 = pd.read_csv(
    f'{CONSTRAINT_TOOLS_DATA}/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM/Supplementary_Data_2.gnocchi.N_expected.N_observed.B.paternal_recombination_rate.maternal_recombination_rate.gBGC-tract-counts.non-exonic.gBGC.bed', 
    sep='\t', 
  )

  df2 = pd.read_csv(
    f'{CONSTRAINT_TOOLS_DATA}/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM/Supplementary_Data_2.GC_content_{GC_WINDOW_SIZE}.bed', 
    sep='\t', 
  )
  df2 = df2[['chen_chrom', 'chen_start', 'chen_end', 'window_GC_content']]
  df2 = df2.rename(columns={
    'chen_chrom': 'chrom', 
    'chen_start': 'start', 
    'chen_end': 'end',
    'window_GC_content': f'GC_content_{GC_WINDOW_SIZE}bp'
  })

  # created using: experiments/germline-model/chen-et-al-2022/cpg-island-enrichment.ipynb
  df3 = pd.read_csv(
    f'{CONSTRAINT_TOOLS_DATA}/benchmark-genome-wide-predictions/chen-et-al-2022/mchale.kmerSizes.trainSets.noisy.enhancer-exon-cpgIsland.bed',
    sep='\t', 
  )
  df3 = df3[['chromosome', 'start', 'end', 'cpg_island overlap', 'window overlaps cpg_island']]
  df3 = df3.rename(columns={
    'chromosome': 'chrom', 
    'cpg_island overlap': 'cpg_island_overlap', 
    'window overlaps cpg_island': 'window_overlaps_cpg_island'
  })

  dfs = [df1, df2, df3]
  df = reduce(lambda left, right: pd.merge(left, right, on=['chrom', 'start', 'end'], how='inner'), dfs)

  df = df[df['window overlaps enhancer'] == False]

  return df

WINDOWS = get_windows_with_GC_content_and_cpg_islands()
WINDOWS