In [None]:
# Base directory of constraint-tools; its utilities/ subdirectory is put on the import path below.
CONSTRAINT_TOOLS = '/scratch/ucgd/lustre-work/quinlan/u6018199/constraint-tools'
# Base directory of the shared data; the input file paths used later in this notebook are built from it.
CONSTRAINT_TOOLS_DATA = '/scratch/ucgd/lustre-labs/quinlan/data-shared/constraint-tools'

import sys
# Make modules under {CONSTRAINT_TOOLS}/utilities importable
# (presumably where shell, colorize and aggregate live -- TODO confirm).
sys.path.append(f'{CONSTRAINT_TOOLS}/utilities')

In [None]:
import pandas as pd
import numpy as np

from shell import shell 
from colorize import print_string_as_info
from aggregate import aggregate

## Assign depletion-rank scores to non-exonic Chen windows 

In [None]:
def get_header(filename):
  """Return the column names found on the first line of `filename`.

  The first line is stripped of leading/trailing whitespace and split on tabs.
  An empty file yields `['']`.
  """
  with open(filename) as handle:
    first_line = next(handle, '') # '' when the file has no lines at all
  return first_line.strip().split('\t')

def intersect(a_filename, b_filename, intersect_filename, b_class): 
  """Intersect two headered, tab-separated interval files with `bedtools intersect`.

  The header line of each input is skipped (`tail -n +2`) before the records are
  handed to bedtools, and the result is written, without a header, to
  `intersect_filename`. bedtools is run with `-wao` (report every A record together
  with each overlapping B record and the overlap size; A records without an overlap
  are reported with a NULL B record) and `-f 0.5` (minimum overlap, as a fraction of A).

  Args:
    a_filename: path of the A file; its first line is a header.
    b_filename: path of the B file; its first line is a header.
    intersect_filename: path the bedtools output is redirected to.
    b_class: prefix added to the first three B column names so they can be told
      apart from the A columns.

  Returns:
    (a_file_header, b_file_header): the A column names as read from the file, and
    the B column names with the first three prefixed by `{b_class}_`.
  """
  import shlex # local import so this definition does not depend on the notebook's import cell

  # Quote every path before interpolating it into the command line, so that a path
  # containing spaces or shell metacharacters stays a single word and is not interpreted
  # by the shell. Paths made only of safe characters are left unchanged by shlex.quote.
  a_path, b_path, intersect_path = (
    shlex.quote(str(path)) for path in (a_filename, b_filename, intersect_filename)
  )

  # NOTE(review): `<(...)` is process substitution, so shell() is assumed to run bash -- confirm.
  cmd = (
    f'bedtools intersect'
    f" -a <(tail -n +2 {a_path})" # contains header
    f" -b <(tail -n +2 {b_path})" # contains header
    f' -wao -f 0.5'
    f' > {intersect_path}'
  )
  shell(cmd)
  print_string_as_info(f'Wrote {intersect_filename}')

  a_file_header = get_header(a_filename) # contains header
  b_file_header = get_header(b_filename) # contains header
  # only the first three B columns are prefixed; the remaining B column names are kept as they are
  b_file_header = [f'{b_class}_{field}' for field in b_file_header[:3]] + b_file_header[3:]

  return a_file_header, b_file_header

def set_column_dtypes(df, column_dtypes, log=False):
  """Return a copy of `df` with '.' replaced by NaN and the given columns cast.

  Args:
    df: input DataFrame; it is not modified.
    column_dtypes: mapping of column name -> dtype to cast that column to.
    log: when true, print the resulting column dtypes.

  Returns:
    A new DataFrame in which every '.' value (in any column) is NaN and each
    column named in `column_dtypes` has the requested dtype.
  """
  # '.' is replaced across the whole frame, not only in the columns being cast
  converted = df.replace('.', np.nan).astype(column_dtypes)

  if log:
    print(dict(converted.dtypes))
  return converted

def make_scores_numeric(df, b_class, b_features): 
  """Keep only rows that carry a B record and cast the B feature columns to float.

  Args:
    df: intersection table containing a `{b_class}_Chr` column and the `b_features` columns.
    b_class: prefix identifying the B chromosome column (`{b_class}_Chr`).
    b_features: names of the columns to convert to float.

  Returns:
    A new DataFrame without the rows whose `{b_class}_Chr` is '.', and with every
    column in `b_features` cast to float.
  """
  # rows whose B chromosome is '.' did not overlap any B window, so they have no score to keep
  has_b_record = df[f'{b_class}_Chr'] != '.'
  overlapping = df[has_b_record]

  # with those rows gone, the feature columns can be converted to floats
  return set_column_dtypes(overlapping, dict.fromkeys(b_features, float))
  
def intersect_and_aggregate(a_filename_stem, b_filename, b_class, b_features, b_class_aggregation): 
  """Intersect the A windows with the B windows and aggregate B features per A window.

  Files read and written (all derived from `a_filename_stem`):
    - reads  `{a_filename_stem}.bed` (the A file) and `b_filename` (the B file)
    - writes `{a_filename_stem}.intersect.{b_class}.bed` (raw bedtools intersection)
    - writes `{a_filename_stem}.{b_class}.bed` (aggregated table, tab-separated, with header)

  Args:
    a_filename_stem: path of the A file without its `.bed` extension.
    b_filename: path of the B file.
    b_class: label for the B file, used in column names and output file names.
    b_features: B columns to aggregate.
    b_class_aggregation: aggregation applied to each B feature within an A window.

  Returns:
    The aggregated DataFrame that was written to `{a_filename_stem}.{b_class}.bed`.
  """
  a_filename = f'{a_filename_stem}.bed'
  intersect_filename = f'{a_filename_stem}.intersect.{b_class}.bed'
  output_filename = f'{a_filename_stem}.{b_class}.bed'

  a_columns, b_columns = intersect(a_filename, b_filename, intersect_filename, b_class)

  # the intersection file has no header: A columns, then B columns, then the overlap column
  overlap_column = f'chenWindow_{b_class}Window_overlap'
  intersected = pd.read_csv(
    intersect_filename,
    sep='\t',
    names=[*a_columns, *b_columns, overlap_column],
  )

  scored = make_scores_numeric(intersected, b_class, b_features)

  # an A window may intersect several B windows, so group by A window and
  # aggregate each feature over all B windows in the group
  aggregation_functions = {}
  aggregated_to_feature_name = {}
  for b_feature in b_features:
    aggregation_functions[b_feature] = [b_class_aggregation]
    # map the '<aggregation> <feature>' column name back to the plain feature name
    aggregated_to_feature_name[f'{b_class_aggregation} {b_feature}'] = b_feature

  aggregated = aggregate(
    scored,
    group_columns = a_columns,
    aggregation_functions = aggregation_functions,
  )
  aggregated = aggregated.rename(columns=aggregated_to_feature_name)

  aggregated.to_csv(output_filename, sep='\t', index=False)
  print_string_as_info(f'Wrote {output_filename}')

  return aggregated

def assign_scores_to_chen_windows():
  """Assign depletion_rank constraint scores to the non-exonic Chen et al. windows.

  Runs `intersect_and_aggregate` on the Chen et al. window file and the
  depletion-rank score file under `CONSTRAINT_TOOLS_DATA`, aggregating the
  `rank` column with `min`, and returns the resulting DataFrame.
  """
  chen_windows_stem = f'{CONSTRAINT_TOOLS_DATA}/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM/Supplementary_Data_2.gnocchi.N_expected.N_observed.B.paternal_recombination_rate.maternal_recombination_rate.gBGC-tract-counts.non-exonic.gBGC'
  depletion_rank_filename = f'{CONSTRAINT_TOOLS_DATA}/depletion_rank_scores/41586_2022_4965_MOESM3_ESM'

  chen_windows_with_scores = intersect_and_aggregate(
    a_filename_stem = chen_windows_stem,
    b_filename = depletion_rank_filename,
    b_class = 'depletion_rank_constraint_score',
    b_features = ['rank'],
    # 'min' gives each Chen window the score of its most constrained 500bp overlapping window
    b_class_aggregation = 'min',
  )

  print_string_as_info('Assigned depletion_rank scores to Chen et al. windows')

  return chen_windows_with_scores

# Run the pipeline; as the cell's last expression, the returned DataFrame is shown as the cell output.
assign_scores_to_chen_windows()