In [1]:
# Notebook configuration.
# NOTE(review): both locations are absolute paths hard-coded to one cluster/user layout;
# the notebook will not run elsewhere without editing them.
# Root of the constraint-tools checkout; used below to locate its `utilities` directory.
CONSTRAINT_TOOLS = '/scratch/ucgd/lustre-work/quinlan/u6018199/constraint-tools'
# Root of the shared data directory from which the input files are read (and next to which outputs are written).
CONSTRAINT_TOOLS_DATA = '/scratch/ucgd/lustre-labs/quinlan/data-shared/constraint-tools'

import sys
# Make the modules in `utilities` importable
# (presumably `shell`, `colorize` and `aggregate`, imported in the next cell -- TODO confirm).
sys.path.append(f'{CONSTRAINT_TOOLS}/utilities')

In [2]:
import pandas as pd
import numpy as np

from shell import shell 
from colorize import print_string_as_info
from aggregate import aggregate

## Assign depletion-rank scores to non-exonic Chen windows 

In [3]:
def get_header(filename):
  """Return the column names from the first line of a tab-separated file.

  Only the first line is read. It is stripped of leading/trailing whitespace
  and then split on tab characters.

  Args:
    filename: path of the file whose first line is the header.

  Returns:
    list of str, one entry per tab-separated field (``['']`` for an empty file).
  """
  with open(filename) as header_source:
    first_line = header_source.readline()

  column_names = first_line.strip().split('\t')
  return column_names

def intersect(a_filename, b_filename, intersect_filename, b_class):
  """Run `bedtools intersect` on two headered files and return the column names of both.

  The header line of each input is dropped with `tail -n +2` before the records
  are handed to bedtools, and the result is redirected to `intersect_filename`.
  The options used are `-wao -f 0.5`; see the bedtools documentation for their
  exact semantics.

  Args:
    a_filename: path of the "A" file; its first line is a header.
    b_filename: path of the "B" file; its first line is a header.
    intersect_filename: path the intersection output is written to.
    b_class: prefix applied to the first three header fields of the B file.

  Returns:
    (a_file_header, b_file_header): the header fields of A unchanged, and the
    header fields of B with the first three renamed to `{b_class}_{field}`.
  """
  # NOTE(review): paths are interpolated into the command unquoted, so paths containing
  # whitespace or shell metacharacters would break the command.
  # NOTE(review): `<(...)` is process substitution; this assumes `shell` runs the command
  # under a shell that supports it -- TODO confirm.
  a_records = f'<(tail -n +2 {a_filename})' # skip the header line of A
  b_records = f'<(tail -n +2 {b_filename})' # skip the header line of B
  cmd = ' '.join([
    'bedtools intersect',
    f'-a {a_records}',
    f'-b {b_records}',
    '-wao -f 0.5',
    f'> {intersect_filename}',
  ])
  shell(cmd)
  print_string_as_info(f'Wrote {intersect_filename}')

  a_file_header = get_header(a_filename)
  b_fields = get_header(b_filename)

  # Only the first three B fields get the class prefix; the remaining fields keep their names.
  b_file_header = [
    f'{b_class}_{field}' if position < 3 else field
    for position, field in enumerate(b_fields)
  ]

  return a_file_header, b_file_header

def set_column_dtypes(df, column_dtypes, log=False):
  """Replace '.' placeholders with NaN and cast the requested columns.

  Args:
    df: input DataFrame; it is not modified.
    column_dtypes: mapping of column name -> dtype to cast that column to.
    log: when True, print the resulting column dtypes as a dict.

  Returns:
    A new DataFrame in which every '.' value (in any column) is NaN and each
    column named in `column_dtypes` has been converted with `astype`.
  """
  # `replace` returns a new frame, so the assignments below never touch the caller's frame.
  typed = df.replace('.', np.nan)

  for column_name, target_dtype in column_dtypes.items():
    converted = typed[column_name].astype(target_dtype)
    typed[column_name] = converted

  if log:
    print(dict(typed.dtypes))

  return typed

def make_scores_numeric(df, b_class, b_features):
  """Drop rows without a B-window overlap and cast the score columns to float.

  Args:
    df: intersection table containing a `{b_class}_Chr` column and the `b_features` columns.
    b_class: prefix identifying the B-file coordinate columns.
    b_features: names of the score columns to convert to float.

  Returns:
    A new DataFrame restricted to rows whose `{b_class}_Chr` is not '.', with
    every column in `b_features` converted to float.
  """
  # A '.' in the B chromosome column marks a window that did not overlap any scored window;
  # such rows are excluded.
  has_overlap = df[f'{b_class}_Chr'] != '.'
  overlapping = df[has_overlap]

  # With the '.' rows gone, the score columns can be converted to floats.
  float_dtypes = dict.fromkeys(b_features, float)
  return set_column_dtypes(overlapping, float_dtypes)
  
def intersect_and_aggregate(a_filename_stem, b_filename, b_class, b_features, b_class_aggregation):
  """Intersect A windows with B windows and collapse the B scores to one value per A window.

  Reads `{a_filename_stem}.bed`, writes the raw intersection to
  `{a_filename_stem}.intersect.{b_class}.bed` and the aggregated table to
  `{a_filename_stem}.{b_class}.bed` (tab-separated, no index column).

  Args:
    a_filename_stem: path of the A file without its `.bed` extension.
    b_filename: path of the B file (with header).
    b_class: label for the B data; used in file names and column prefixes.
    b_features: names of the B score columns to aggregate.
    b_class_aggregation: aggregation applied to each score column (e.g. 'min').

  Returns:
    The aggregated DataFrame that was written to disk.
  """
  intersect_filename = f'{a_filename_stem}.intersect.{b_class}.bed'
  output_filename = f'{a_filename_stem}.{b_class}.bed'

  a_file_header, b_file_header = intersect(
    f'{a_filename_stem}.bed',
    b_filename,
    intersect_filename,
    b_class,
  )

  # The intersection file has no header: its columns are A's, then B's, then one overlap column.
  overlap_column = f'chenWindow_{b_class}Window_overlap'
  intersections = pd.read_csv(
    intersect_filename,
    sep='\t',
    names=[*a_file_header, *b_file_header, overlap_column],
  )

  scored = make_scores_numeric(intersections, b_class, b_features)

  # some Chen-windows may intersect multiple b_feature-windows,
  # so group by Chen-window and aggregate the scores over all b_feature-windows in each group
  aggregated = aggregate(
    scored,
    group_columns = a_file_header,
    aggregation_functions = {b_feature: [b_class_aggregation] for b_feature in b_features},
  )

  # presumably `aggregate` names its outputs '<aggregation> <feature>'; restore the plain feature names
  plain_feature_names = {
    f'{b_class_aggregation} {b_feature}': b_feature
    for b_feature in b_features
  }
  aggregated = aggregated.rename(columns=plain_feature_names)

  # NOTE(review): only a column literally named 'rank' is relabelled as `b_class`;
  # other entries of `b_features` keep their own names.
  aggregated = aggregated.rename(columns={'rank': b_class})

  aggregated.to_csv(output_filename, sep='\t', index=False)
  print_string_as_info(f'Wrote {output_filename}')

  return aggregated

def assign_scores_to_chen_windows():
  """Assign depletion_rank constraint scores to the non-exonic Chen et al. windows.

  Delegates to `intersect_and_aggregate` with the Chen et al. window file as A
  and the depletion-rank score file as B, aggregating the 'rank' column with 'min'.

  Returns:
    The DataFrame returned by `intersect_and_aggregate`.
  """
  score_class = 'depletion_rank_constraint_score'
  chen_windows_stem = f'{CONSTRAINT_TOOLS_DATA}/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM/Supplementary_Data_2.gnocchi.N_expected.N_observed.B.paternal_recombination_rate.maternal_recombination_rate.gBGC-tract-counts.non-exonic.gBGC'
  depletion_rank_filename = f'{CONSTRAINT_TOOLS_DATA}/depletion_rank_scores/41586_2022_4965_MOESM3_ESM'

  chen_windows_with_scores = intersect_and_aggregate(
    a_filename_stem = chen_windows_stem,
    b_filename = depletion_rank_filename,
    b_class = score_class,
    b_features = ['rank'],
    # assign to the Chen window the score of the most constrained 500bp overlapping window
    b_class_aggregation = 'min',
  )

  print_string_as_info('Assigned depletion_rank scores to Chen et al. windows')

  return chen_windows_with_scores

# Run the pipeline. The call runs bedtools and writes the intersection and score files,
# and its return value (the aggregated DataFrame) is the last expression of the cell.
assign_scores_to_chen_windows()




[36mWrote /scratch/ucgd/lustre-labs/quinlan/data-shared/constraint-tools/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM/Supplementary_Data_2.gnocchi.N_expected.N_observed.B.paternal_recombination_rate.maternal_recombination_rate.gBGC-tract-counts.non-exonic.gBGC.intersect.depletion_rank_constraint_score.bed[0m
  df = intersect_and_aggregate(
[36mWrote /scratch/ucgd/lustre-labs/quinlan/data-shared/constraint-tools/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM/Supplementary_Data_2.gnocchi.N_expected.N_observed.B.paternal_recombination_rate.maternal_recombination_rate.gBGC-tract-counts.non-exonic.gBGC.depletion_rank_constraint_score.bed[0m
[36mAssigned depletion_rank scores to Chen et al. windows[0m


Unnamed: 0,chrom,start,end,gnocchi,N_expected,N_observed,window overlaps enhancer,window overlaps merged_exon,B,paternal_recombination_rate,maternal_recombination_rate,gBGC-tract_overlap_count,log10_paternal_recombination_rate,log10_maternal_recombination_rate,B_M1star.EUR,B0_M3starHMT6.EUR,B1_M3starHMT6.EUR,Bmean_M3starHMT6.EUR,lambda_M3starHMT6.EUR,depletion_rank_constraint_score
0,chr1,1432000,1433000,4.299894,338.059552,259,True,False,0.653,0.039046,0.025852,0,-1.408422,-1.587499,0.108103,0.057741,0.513191,0.105518,2.585568,0.776961
1,chr1,1451000,1452000,0.666316,302.590657,291,False,False,0.652,0.039046,0.025852,0,-1.408422,-1.587499,0.108103,0.057741,0.513191,0.105518,2.585568,0.679834
2,chr1,1453000,1454000,0.828398,269.601925,256,False,False,0.651,0.039046,0.025852,0,-1.408422,-1.587499,0.108103,0.057741,0.513191,0.105518,2.585568,0.320987
3,chr1,1458000,1459000,-0.086128,270.583243,272,False,False,0.651,0.039046,0.025852,0,-1.408422,-1.587499,0.108103,0.057741,0.513191,0.105518,2.585568,0.718503
4,chr1,1463000,1464000,2.948188,236.321790,191,True,False,0.651,0.039046,0.025852,0,-1.408422,-1.587499,0.108103,0.057741,0.513191,0.105518,2.585568,0.170495
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1003313,chr9,137262000,137263000,-2.396149,263.131293,302,True,False,0.536,0.057053,0.063434,0,-1.243724,-1.197675,0.165515,0.135944,3.636507,0.219258,2.281162,0.968399
1003314,chr9,137268000,137269000,3.640544,272.046527,212,True,False,0.536,0.057053,0.063434,0,-1.243724,-1.197675,0.165515,0.135944,3.636507,0.219258,2.281162,0.401000
1003315,chr9,137269000,137270000,5.276351,300.458958,209,True,False,0.536,0.057053,0.063434,0,-1.243724,-1.197675,0.165515,0.135944,3.636507,0.219258,2.281162,0.282586
1003316,chr9,137275000,137276000,2.687348,339.517045,290,True,False,0.536,1.564889,0.024254,0,0.194484,-1.615221,0.165515,0.135944,3.636507,0.219258,2.281162,0.203953
