In [4]:
CONSTRAINT_TOOLS = '/scratch/ucgd/lustre-work/quinlan/u6018199/constraint-tools'
CONSTRAINT_TOOLS_DATA = '/scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools'

import sys
sys.path.append(f'{CONSTRAINT_TOOLS}/utilities')

## Assign scores to non-exonic Chen windows 

In [5]:
import pandas as pd
import numpy as np

from shell import shell 
from colorize import print_string_as_info
from aggregate import aggregate

def get_chen_windows_with_enhancers_exons():
  # the following bed file was produced by experiments/germline-model/chen-et-al-2022/Fig_2a.Nonly.noisy.ipynb
  filename = f'{CONSTRAINT_TOOLS_DATA}/benchmark-genome-wide-predictions/chen-et-al-2022/mchale.kmerSizes.trainSets.noisy.enhancer-exon.bed'
  df = pd.read_csv(filename, sep='\t')
  df = df[['chromosome', 'start', 'end', 'window overlaps enhancer', 'window overlaps merged_exon']]
  df = df.rename(columns={
    'chromosome': 'chrom'
  })
  return df

WINDOWS_0 = get_chen_windows_with_enhancers_exons()
WINDOWS_0

Unnamed: 0,chrom,start,end,window overlaps enhancer,window overlaps merged_exon
0,chr1,1432000,1433000,True,False
1,chr1,1435000,1436000,True,True
2,chr1,1449000,1450000,False,True
3,chr1,1450000,1451000,False,True
4,chr1,1451000,1452000,False,False
...,...,...,...,...,...
1786072,chr9,137269000,137270000,True,False
1786073,chr9,137275000,137276000,True,False
1786074,chr9,137282000,137283000,True,True
1786075,chr9,137290000,137291000,False,False


In [6]:
def get_header(filename):
  with open(filename) as fh: 
    header = fh.readline().strip().split('\t')
  return header

def intersect(a_filename, b_filename, intersect_filename, b_feature): 
  cmd = (
    f'bedtools intersect'
    f" -a <(tail -n +2 {a_filename})" # contains header
    f" -b <(cat {b_filename})"        # does not contain header
    f' -wao -f 0.5'
    f' > {intersect_filename}'
  )
  shell(cmd)
  print_string_as_info(f'Wrote {intersect_filename}')

  a_file_header = get_header(a_filename) # contains header
  b_file_header = [f'{b_feature}_chrom', f'{b_feature}_start', f'{b_feature}_end', b_feature] # does not contain header

  return a_file_header, b_file_header

def count_overlaps(a_filename, b_filename, count_filename, b_feature): 
  cmd = (
    f'bedtools intersect'
    f" -a <(tail -n +2 {a_filename})" # contains header
    f" -b <(cat {b_filename})"        # does not contain header
    f' -c'                            # count overlaps
    f' > {count_filename}'
  )
  shell(cmd)
  print_string_as_info(f'Wrote {count_filename}')

  return get_header(a_filename) + [f'{b_feature}_overlap_count']

def set_column_dtypes(df, column_dtypes, log=False): 
  for col, dtype in column_dtypes.items(): 
    df[col] = df[col].astype(dtype)

  if log: print(dict(df.dtypes))
  return df 

def make_scores_numeric(df, b_feature): 
  # we don't want to include enhancers that don't overlap any windows with b_feature values
  df = df[df[f'{b_feature}_chrom'] != '.'] 

  # having removed records where 'b_feature_chrom' is '.', we can now convert the 'b_feature' column to a float
  df = set_column_dtypes(
    df,   
    column_dtypes = {
      b_feature: float,
    }
  )

  return df 
  
def intersect_and_aggregate(a_filename_stem, b_filename, b_feature, b_feature_aggregation): 
  a_filename = f'{a_filename_stem}.bed'
  intersect_filename = f'{a_filename_stem}.intersect.{b_feature}-map.bed'

  a_file_header, b_file_header = intersect(a_filename, b_filename, intersect_filename, b_feature)

  df = pd.read_csv(
    intersect_filename, 
    sep='\t',
    names=a_file_header + b_file_header + [f'chenWindow_{b_feature}Window_overlap'],
  )

  df = make_scores_numeric(df, b_feature)

  # some Chen-windows may intersect multiple b_feature-windows, 
  # so let's group by Chen-window, and aggregate scores over all b_feature-windows in the group
  df = aggregate(
    df, 
    group_columns = a_file_header,
    aggregation_functions = {
      b_feature: [b_feature_aggregation],
    }
  )
  df = df.rename(columns={f'{b_feature_aggregation} {b_feature}': b_feature})

  df.to_csv(
    f'{a_filename_stem}.{b_feature}.bed',
    sep='\t',
    index=False,
  )

def count_overlaps_wrapper(a_filename_stem, b_filename, b_feature): 
  a_filename = f'{a_filename_stem}.bed'
  count_filename = f'{a_filename_stem}.{b_feature}-counts.bed'

  count_file_header = count_overlaps(a_filename, b_filename, count_filename, b_feature)

  df = pd.read_csv(
    count_filename, 
    sep='\t',
    names=count_file_header,
  )

  df.to_csv(
    count_filename,
    sep='\t',
    index=False,
  )

def assign_scores_to_chen_windows():
  df_gnocchi = pd.read_csv(
    f'{CONSTRAINT_TOOLS_DATA}/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM/Supplementary_Data_2-gnocchi.bed',
    # f'{CONSTRAINT_TOOLS_DATA}/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM/Supplementary_Data_2-gnocchi.test.bed', # TODO: comment out this line
    sep='\t',
    names=['chrom', 'start', 'end', 'gnocchi']
  )
  df_N_expected = pd.read_csv(
    f'{CONSTRAINT_TOOLS_DATA}/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM/Supplementary_Data_2-N_expected_gnocchi.bed',
    sep='\t',
    names=['chrom', 'start', 'end', 'N_expected']
  )
  df_N_observed = pd.read_csv(
    f'{CONSTRAINT_TOOLS_DATA}/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM/Supplementary_Data_2-N_observed.bed',
    sep='\t',
    names=['chrom', 'start', 'end', 'N_observed']
  )
  df_enhancers_exons = WINDOWS_0

  df = pd.merge(
    df_gnocchi, 
    df_N_expected, 
    on=['chrom', 'start', 'end'],
    how='inner'
  )
  df = pd.merge(
    df, 
    df_N_observed, 
    on=['chrom', 'start', 'end'],
    how='inner'
  )
  df = pd.merge(
    df, 
    df_enhancers_exons, 
    on=['chrom', 'start', 'end'],
    how='inner'
  )

  df.to_csv(
    f'{CONSTRAINT_TOOLS_DATA}/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM/Supplementary_Data_2.gnocchi.N_expected.N_observed.bed',
    sep='\t',
    index=False,
  )

  # assign BGS values to Chen et al. windows 
  intersect_and_aggregate(
    a_filename_stem = f'{CONSTRAINT_TOOLS_DATA}/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM/Supplementary_Data_2.gnocchi.N_expected.N_observed', 
    b_filename = f'{CONSTRAINT_TOOLS_DATA}/background-selection/CADD-B-map/bmap.hg38.bed', 
    b_feature = 'B', 
    b_feature_aggregation = 'max'
  )   

  # assign paternal recombination rates to Chen et al. windows 
  intersect_and_aggregate(
    a_filename_stem = f'{CONSTRAINT_TOOLS_DATA}/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM/Supplementary_Data_2.gnocchi.N_expected.N_observed.B', 
    b_filename = f'{CONSTRAINT_TOOLS_DATA}/GC-biased-gene-conversion/paternal-recombination-rate.grch38.bed', 
    b_feature = 'paternal_recombination_rate', 
    b_feature_aggregation = 'mean',
  )   

  # assign maternal recombination rates to Chen et al. windows 
  intersect_and_aggregate(
    a_filename_stem = f'{CONSTRAINT_TOOLS_DATA}/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM/Supplementary_Data_2.gnocchi.N_expected.N_observed.B.paternal_recombination_rate', 
    b_filename = f'{CONSTRAINT_TOOLS_DATA}/GC-biased-gene-conversion/maternal-recombination-rate.grch38.bed', 
    b_feature = 'maternal_recombination_rate', 
    b_feature_aggregation = 'mean'
  )   

  # compute number of gBGC tracts that each Chen et al. window overlaps
  count_overlaps_wrapper(
    a_filename_stem = f'{CONSTRAINT_TOOLS_DATA}/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM/Supplementary_Data_2.gnocchi.N_expected.N_observed.B.paternal_recombination_rate.maternal_recombination_rate', 
    b_filename = f'{CONSTRAINT_TOOLS_DATA}/GC-biased-gene-conversion/gBGC-tracts.hg38.bed', 
    b_feature = 'gBGC-tract'
  )

  df = pd.read_csv(
    f'{CONSTRAINT_TOOLS_DATA}/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM/Supplementary_Data_2.gnocchi.N_expected.N_observed.B.paternal_recombination_rate.maternal_recombination_rate.gBGC-tract-counts.bed',
    sep='\t',
  )

  df = df[df['window overlaps merged_exon'] == False] # strictly non-exonic windows

  # throw out windows with recombination rates less than 1e-8, and take the log10 of the recombination rates
  for sex in ['paternal', 'maternal']:
    df = df[df[f'{sex}_recombination_rate'] > 1e-8] # log10(0) is undefined 
    df[f'log10_{sex}_recombination_rate'] = np.log10(df[f'{sex}_recombination_rate'])

  df.to_csv(
    f'{CONSTRAINT_TOOLS_DATA}/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM/Supplementary_Data_2.gnocchi.N_expected.N_observed.B.paternal_recombination_rate.maternal_recombination_rate.gBGC-tract-counts.non-exonic.bed',
    sep='\t',
    index=False,
  )

  return df

pd.set_option('display.max_columns', 100)

WINDOWS = assign_scores_to_chen_windows()
WINDOWS




[36mWrote /scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM/Supplementary_Data_2.gnocchi.N_expected.N_observed.intersect.B-map.bed[0m
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(dtype)





[36mWrote /scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM/Supplementary_Data_2.gnocchi.N_expected.N_observed.B.intersect.paternal_recombination_rate-map.bed[0m





[36mWrote /scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM/Supplementary_Data_2.gnocchi.N_expected.N_observed.B.paternal_recombination_rate.intersect.maternal_recombination_rate-map.bed[0m





[36mWrote /scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools/chen-et-al-2023-published-version/41586_2023_6045_MOESM4_ESM/Supplementary_Data_2.gnocchi.N_expected.N_observed.B.paternal_recombination_rate.maternal_recombination_rate.gBGC-tract-counts.bed[0m


Unnamed: 0,chrom,start,end,gnocchi,N_expected,N_observed,window overlaps enhancer,window overlaps merged_exon,B,paternal_recombination_rate,maternal_recombination_rate,gBGC-tract_overlap_count,log10_paternal_recombination_rate,log10_maternal_recombination_rate
0,chr1,1432000,1433000,4.299894,338.059552,259,True,False,0.653,0.039046,0.025852,0,-1.408422,-1.587499
4,chr1,1451000,1452000,0.666316,302.590657,291,False,False,0.652,0.039046,0.025852,0,-1.408422,-1.587499
5,chr1,1453000,1454000,0.828398,269.601925,256,False,False,0.651,0.039046,0.025852,0,-1.408422,-1.587499
7,chr1,1458000,1459000,-0.086128,270.583243,272,False,False,0.651,0.039046,0.025852,0,-1.408422,-1.587499
11,chr1,1463000,1464000,2.948188,236.321790,191,True,False,0.651,0.039046,0.025852,0,-1.408422,-1.587499
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1556241,chr9,137262000,137263000,-2.396149,263.131293,302,True,False,0.536,0.057053,0.063434,0,-1.243724,-1.197675
1556242,chr9,137268000,137269000,3.640544,272.046527,212,True,False,0.536,0.057053,0.063434,0,-1.243724,-1.197675
1556243,chr9,137269000,137270000,5.276351,300.458958,209,True,False,0.536,0.057053,0.063434,0,-1.243724,-1.197675
1556244,chr9,137275000,137276000,2.687348,339.517045,290,True,False,0.536,1.564889,0.024254,0,0.194484,-1.615221
