In [None]:
import logging 

# Root logger setup for the notebook: INFO and above, each message prefixed with a timestamp.
logging.basicConfig(
  level=logging.INFO,
  format='%(asctime)s %(message)s',
  datefmt='%Y-%m-%d %H:%M:%S',
)

In [None]:
import pyranges as pr

# Root of the shared constraint-tools data directory; paths below are built from it.
CONSTRAINT_TOOLS_DATA = '/scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools'
cpg_islands_path = f'{CONSTRAINT_TOOLS_DATA}/cpg-islands/grch38/cpg-islands.sorted.bed.gz'

# Column labels applied to the loaded table, replacing whatever names read_bed assigned.
cpg_island_columns = [
  'Chromosome', 'Start', 'End', 'Name', 'Length',
  'cpgNum', 'gcNum', 'perCpg', 'perGc', 'obsExp',
]

# https://biocore-ntnu.github.io/pyranges/loadingcreating-pyranges.html
cpg_islands = pr.read_bed(cpg_islands_path)
cpg_islands.columns = cpg_island_columns
cpg_islands

In [None]:
import sys
sys.path.append('/scratch/ucgd/lustre-work/quinlan/u6018199/constraint-tools/utilities')

from read_model import read_model 

# Model loaded from JSON through the project's read_model helper.
model_path = '/scratch/ucgd/lustre-work/quinlan/u6018199/constraint-tools/dist/model-germline-grch38.json'
model = read_model(model_path)

# The model's 'neutralRegions' entry is handed straight to read_bed
# (presumably a path to a BED file — confirm against read_model).
# https://biocore-ntnu.github.io/pyranges/loadingcreating-pyranges.html
neutral_regions_source = model['neutralRegions']
neutral_regions = pr.read_bed(neutral_regions_source)
neutral_regions

In [None]:
# Give every island an identifier taken from the index of the DataFrame view of
# `cpg_islands`; later cells join on this `cpg_island_id` column.
island_index_values = cpg_islands.df.index.values
cpg_islands.cpg_island_id = island_index_values
cpg_islands

In [None]:
# Keep the islands reported by PyRanges.overlap against the neutral regions,
# then switch to a plain DataFrame for the pandas steps that follow.
# https://pyranges.readthedocs.io/en/latest/autoapi/pyranges/pyranges/index.html?highlight=intersect#pyranges.pyranges.PyRanges.overlap

overlapping_islands = cpg_islands.overlap(neutral_regions)
cpg_islands_that_overlap_neutral_regions = overlapping_islands.df
cpg_islands_that_overlap_neutral_regions

In [None]:
# Intersect the islands with the neutral regions and keep, for each resulting
# interval, its coordinates plus the id of the island it came from.
# https://pyranges.readthedocs.io/en/latest/autoapi/pyranges/pyranges/index.html?highlight=intersect#pyranges.pyranges.PyRanges.intersect

neutral_part_columns = ['Start', 'End', 'cpg_island_id']
island_neutral_intersection = cpg_islands.intersect(neutral_regions)
neutral_regions_within_cpg_islands = island_neutral_intersection.df[neutral_part_columns]
neutral_regions_within_cpg_islands


In [None]:
import pandas as pd

# Both inputs carry Start/End columns, so the merge suffixes them with _x (island
# side) and _y (neutral-part side); map those to descriptive names.
merged_column_names = {
  "Start_x": "Start_cpg_island",
  "End_x": "End_cpg_island",
  "Start_y": "Start_neutral_part",
  "End_y": "End_neutral_part",
}

# One output row per (island, neutral part) pair sharing a cpg_island_id.
cpg_islands_with_neutral_regions = (
  cpg_islands_that_overlap_neutral_regions
  .merge(neutral_regions_within_cpg_islands, on='cpg_island_id', how='inner')
  .rename(columns=merged_column_names)
)
cpg_islands_with_neutral_regions


In [None]:
def compute_neutral_part_length(row):
  """Return `End_neutral_part` minus `Start_neutral_part` for `row`.

  `row` only needs to support lookup by those two labels.
  """
  end = row['End_neutral_part']
  start = row['Start_neutral_part']
  return end - start

# Compute every length in one vectorized column subtraction rather than invoking
# the helper once per row through `apply(axis=1)`: the helper only looks up the
# two labels and subtracts, so it accepts the whole frame as readily as a single
# row. This is faster and also behaves sensibly when the frame has no rows.
cpg_islands_with_neutral_regions['neutral_part_length'] = compute_neutral_part_length(cpg_islands_with_neutral_regions)
cpg_islands_with_neutral_regions

In [None]:
# Only the island id and the per-part length are used by the aggregation cells below.
columns_for_aggregation = ['cpg_island_id', 'neutral_part_length']
cpg_islands_with_neutral_regions = cpg_islands_with_neutral_regions[columns_for_aggregation]
cpg_islands_with_neutral_regions


In [None]:
# Group the neutral parts by the island they belong to; the grouping is reused
# by the following cell.
grouped_neutral_regions = cpg_islands_with_neutral_regions.groupby('cpg_island_id')

# Neutral bases per island: the sum of the part lengths within each group.
part_lengths_by_island = grouped_neutral_regions['neutral_part_length']
number_neutral_bases = part_lengths_by_island.sum().rename("number_neutral_bases")
number_neutral_bases

In [None]:
# Neutral parts per island: the count of non-missing part lengths within each group.
part_counts_by_island = grouped_neutral_regions['neutral_part_length'].count()
number_neutral_parts = part_counts_by_island.rename('number_neutral_parts')
number_neutral_parts

In [None]:
# Attach the per-island totals to the island table. Inner joins keep only the
# islands that appear in both aggregates.
# NOTE: `cpg_islands` is rebound here from the PyRanges object to the merged
# frame, so this cell reads `.df` from a name it then overwrites — re-run the
# loading cells before re-running this one.
islands_frame = cpg_islands.df
islands_with_base_counts = islands_frame.merge(number_neutral_bases, on='cpg_island_id', how='inner')
cpg_islands = islands_with_base_counts.merge(number_neutral_parts, on='cpg_island_id', how='inner')

cpg_islands

In [None]:
# Keep islands with more than 1000 neutral bases spread over fewer than 5 neutral parts.
# https://pandas.pydata.org/pandas-docs/stable/getting_started/intro_tutorials/03_subset_data.html#how-do-i-filter-specific-rows-from-a-dataframe
has_enough_neutral_bases = cpg_islands.number_neutral_bases > 1000
has_few_neutral_parts = cpg_islands.number_neutral_parts < 5
cpg_islands = cpg_islands[has_enough_neutral_bases & has_few_neutral_parts]
cpg_islands

In [None]:
sys.path.append('/scratch/ucgd/lustre-work/quinlan/u6018199/constraint-tools/predict-constraint/germline-model')

import numpy as np 

from expected_observed_counts import compute_expected_observed_counts
from pack_unpack import pack

# Stride passed to compute_expected_observed_counts for every island
# (units are not visible in this notebook — TODO confirm).
window_stride = 10 

def filter_and_average(xs): 
  """Return the mean of the entries of `xs` that are not `None`.

  Zero is treated as a real value and takes part in the mean; a plain
  truthiness test would silently drop it and bias the result.
  NOTE(review): this assumes missing entries are marked with `None` —
  confirm that the upstream lists never use 0 as a placeholder.

  Returns `nan` when `xs` has no usable entries.
  """
  values = [x for x in xs if x is not None]
  if not values:
    # np.mean([]) would also give nan, but only after emitting a RuntimeWarning
    return np.nan
  return np.mean(values)

def compute_neutral_zscores(row): 
  """Return a copy of `row` extended with mean neutral-region scores.

  Packs the row's Chromosome/Start/End into a region, calls
  `compute_expected_observed_counts` on it with the notebook-level `model`
  and `window_stride`, and reduces the 'NBarsNeutralRegions' and
  'KBarsNeutralRegions' entries of the result with `filter_and_average`.

  The two averages are stored under 'N_bar_mean_neutral' and
  'K_bar_mean_neutral'. The row passed in is left untouched: pandas does not
  support functions that mutate the object handed to them by `apply`.
  """
  region = pack(row.Chromosome, row.Start, row.End)
  expected_observed_counts = compute_expected_observed_counts(region, model, window_stride, log=False)
  # Write to a private copy so nothing leaks back into the caller's frame and
  # there is no chained-assignment warning to suppress.
  annotated_row = row.copy()
  annotated_row['N_bar_mean_neutral'] = filter_and_average(expected_observed_counts['NBarsNeutralRegions'])
  annotated_row['K_bar_mean_neutral'] = filter_and_average(expected_observed_counts['KBarsNeutralRegions'])
  return annotated_row

# https://stackoverflow.com/a/34365537/6674256
from tqdm.auto import tqdm  # for notebooks
# Registers `progress_apply` on pandas objects; it has to run before the call below.
tqdm.pandas()
logging.info('Computing z-scores for cpg islands...')
# One call to compute_neutral_zscores per island row, with a progress bar.
cpg_islands_with_zscores = cpg_islands.progress_apply(compute_neutral_zscores, axis=1)
logging.info('...finished computing z-scores for cpg islands...')

cpg_islands_with_zscores.head()

In [None]:
# Save the annotated islands as tab-separated text, without the frame's index,
# under the same grch38 cpg-islands directory the input was read from.
cpg_islands_with_zscores_path = f'{CONSTRAINT_TOOLS_DATA}/cpg-islands/grch38/cpg-islands-with-zscores.bed'

cpg_islands_with_zscores.to_csv(
  cpg_islands_with_zscores_path,
  sep='\t',
  index=False,
)

In [None]:
import matplotlib.pyplot as plt 
# Larger default font for both figures.
plt.rcParams['font.size'] = 20

# Each averaged neutral score plotted against the island's perCpg value.
shared_scatter_options = dict(x="perCpg", alpha=0.5)
cpg_islands_with_zscores.plot.scatter(y="N_bar_mean_neutral", **shared_scatter_options)
cpg_islands_with_zscores.plot.scatter(y="K_bar_mean_neutral", **shared_scatter_options)