In [None]:
from IPython.display import display, HTML
# Let notebook cells span the full width of the browser window.
display(
    HTML(data="<style>.container { width:100% !important; }</style>")
)

In [None]:
import logging 

# Emit timestamped log lines at INFO level and above.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

In [None]:
import pandas as pd

# Root of the constraint-tools checkout; the promoter table is read from beneath it.
CONSTRAINT_TOOLS = '/scratch/ucgd/lustre-work/quinlan/u6018199/constraint-tools'
promoters_filename = CONSTRAINT_TOOLS + '/download-process-data/promoters/promoters.grch38.test.csv'

# Comma-separated table of promoters (one row per promoter).
promoters = pd.read_csv(promoters_filename, delimiter=',')
promoters

In [None]:
import sys
# Make the repository's `utilities` directory importable. The location is derived from
# CONSTRAINT_TOOLS (set in the cell that loads the promoters) rather than repeating the
# absolute path, and the entry is only added once so that re-running this cell does not
# keep growing sys.path.
utilities_directory = f'{CONSTRAINT_TOOLS}/utilities'
if utilities_directory not in sys.path:
    sys.path.append(utilities_directory)

from pack_unpack import unpack
import pyranges as pr

def add_chromosome_start_end(row):
    """Add 'Chromosome', 'Start' and 'End' entries to a promoter row.

    The three values are obtained by passing the row's 'region_grch38' entry
    to `unpack`. The row is modified and returned, which makes the function
    usable with `DataFrame.apply(..., axis=1)`.
    """
    chromosome, start, end = unpack(row['region_grch38'])
    row['Chromosome'] = chromosome
    row['Start'] = start
    row['End'] = end
    return row

# Expand each region string into Chromosome/Start/End columns, then wrap the table
# in a PyRanges object so that interval operations can be applied to it.
promoters = pr.PyRanges(
    promoters.apply(add_chromosome_start_end, axis='columns')
)
promoters

In [None]:

# neutral regions that overlap train AND test promoters:
neutral_regions = pr.read_bed(
    '/scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools'
    '/work-train-germline-model/neutral-regions.filtered.bed'
)

neutral_regions

In [None]:
# Attach a `promoter_id` column to the PyRanges object, taken from the row index of its
# DataFrame view. Later cells merge and group on this column to tie neutral parts back
# to the promoter they came from.
# NOTE(review): assumes the index of `promoters.df` is unique per promoter and is in the
# same row order in which PyRanges stores the column — confirm against the pyranges docs.
promoters.promoter_id = promoters.df.index.values

promoters

In [None]:
# https://pyranges.readthedocs.io/en/latest/autoapi/pyranges/pyranges/index.html?highlight=intersect#pyranges.pyranges.PyRanges.overlap

# Promoters that overlap the neutral regions, converted to a plain DataFrame.
promoters_that_overlap_neutral_regions = (
    promoters
    .overlap(neutral_regions)
    .df
)
promoters_that_overlap_neutral_regions

In [None]:
# https://pyranges.readthedocs.io/en/latest/autoapi/pyranges/pyranges/index.html?highlight=intersect#pyranges.pyranges.PyRanges.intersect

# Intersect promoters with neutral regions and keep only the coordinates of each
# resulting piece together with the id of the promoter it belongs to.
neutral_regions_within_promoters = (
    promoters
    .intersect(neutral_regions)
    .df
    .loc[:, ['Start', 'End', 'promoter_id']]
)
neutral_regions_within_promoters


In [None]:
# Pair every overlapping promoter with each of its neutral parts (one output row per
# promoter / neutral-part pair), then replace pandas' default `_x` / `_y` merge suffixes
# with names that say which interval the coordinates describe.
promoters_with_neutral_regions = (
    promoters_that_overlap_neutral_regions
    .merge(neutral_regions_within_promoters, how='inner', on='promoter_id')
    .rename(columns={
        "Start_x": "Start_promoter",
        "End_x": "End_promoter",
        "Start_y": "Start_neutral_part",
        "End_y": "End_neutral_part",
    })
)
promoters_with_neutral_regions


In [None]:
def compute_neutral_part_length(row):
    """Return 'End_neutral_part' minus 'Start_neutral_part' for `row`.

    `row` is anything indexable by those two labels (e.g. a DataFrame row).
    """
    end = row['End_neutral_part']
    start = row['Start_neutral_part']
    return end - start

# Compute every length in a single vectorized call instead of a Python-level loop over
# rows: `compute_neutral_part_length` only indexes its argument by column label, so it
# can be handed the whole frame and returns one Series of lengths. Unlike
# `apply(..., axis=1)`, this also yields a well-formed (empty) column for an empty frame.
promoters_with_neutral_regions['neutral_part_length'] = compute_neutral_part_length(
    promoters_with_neutral_regions
)
promoters_with_neutral_regions

In [None]:
# Keep only the promoter id and the length of each neutral part.
promoters_with_neutral_regions = promoters_with_neutral_regions.loc[
    :, ['promoter_id', 'neutral_part_length']
]
promoters_with_neutral_regions


In [None]:
# One group of neutral parts per promoter.
grouped_neutral_regions = promoters_with_neutral_regions.groupby('promoter_id')

# Total length of the neutral parts of each promoter.
number_neutral_bases = (
    grouped_neutral_regions['neutral_part_length']
    .sum()
    .rename("number_neutral_bases")
)
number_neutral_bases

In [None]:
# How many neutral parts each promoter has.
number_neutral_parts = (
    grouped_neutral_regions['neutral_part_length']
    .count()
    .rename('number_neutral_parts')
)
number_neutral_parts

In [None]:
# Convert the promoters back to a DataFrame and attach the two per-promoter summaries.
# Both joins are inner joins on `promoter_id`, so only promoters that have an entry in
# both summaries remain.
promoters = pd.merge(
    pd.merge(promoters.df, number_neutral_bases, on='promoter_id', how='inner'),
    number_neutral_parts,
    on='promoter_id',
    how='inner',
)

promoters

In [None]:
# Histogram of `number_neutral_bases` across promoters. The axes are labelled so the
# figure can be read on its own, and the trailing `;` keeps the cell from also printing
# the repr of the returned value.
promoters['number_neutral_bases'].hist().set(
    xlabel='number_neutral_bases',
    ylabel='number of promoters',
);

In [None]:
# Notebook parameters, intended to be supplied at execution time via papermill:
# https://papermill.readthedocs.io/en/latest/usage-parameterize.html
# `window_size` selects the model file that is loaded and appears in the output file name;
# `window_stride` is passed to `compute_expected_observed_counts` and also appears in the
# output file name. The `None` values below are placeholders only.
window_size = None
window_stride = None 

In [None]:
#papermill_description=COMPUTE_ZSCORES

import numpy as np 

# Make the germline-model prediction code importable. The location is derived from
# CONSTRAINT_TOOLS (set in the cell that loads the promoters) rather than repeating the
# absolute path, and the entry is only added once so that re-running this cell does not
# keep growing sys.path.
germline_model_directory = f'{CONSTRAINT_TOOLS}/predict-constraint/germline-model'
if germline_model_directory not in sys.path:
    sys.path.append(germline_model_directory)

from expected_observed_counts import (
    compute_expected_observed_counts,
    filter_by_regions
)
from pack_unpack import pack
from read_model import read_model 

def filter_and_average(xs):
    """Return the mean of the entries of `xs` that are not None.

    Parameters
    ----------
    xs : sequence (list, tuple, numpy array, ...) or None
        Values to average; individual entries may be None.

    Returns
    -------
    The mean (as computed by `np.mean`) of the non-None entries, or None when
    `xs` is None, is empty, or contains nothing but None.
    """
    if xs is None:
        return None
    # Only None marks a missing value. Testing truthiness instead would also discard
    # legitimate zeros (e.g. a count of 0) and bias the mean upwards.
    values = [x for x in xs if x is not None]
    if len(values) == 0:
        # Avoid np.mean([]), which yields nan and emits a RuntimeWarning.
        return None
    return np.mean(values)

# Load the model file for the requested window size. The path is built from
# CONSTRAINT_TOOLS (set in the cell that loads the promoters) instead of repeating the
# absolute repository path.
model = read_model(
    f'{CONSTRAINT_TOOLS}/dist/'
    f'model-germline-grch38-exclude-test-promoters.windowSize-{window_size}.json'
)

def compute_neutral_zscores(row):
    """Add four `*_mean_neutral` entries to a promoter row and return the row.

    The row's 'Chromosome', 'Start' and 'End' are packed into a region, expected and
    observed counts are computed for that region with the notebook-level `model` and
    `window_stride`, and the per-window values are filtered against the notebook-level
    `neutral_regions` (how='containment'). Each filtered list is then reduced with
    `filter_and_average` and stored on the row as:

      - 'N_bar_mean_neutral'
      - 'N_observeds_mean_neutral'
      - 'K_bar_mean_neutral'
      - 'K_observeds_mean_neutral'

    The filtered window positions are not used.
    """
    counts = compute_expected_observed_counts(
        pack(row['Chromosome'], row['Start'], row['End']),
        model,
        window_stride,
        log=False,
    )
    _window_positions, N_bars, N_observeds, K_bars, K_observeds = filter_by_regions(
        counts['windows'],
        counts['NBars'],
        counts['NObserveds'],
        counts['KBars'],
        counts['KObserveds'],
        regions=neutral_regions,
        how='containment',
    )
    neutral_values_by_column = [
        ('N_bar_mean_neutral', N_bars),
        ('N_observeds_mean_neutral', N_observeds),
        ('K_bar_mean_neutral', K_bars),
        ('K_observeds_mean_neutral', K_observeds),
    ]
    # Silence pandas' chained-assignment warning while writing into the row:
    # https://www.dataquest.io/blog/settingwithcopywarning/
    with pd.option_context('mode.chained_assignment', None):
        for column, values in neutral_values_by_column:
            row[column] = filter_and_average(values)

    return row

# Run `compute_neutral_zscores` over every promoter row with a progress bar.
# `tqdm.pandas()` is what adds the `progress_apply` method used below:
# https://stackoverflow.com/a/34365537/6674256
from tqdm.auto import tqdm  # for notebooks
tqdm.pandas()
# Log before and after the row-wise computation so its start and end are timestamped.
logging.info('Computing z-scores for promoters...')
promoters_with_zscores = promoters.progress_apply(compute_neutral_zscores, axis=1)
logging.info('...finished computing z-scores for promoters...')

promoters_with_zscores

In [None]:
# Give the CpG-density column a name that states which region it refers to.
# NOTE(review): presumably the density was computed over `region_grch38` — confirm.
promoters_with_zscores = promoters_with_zscores.rename(
    {'cpg_density': 'cpg_density_region_grch38'},
    axis='columns',
)
promoters_with_zscores

In [None]:
# Shared data directory that receives the result.
CONSTRAINT_TOOLS_DATA = '/scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools'

# The output file name records the window size and stride used for this run.
promoters_with_zscores_path = '/'.join([
    CONSTRAINT_TOOLS_DATA,
    'promoters-windowSize',
    'grch38',
    f'promoters-with-zscores.windowSize-{window_size}.windowStride-{window_stride}.bed',
])

# Tab-separated, with a header row and without the DataFrame index.
promoters_with_zscores.to_csv(promoters_with_zscores_path, sep='\t', index=False)