In [1]:
import logging 

logging.basicConfig(
  format = '%(asctime)s %(message)s',
  level=logging.INFO,
  datefmt='%Y-%m-%d %H:%M:%S'
)

In [2]:
import pyranges as pr

CONSTRAINT_TOOLS_DATA = '/scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools'
cpg_islands_path = f'{CONSTRAINT_TOOLS_DATA}/cpg-islands/grch38/cpg-islands.sorted.bed.gz'

# https://biocore-ntnu.github.io/pyranges/loadingcreating-pyranges.html
cpg_islands = pr.read_bed(cpg_islands_path)

cpg_islands.columns = ['Chromosome', 'Start', 'End', 'Name', 'Length', 'cpgNum', 'gcNum', 'perCpg', 'perGc', 'obsExp']
cpg_islands

Unnamed: 0,Chromosome,Start,End,Name,Length,cpgNum,gcNum,perCpg,perGc,obsExp
0,chr1,28735,29737,CpG: 111,1002,111,731,22.2,73.0,0.85
1,chr1,135124,135563,CpG: 30,439,30,295,13.7,67.2,0.64
2,chr1,199251,200121,CpG: 104,870,104,643,23.9,73.9,0.89
3,chr1,368792,370063,CpG: 99,1271,99,777,15.6,61.1,0.84
4,chr1,381172,382185,CpG: 84,1013,84,734,16.6,72.5,0.64
...,...,...,...,...,...,...,...,...,...,...
27944,chrY,25464370,25464941,CpG: 51,571,51,403,17.9,70.6,0.72
27945,chrY,26409388,26409785,CpG: 32,397,32,252,16.1,63.5,0.82
27946,chrY,26627168,26627397,CpG: 25,229,25,172,21.8,75.1,0.78
27947,chrY,57067645,57068034,CpG: 36,389,36,257,18.5,66.1,0.85


In [3]:
import sys
sys.path.append('/scratch/ucgd/lustre-work/quinlan/u6018199/constraint-tools/utilities')

from read_model import read_model 

model = read_model('/scratch/ucgd/lustre-work/quinlan/u6018199/constraint-tools/dist/model-germline-grch38.json')

# https://biocore-ntnu.github.io/pyranges/loadingcreating-pyranges.html
neutral_regions = pr.read_bed(model['neutralRegions'])

neutral_regions

Unnamed: 0,Chromosome,Start,End
0,chr1,14653,14905
1,chr1,15354,15772
2,chr1,16125,16607
3,chr1,17055,17171
4,chr1,17436,17606
...,...,...,...
1649454,chr22,50794754,50796286
1649455,chr22,50796386,50796692
1649456,chr22,50796780,50796942
1649457,chr22,50797586,50797812


In [4]:
cpg_islands.cpg_island_id = cpg_islands.df.index.values

def compute_cpg_island_region(row):
  return f'{row["Chromosome"]}:{row["Start"]}-{row["End"]}'
cpg_islands.cpg_island_region = cpg_islands.df.apply(compute_cpg_island_region, axis=1)

cpg_islands

Unnamed: 0,Chromosome,Start,End,Name,Length,cpgNum,gcNum,perCpg,perGc,obsExp,cpg_island_id,cpg_island_region
0,chr1,28735,29737,CpG: 111,1002,111,731,22.2,73.0,0.85,0,chr1:28735-29737
1,chr1,135124,135563,CpG: 30,439,30,295,13.7,67.2,0.64,1,chr1:135124-135563
2,chr1,199251,200121,CpG: 104,870,104,643,23.9,73.9,0.89,2,chr1:199251-200121
3,chr1,368792,370063,CpG: 99,1271,99,777,15.6,61.1,0.84,3,chr1:368792-370063
4,chr1,381172,382185,CpG: 84,1013,84,734,16.6,72.5,0.64,4,chr1:381172-382185
...,...,...,...,...,...,...,...,...,...,...,...,...
27944,chrY,25464370,25464941,CpG: 51,571,51,403,17.9,70.6,0.72,27944,chrY:25464370-25464941
27945,chrY,26409388,26409785,CpG: 32,397,32,252,16.1,63.5,0.82,27945,chrY:26409388-26409785
27946,chrY,26627168,26627397,CpG: 25,229,25,172,21.8,75.1,0.78,27946,chrY:26627168-26627397
27947,chrY,57067645,57068034,CpG: 36,389,36,257,18.5,66.1,0.85,27947,chrY:57067645-57068034


In [5]:
# https://pyranges.readthedocs.io/en/latest/autoapi/pyranges/pyranges/index.html?highlight=intersect#pyranges.pyranges.PyRanges.overlap

cpg_islands_that_overlap_neutral_regions = cpg_islands.overlap(neutral_regions).df
cpg_islands_that_overlap_neutral_regions

Unnamed: 0,Chromosome,Start,End,Name,Length,cpgNum,gcNum,perCpg,perGc,obsExp,cpg_island_id,cpg_island_region
0,chr1,135124,135563,CpG: 30,439,30,295,13.7,67.2,0.64,1,chr1:135124-135563
1,chr1,778604,779167,CpG: 60,563,60,385,21.3,68.4,0.92,8,chr1:778604-779167
2,chr1,869818,870248,CpG: 50,430,50,316,23.3,73.5,0.87,12,chr1:869818-870248
3,chr1,904314,905239,CpG: 119,925,119,693,25.7,74.9,0.92,13,chr1:904314-905239
4,chr1,908919,910503,CpG: 154,1584,154,1106,19.4,69.8,0.82,14,chr1:908919-910503
...,...,...,...,...,...,...,...,...,...,...,...,...
21422,chr22,50697242,50697698,CpG: 56,456,56,327,24.6,71.7,0.96,26860,chr22:50697242-50697698
21423,chr22,50704374,50704880,CpG: 38,506,38,328,15.0,64.8,0.72,26861,chr22:50704374-50704880
21424,chr22,50710877,50711294,CpG: 41,417,41,267,19.7,64.0,1.01,26862,chr22:50710877-50711294
21425,chr22,50719958,50721632,CpG: 180,1674,180,1209,21.5,72.2,0.84,26863,chr22:50719958-50721632


In [6]:
# https://pyranges.readthedocs.io/en/latest/autoapi/pyranges/pyranges/index.html?highlight=intersect#pyranges.pyranges.PyRanges.intersect

neutral_regions_within_cpg_islands = cpg_islands.intersect(neutral_regions).df[['Start', 'End', 'cpg_island_id']]
neutral_regions_within_cpg_islands


Unnamed: 0,Start,End,cpg_island_id
0,135124,135141,1
1,779092,779167,8
2,869818,870086,12
3,870201,870248,12
4,904314,904478,13
...,...,...,...
35769,50710983,50711117,26862
35770,50711118,50711224,26862
35771,50711225,50711294,26862
35772,50719958,50720184,26863


In [7]:
import pandas as pd

cpg_islands_with_neutral_regions = pd.merge(
  cpg_islands_that_overlap_neutral_regions, 
  neutral_regions_within_cpg_islands, 
  on='cpg_island_id', 
  how='inner'
)
cpg_islands_with_neutral_regions = cpg_islands_with_neutral_regions.rename(columns={
  "Start_x": "Start_cpg_island", 
  "End_x": "End_cpg_island",
  "Start_y": "Start_neutral_part", 
  "End_y": "End_neutral_part",
})
cpg_islands_with_neutral_regions


Unnamed: 0,Chromosome,Start_cpg_island,End_cpg_island,Name,Length,cpgNum,gcNum,perCpg,perGc,obsExp,cpg_island_id,cpg_island_region,Start_neutral_part,End_neutral_part
0,chr1,135124,135563,CpG: 30,439,30,295,13.7,67.2,0.64,1,chr1:135124-135563,135124,135141
1,chr1,778604,779167,CpG: 60,563,60,385,21.3,68.4,0.92,8,chr1:778604-779167,779092,779167
2,chr1,869818,870248,CpG: 50,430,50,316,23.3,73.5,0.87,12,chr1:869818-870248,869818,870086
3,chr1,869818,870248,CpG: 50,430,50,316,23.3,73.5,0.87,12,chr1:869818-870248,870201,870248
4,chr1,904314,905239,CpG: 119,925,119,693,25.7,74.9,0.92,13,chr1:904314-905239,904314,904478
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35769,chr22,50710877,50711294,CpG: 41,417,41,267,19.7,64.0,1.01,26862,chr22:50710877-50711294,50710983,50711117
35770,chr22,50710877,50711294,CpG: 41,417,41,267,19.7,64.0,1.01,26862,chr22:50710877-50711294,50711118,50711224
35771,chr22,50710877,50711294,CpG: 41,417,41,267,19.7,64.0,1.01,26862,chr22:50710877-50711294,50711225,50711294
35772,chr22,50719958,50721632,CpG: 180,1674,180,1209,21.5,72.2,0.84,26863,chr22:50719958-50721632,50719958,50720184


In [8]:
def compute_neutral_part_length(row):
  return row['End_neutral_part'] - row['Start_neutral_part']

cpg_islands_with_neutral_regions['neutral_part_length'] = cpg_islands_with_neutral_regions.apply(compute_neutral_part_length, axis=1)
cpg_islands_with_neutral_regions

Unnamed: 0,Chromosome,Start_cpg_island,End_cpg_island,Name,Length,cpgNum,gcNum,perCpg,perGc,obsExp,cpg_island_id,cpg_island_region,Start_neutral_part,End_neutral_part,neutral_part_length
0,chr1,135124,135563,CpG: 30,439,30,295,13.7,67.2,0.64,1,chr1:135124-135563,135124,135141,17
1,chr1,778604,779167,CpG: 60,563,60,385,21.3,68.4,0.92,8,chr1:778604-779167,779092,779167,75
2,chr1,869818,870248,CpG: 50,430,50,316,23.3,73.5,0.87,12,chr1:869818-870248,869818,870086,268
3,chr1,869818,870248,CpG: 50,430,50,316,23.3,73.5,0.87,12,chr1:869818-870248,870201,870248,47
4,chr1,904314,905239,CpG: 119,925,119,693,25.7,74.9,0.92,13,chr1:904314-905239,904314,904478,164
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35769,chr22,50710877,50711294,CpG: 41,417,41,267,19.7,64.0,1.01,26862,chr22:50710877-50711294,50710983,50711117,134
35770,chr22,50710877,50711294,CpG: 41,417,41,267,19.7,64.0,1.01,26862,chr22:50710877-50711294,50711118,50711224,106
35771,chr22,50710877,50711294,CpG: 41,417,41,267,19.7,64.0,1.01,26862,chr22:50710877-50711294,50711225,50711294,69
35772,chr22,50719958,50721632,CpG: 180,1674,180,1209,21.5,72.2,0.84,26863,chr22:50719958-50721632,50719958,50720184,226


In [9]:
cpg_islands_with_neutral_regions = cpg_islands_with_neutral_regions[['cpg_island_id', 'neutral_part_length']]
cpg_islands_with_neutral_regions 


Unnamed: 0,cpg_island_id,neutral_part_length
0,1,17
1,8,75
2,12,268
3,12,47
4,13,164
...,...,...
35769,26862,134
35770,26862,106
35771,26862,69
35772,26863,226


In [10]:
grouped_neutral_regions = cpg_islands_with_neutral_regions.groupby('cpg_island_id')

number_neutral_bases = grouped_neutral_regions['neutral_part_length'].sum().rename("number_neutral_bases")
number_neutral_bases

cpg_island_id
1         17
8         75
12       315
13       422
14       547
        ... 
26860    204
26861    382
26862    414
26863    226
26864    122
Name: number_neutral_bases, Length: 21427, dtype: int64

In [11]:
number_neutral_parts = grouped_neutral_regions['neutral_part_length'].count().rename('number_neutral_parts')
number_neutral_parts

cpg_island_id
1        1
8        1
12       2
13       3
14       2
        ..
26860    1
26861    2
26862    4
26863    1
26864    1
Name: number_neutral_parts, Length: 21427, dtype: int64

In [12]:
cpg_islands = ( 
  cpg_islands.df
  .merge(number_neutral_bases, on='cpg_island_id', how='inner')
  .merge(number_neutral_parts, on='cpg_island_id', how='inner')
)

cpg_islands

Unnamed: 0,Chromosome,Start,End,Name,Length,cpgNum,gcNum,perCpg,perGc,obsExp,cpg_island_id,cpg_island_region,number_neutral_bases,number_neutral_parts
0,chr1,135124,135563,CpG: 30,439,30,295,13.7,67.2,0.64,1,chr1:135124-135563,17,1
1,chr1,778604,779167,CpG: 60,563,60,385,21.3,68.4,0.92,8,chr1:778604-779167,75,1
2,chr1,869818,870248,CpG: 50,430,50,316,23.3,73.5,0.87,12,chr1:869818-870248,315,2
3,chr1,904314,905239,CpG: 119,925,119,693,25.7,74.9,0.92,13,chr1:904314-905239,422,3
4,chr1,908919,910503,CpG: 154,1584,154,1106,19.4,69.8,0.82,14,chr1:908919-910503,547,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21422,chr22,50697242,50697698,CpG: 56,456,56,327,24.6,71.7,0.96,26860,chr22:50697242-50697698,204,1
21423,chr22,50704374,50704880,CpG: 38,506,38,328,15.0,64.8,0.72,26861,chr22:50704374-50704880,382,2
21424,chr22,50710877,50711294,CpG: 41,417,41,267,19.7,64.0,1.01,26862,chr22:50710877-50711294,414,4
21425,chr22,50719958,50721632,CpG: 180,1674,180,1209,21.5,72.2,0.84,26863,chr22:50719958-50721632,226,1


In [13]:
# https://papermill.readthedocs.io/en/latest/usage-parameterize.html
number_neutral_bases_lower = None
number_neutral_bases_upper = None

In [14]:
# Parameters
number_neutral_bases_lower = 150
number_neutral_bases_upper = 200


In [15]:
# https://pandas.pydata.org/pandas-docs/stable/getting_started/intro_tutorials/03_subset_data.html#how-do-i-filter-specific-rows-from-a-dataframe
cpg_islands = cpg_islands[
    (cpg_islands.number_neutral_bases > number_neutral_bases_lower) & 
    (cpg_islands.number_neutral_bases < number_neutral_bases_upper) & 
    (cpg_islands.number_neutral_parts < 5)
]
cpg_islands

Unnamed: 0,Chromosome,Start,End,Name,Length,cpgNum,gcNum,perCpg,perGc,obsExp,cpg_island_id,cpg_island_region,number_neutral_bases,number_neutral_parts
10,chr1,950976,951222,CpG: 18,246,18,163,14.6,66.3,0.67,21,chr1:950976-951222,151,1
15,chr1,1013290,1013514,CpG: 19,224,19,138,17.0,61.6,0.89,27,chr1:1013290-1013514,164,1
22,chr1,1046772,1047131,CpG: 26,359,26,248,14.5,69.1,0.61,34,chr1:1046772-1047131,174,1
45,chr1,1242505,1242827,CpG: 26,322,26,228,16.1,70.8,0.66,69,chr1:1242505-1242827,181,1
55,chr1,1336045,1336694,CpG: 51,649,51,467,15.7,72.0,0.61,85,chr1:1336045-1336694,179,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21386,chr22,50197876,50198568,CpG: 59,692,59,487,17.1,70.4,0.69,26810,chr22:50197876-50198568,159,1
21391,chr22,50266977,50268124,CpG: 131,1147,131,834,22.8,72.7,0.87,26820,chr22:50266977-50268124,175,1
21395,chr22,50287040,50287389,CpG: 29,349,29,248,16.6,71.1,0.66,26825,chr22:50287040-50287389,196,2
21396,chr22,50287679,50288091,CpG: 35,412,35,293,17.0,71.1,0.67,26826,chr22:50287679-50288091,198,2


In [16]:
#papermill_description=COMPUTE_ZSCORES

sys.path.append('/scratch/ucgd/lustre-work/quinlan/u6018199/constraint-tools/predict-constraint/germline-model')

import numpy as np 

from expected_observed_counts import compute_expected_observed_counts
from pack_unpack import pack

window_stride = 10 

def filter_and_average(xs): 
  if not xs: return None 
  else: return np.mean([x for x in xs if x])

def compute_neutral_zscores(row): 
  region = pack(row.Chromosome, row.Start, row.End)  
  expected_observed_counts = compute_expected_observed_counts(region, model, window_stride, log=False) 
  # https://www.dataquest.io/blog/settingwithcopywarning/
  with pd.option_context('mode.chained_assignment', None):
    row['N_bar_mean_neutral'] = filter_and_average(expected_observed_counts['NBarsNeutralRegions'])
    row['N_observeds_mean_neutral'] = filter_and_average(expected_observed_counts['NObservedsNeutralRegions'])
    row['K_bar_mean_neutral'] = filter_and_average(expected_observed_counts['KBarsNeutralRegions'])
    row['K_observeds_mean_neutral'] = filter_and_average(expected_observed_counts['KObservedsNeutralRegions'])
  return row 

# https://stackoverflow.com/a/34365537/6674256
from tqdm.auto import tqdm  # for notebooks
tqdm.pandas()
logging.info('Computing z-scores for cpg islands...')
cpg_islands_with_zscores = cpg_islands.progress_apply(compute_neutral_zscores, axis=1)
logging.info('...finished computing z-scores for cpg islands...')

cpg_islands_with_zscores.head()

2022-06-01 16:28:50 Computing z-scores for cpg islands...


  0%|          | 0/1555 [00:00<?, ?it/s]

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
2022-06-01 17:42:00 ...finished computing z-scores for cpg islands...


Unnamed: 0,Chromosome,Start,End,Name,Length,cpgNum,gcNum,perCpg,perGc,obsExp,cpg_island_id,cpg_island_region,number_neutral_bases,number_neutral_parts,N_bar_mean_neutral,N_observeds_mean_neutral,K_bar_mean_neutral,K_observeds_mean_neutral
10,chr1,950976,951222,CpG: 18,246,18,163,14.6,66.3,0.67,21,chr1:950976-951222,151,1,0.621543,31.5,0.238332,16.333333
15,chr1,1013290,1013514,CpG: 19,224,19,138,17.0,61.6,0.89,27,chr1:1013290-1013514,164,1,0.637661,30.571429,-1.060627,12.0
22,chr1,1046772,1047131,CpG: 26,359,26,248,14.5,69.1,0.61,34,chr1:1046772-1047131,174,1,1.178759,36.0,-1.693344,11.571429
45,chr1,1242505,1242827,CpG: 26,322,26,228,16.1,70.8,0.66,69,chr1:1242505-1242827,181,1,-0.709334,28.75,-1.185715,11.0
55,chr1,1336045,1336694,CpG: 51,649,51,467,15.7,72.0,0.61,85,chr1:1336045-1336694,179,1,2.863783,46.0,0.755085,24.25


In [17]:
cpg_islands_with_zscores_path = (
    f'{CONSTRAINT_TOOLS_DATA}/cpg-islands/grch38/'
    f'cpg-islands-with-zscores.{number_neutral_bases_lower}-{number_neutral_bases_upper}.bed'
)

cpg_islands_with_zscores.to_csv(cpg_islands_with_zscores_path, index=False, sep='\t')