## Setup 

In [10]:
import importlib
from pathlib import Path 
import sys 
import polars as pl 

# --- Configuration -----------------------------------------------------------
# Everything a reader might need to change lives in this cell.

# Mode of aligned_bam_to_cpg_scores; it is embedded in the directory names below.
PB_CPG_TOOL_MODE = 'model'

# Directory holding the "all CpGs" outputs (the expanded BED is written here).
OUTPUT_DIR = f"/scratch/ucgd/lustre-labs/quinlan/data-shared/dna-methylation/CEPH1463.GRCh38.hifi.{PB_CPG_TOOL_MODE}.founder-phased.all-cpgs"

# BED file of all CpG sites; output of src/write_all_cpgs.py.
BED_ALL_CPGS = f"{OUTPUT_DIR}/all_cpg_sites.bed"

# Output dir of phase_meth_to_founder_haps.py.
METH_FOUNDER_PHASED_DIR = f"/scratch/ucgd/lustre-labs/quinlan/data-shared/dna-methylation/CEPH1463.GRCh38.hifi.{PB_CPG_TOOL_MODE}.founder-phased"

# Identifier embedded in the per-sample input/output file names below.
UID = 200081

# BED file of hap-map blocks from src/phase_meth_to_founder_haps.py.
BED_HAP_MAP = f"{METH_FOUNDER_PHASED_DIR}/{UID}.hap-map-blocks.bed"

# BED file of founder-phased methylation levels from src/phase_meth_to_founder_haps.py.
BED_METH_FOUNDER_PHASED = f"{METH_FOUNDER_PHASED_DIR}/{UID}.dna-methylation.founder-phased.bed"

# Destination of the methylation table after expansion to all CpG sites.
BED_METH_FOUNDER_PHASED_ALL_CPGS = f"{OUTPUT_DIR}/{UID}.dna-methylation.founder-phased.all_cpgs.bed"

REPO_DIR = Path('/scratch/ucgd/lustre-labs/quinlan/u6018199/tapestry')

# Make the repo's utility modules importable. The membership check keeps the
# cell idempotent: re-running it no longer stacks duplicate sys.path entries.
UTIL_DIR = f"{REPO_DIR}/src/util"
if UTIL_DIR not in sys.path:
    sys.path.append(UTIL_DIR)

## Get all CpG sites

In [11]:
# Import the helper module, then reload it so that edits made to it since the
# kernel first imported it are picked up without a restart.
import expand_to_all_cpgs
expand_to_all_cpgs = importlib.reload(expand_to_all_cpgs)
read_all_cpgs = expand_to_all_cpgs.read_all_cpgs

# Load the BED file of all CpG sites (output of src/write_all_cpgs.py).
DF_ALL_CPGS = read_all_cpgs(BED_ALL_CPGS)

# Bare last expression: display the table.
DF_ALL_CPGS

chrom,start,end
str,i64,i64
"""chr1""",10470,10471
"""chr1""",10483,10484
"""chr1""",10488,10489
"""chr1""",10492,10493
"""chr1""",10496,10497
…,…,…
"""chrM""",16448,16449
"""chrM""",16453,16454
"""chrM""",16494,16495
"""chrM""",16541,16542


## Get hap-map blocks

In [12]:
# Reload the helper module so the freshest version of the reader is used.
expand_to_all_cpgs = importlib.reload(expand_to_all_cpgs)
read_hap_map_blocks = expand_to_all_cpgs.read_hap_map_blocks

# Load the hap-map blocks BED produced by src/phase_meth_to_founder_haps.py.
DF_HAP_MAP_BLOCKS = read_hap_map_blocks(BED_HAP_MAP)

# Bare last expression: display the table.
DF_HAP_MAP_BLOCKS

chrom,start,end
str,i64,i64
"""chr1""",492339,532812
"""chr1""",593123,1314109
"""chr1""",1351126,1382658
"""chr1""",1421668,1427528
"""chr1""",1432961,1610923
…,…,…
"""chrX""",154941725,154993498
"""chrX""",155280990,155292580
"""chrX""",155370893,155485214
"""chrX""",155516010,155585416


## Assign each CpG site to its overlapping hap-map block, if one exists

In [13]:
# Reload the helper module so the freshest version of the function is used.
expand_to_all_cpgs = importlib.reload(expand_to_all_cpgs)
assign_hap_map_blocks_to_cpgs = expand_to_all_cpgs.assign_hap_map_blocks_to_cpgs

# Annotate every CpG site with the start/end of its hap-map block; as the
# displayed result shows, sites without a block get nulls in those columns.
DF_ALL_CPGS_WITH_HAP_MAP_BLOCKS = assign_hap_map_blocks_to_cpgs(
    DF_ALL_CPGS,
    DF_HAP_MAP_BLOCKS,
)

# Bare last expression: display the table.
DF_ALL_CPGS_WITH_HAP_MAP_BLOCKS

chrom,start,end,start_hap_map_block,end_hap_map_block
str,i64,i64,i64,i64
"""chr1""",10470,10471,,
"""chr1""",10483,10484,,
"""chr1""",10488,10489,,
"""chr1""",10492,10493,,
"""chr1""",10496,10497,,
…,…,…,…,…
"""chrM""",16448,16449,,
"""chrM""",16453,16454,,
"""chrM""",16494,16495,,
"""chrM""",16541,16542,,


## Read in founder-phased DNA methylation at CpG sites

In [14]:
# Reload the helper module so the freshest version of the reader is used.
expand_to_all_cpgs = importlib.reload(expand_to_all_cpgs)
read_meth_founder_phased = expand_to_all_cpgs.read_meth_founder_phased

# Load the founder-phased methylation levels BED produced by
# src/phase_meth_to_founder_haps.py.
DF_METH_FOUNDER_PHASED = read_meth_founder_phased(BED_METH_FOUNDER_PHASED)

# Bare last expression: display the table.
DF_METH_FOUNDER_PHASED

chrom,start,end,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,methylation_level_pat,methylation_level_mat,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,is_within_50bp_of_mismatch_site
str,i64,i64,i64,i64,f64,i64,f64,f64,i64,i64,str,str,bool
"""chr1""",14520,14521,,,,,,,,,,,false
"""chr1""",14552,14553,,,,,,,,,,,false
"""chr1""",14588,14589,,,,,,,,,,,false
"""chr1""",14652,14653,,,,,,,,,,,false
"""chr1""",14672,14673,,,,,,,,,,,false
…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chrY""",56878251,56878252,,,,,,,,,,,
"""chrY""",56878350,56878351,,,,,,,,,,,
"""chrY""",56878385,56878386,,,,,,,,,,,
"""chrY""",56878431,56878432,,,,,,,,,,,


## Expand the dataframe of methylation levels to include all CpG sites and save to disk

In [15]:
# Reload the helper module so the freshest version of the function is used.
expand_to_all_cpgs = importlib.reload(expand_to_all_cpgs)
expand_meth_to_all_cpgs = expand_to_all_cpgs.expand_meth_to_all_cpgs

# Expand the phased methylation table so that it has one row per CpG site.
# NOTE(review): the output BED path is passed here, and the same path is also
# written explicitly via write_bed_and_header in the following cell. Whether
# this call itself writes the file is not visible from the notebook - confirm,
# and drop one of the two writes if it is redundant.
DF_METH_FOUNDER_PHASED_ALL_CPGS = expand_meth_to_all_cpgs(
    DF_ALL_CPGS_WITH_HAP_MAP_BLOCKS,
    DF_METH_FOUNDER_PHASED,
    BED_METH_FOUNDER_PHASED_ALL_CPGS,
)

# Bare last expression: display the table.
DF_METH_FOUNDER_PHASED_ALL_CPGS

chrom,start,end,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,methylation_level_pat,methylation_level_mat,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,is_within_50bp_of_mismatch_site
str,i64,i64,i64,i64,f64,i64,f64,f64,i64,i64,str,str,bool
"""chr1""",10470,10471,,,,,,,,,,,
"""chr1""",10483,10484,,,,,,,,,,,
"""chr1""",10488,10489,,,,,,,,,,,
"""chr1""",10492,10493,,,,,,,,,,,
"""chr1""",10496,10497,,,,,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chrM""",16448,16449,,,,,,,,,,,
"""chrM""",16453,16454,,,,,,,,,,,
"""chrM""",16494,16495,,,,,,,,,,,
"""chrM""",16541,16542,,,,,,,,,,,


In [16]:
# Reload the helper module so the freshest version of the writer is used.
expand_to_all_cpgs = importlib.reload(expand_to_all_cpgs)
write_bed_and_header = expand_to_all_cpgs.write_bed_and_header

# Persist the expanded table to its BED destination, then report the path.
write_bed_and_header(
    BED_METH_FOUNDER_PHASED_ALL_CPGS,
    DF_METH_FOUNDER_PHASED_ALL_CPGS,
)
print(f"Wrote expanded methylation dataframe to {BED_METH_FOUNDER_PHASED_ALL_CPGS}")

Wrote expanded methylation dataframe to /scratch/ucgd/lustre-labs/quinlan/data-shared/dna-methylation/CEPH1463.GRCh38.hifi.model.founder-phased.all-cpgs/200081.dna-methylation.founder-phased.all_cpgs.bed


## Some CpG sites are in hap-map blocks but do not have methylation levels 

In [17]:
# Rows that were assigned a hap-map block ...
in_hap_map_block = pl.col('start_hap_map_block').is_not_null()
# ... but carry no data from the founder-phased table; a null haplotype
# concordance is used as the marker for that.
lacks_phased_record = pl.col('haplotype_concordance_in_hap_map_block').is_null()

# Bare last expression: display the matching CpG sites.
DF_METH_FOUNDER_PHASED_ALL_CPGS.filter(in_hap_map_block & lacks_phased_record)

chrom,start,end,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,methylation_level_pat,methylation_level_mat,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,is_within_50bp_of_mismatch_site
str,i64,i64,i64,i64,f64,i64,f64,f64,i64,i64,str,str,bool
"""chr1""",492370,492371,492339,532812,,,,,,,,,
"""chr1""",492391,492392,492339,532812,,,,,,,,,
"""chr1""",492405,492406,492339,532812,,,,,,,,,
"""chr1""",492407,492408,492339,532812,,,,,,,,,
"""chr1""",492419,492420,492339,532812,,,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chrX""",155651255,155651256,155616130,155659104,,,,,,,,,
"""chrX""",155651858,155651859,155616130,155659104,,,,,,,,,
"""chrX""",155655742,155655743,155616130,155659104,,,,,,,,,
"""chrX""",155657645,155657646,155616130,155659104,,,,,,,,,


In [18]:
# Zoom in on a small window inside one hap-map block to show CpG sites with
# and without methylation levels side by side.
# Genomic coordinates are only meaningful together with a chromosome, so the
# filter pins 'chrom' explicitly; without it, a block starting at the same
# position on another chromosome could leak rows into this view.
DF_METH_FOUNDER_PHASED_ALL_CPGS.filter(
    (pl.col('chrom') == 'chr1') &
    (pl.col('start_hap_map_block') == 3399126) &
    (pl.col('start') > 3665300) &
    (pl.col('end') < 3665600)
)

chrom,start,end,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,methylation_level_pat,methylation_level_mat,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,is_within_50bp_of_mismatch_site
str,i64,i64,i64,i64,f64,i64,f64,f64,i64,i64,str,str,bool
"""chr1""",3665407,3665408,3399126,4207029,1.0,1114.0,0.783,0.951,21.0,10.0,"""B""","""I""",False
"""chr1""",3665514,3665515,3399126,4207029,,,,,,,,,
"""chr1""",3665526,3665527,3399126,4207029,,,,,,,,,
"""chr1""",3665561,3665562,3399126,4207029,1.0,1114.0,0.53,0.478,20.0,10.0,"""B""","""I""",False
