## Setup 

In [1]:
import importlib
from pathlib import Path 
import sys 
import polars as pl

# --- Run configuration --------------------------------------------------------
# Everything a reader might want to tune lives in this cell.
PB_CPG_TOOL_MODE = 'model'  # mode of aligned_bam_to_cpg_scores
UID = 200081  # sample ID; used to build the per-sample file names below

# Every data set used here lives under one shared directory and shares one
# file-name prefix, so both are spelled out once and reused below.
DATA_SHARED_DIR = "/scratch/ucgd/lustre-labs/quinlan/data-shared/dna-methylation"
DATASET_PREFIX = f"{DATA_SHARED_DIR}/CEPH1463.GRCh38.hifi.{PB_CPG_TOOL_MODE}"

# --- Inputs -------------------------------------------------------------------
# NOTE: the str-vs-Path type of each constant is kept as it was, because the
# helper functions that receive them are not visible from this notebook.
METH_FOUNDER_PHASED_DIR = f"{DATASET_PREFIX}.founder-phased"  # output dir of phase_meth_to_founder_haps.py
METH_READ_PHASED_DIR = Path(f"{DATASET_PREFIX}.read-backed-phased")  # output dir of aligned_bam_to_cpg_scores
# bed file of founder-phased methylation levels from src/phase_meth_to_founder_haps.py
BED_METH_FOUNDER_PHASED = f"{METH_FOUNDER_PHASED_DIR}/{UID}.dna-methylation.founder-phased.bed"
# bed file from aligned_bam_to_cpg_scores (pooling all reads, irrespective of haplotype)
BED_METH_UNPHASED = METH_READ_PHASED_DIR / f"{UID}.GRCh38.haplotagged.combined.bed.gz"

# --- Outputs ------------------------------------------------------------------
OUTPUT_DIR = f"{METH_FOUNDER_PHASED_DIR}.all-cpgs"
BED_ALL_CPGS = f"{OUTPUT_DIR}/all_cpg_sites.bed"  # output of src/write_all_cpgs.py
BED_METH_FOUNDER_PHASED_ALL_CPGS = f"{OUTPUT_DIR}/{UID}.dna-methylation.founder-phased.all_cpgs.bed"

# --- Local helper modules -----------------------------------------------------
REPO_DIR = Path('/scratch/ucgd/lustre-labs/quinlan/u6018199/tapestry')
UTIL_DIR = f"{REPO_DIR}/src/util"
# Guarded so that re-running this cell does not pile up duplicate sys.path entries.
if UTIL_DIR not in sys.path:
    sys.path.append(UTIL_DIR)

## Get all CpG sites

In [2]:
# Import the helper module (presumably found via the src/util directory that the
# setup cell appends to sys.path), then reload it so that edits made to the
# module since the kernel started are picked up without a restart.
import expand_to_all_cpgs
importlib.reload(expand_to_all_cpgs)
# Re-import the name AFTER the reload so it is bound to the reloaded function.
from expand_to_all_cpgs import read_all_cpgs

# Load the CpG-site BED file into a dataframe; the bare expression on the last
# line makes the notebook render it as the cell's output.
DF_ALL_CPGS = read_all_cpgs(BED_ALL_CPGS)
DF_ALL_CPGS

chrom,start,end
str,i64,i64
"""chr1""",10470,10471
"""chr1""",10483,10484
"""chr1""",10488,10489
"""chr1""",10492,10493
"""chr1""",10496,10497
…,…,…
"""chrM""",16448,16449
"""chrM""",16453,16454
"""chrM""",16494,16495
"""chrM""",16541,16542


## Read in founder-phased DNA methylation at CpG sites

In [3]:
# Reload the helper module (it must already have been imported by an earlier
# cell) so the latest on-disk version is used, then re-import the name so it is
# bound to the reloaded function.
importlib.reload(expand_to_all_cpgs)
from expand_to_all_cpgs import read_meth_founder_phased

# Load the founder-phased methylation BED file for sample UID; the bare
# expression on the last line is rendered as the cell's output.
DF_METH_FOUNDER_PHASED = read_meth_founder_phased(BED_METH_FOUNDER_PHASED)
DF_METH_FOUNDER_PHASED

chrom,start,end,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,methylation_level_pat,methylation_level_mat,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,is_within_50bp_of_mismatch_site
str,i64,i64,i64,i64,f64,i64,f64,f64,i64,i64,str,str,bool
"""chr1""",14520,14521,,,,,,,,,,,false
"""chr1""",14552,14553,,,,,,,,,,,false
"""chr1""",14588,14589,,,,,,,,,,,false
"""chr1""",14652,14653,,,,,,,,,,,false
"""chr1""",14672,14673,,,,,,,,,,,false
…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chrY""",56878251,56878252,,,,,,,,,,,
"""chrY""",56878350,56878351,,,,,,,,,,,
"""chrY""",56878385,56878386,,,,,,,,,,,
"""chrY""",56878431,56878432,,,,,,,,,,,


## Read in unphased DNA methylation at CpG sites

In [4]:
# Reload the helper module (it must already have been imported by an earlier
# cell) so the latest on-disk version is used, then re-import the name so it is
# bound to the reloaded function.
importlib.reload(expand_to_all_cpgs)
from expand_to_all_cpgs import read_meth_unphased

# Load the unphased methylation BED file for sample UID. The tool mode is passed
# along with the path -- presumably because the file layout depends on the mode
# of aligned_bam_to_cpg_scores; confirm against read_meth_unphased.
DF_METH_UNPHASED = read_meth_unphased(BED_METH_UNPHASED, PB_CPG_TOOL_MODE) 
DF_METH_UNPHASED

chrom,start,end,total_read_count,methylation_level
str,i64,i64,i64,f64
"""chr1""",10468,10469,13,0.868
"""chr1""",10470,10471,13,0.878
"""chr1""",10483,10484,13,0.929
"""chr1""",10488,10489,14,0.95
"""chr1""",10492,10493,13,0.964
…,…,…,…,…
"""chrM""",16426,16427,192,0.037
"""chrM""",16448,16449,192,0.029
"""chrM""",16453,16454,192,0.032
"""chrM""",16494,16495,192,0.043


## Expand the dataframe of methylation levels to include all CpG sites, add unphased methylation levels (where available), and save to disk

In [5]:
# Reload the helper module (it must already have been imported by an earlier
# cell) so the latest on-disk version is used, then re-import the name so it is
# bound to the reloaded function.
importlib.reload(expand_to_all_cpgs)
from expand_to_all_cpgs import expand_meth_to_all_cpgs

# Combine the three dataframes loaded above. Per the displayed output, the
# result carries the founder-phased columns plus the unphased
# `total_read_count` and `methylation_level` columns.
DF_METH_FOUNDER_PHASED_ALL_CPGS = expand_meth_to_all_cpgs(DF_ALL_CPGS, DF_METH_FOUNDER_PHASED, DF_METH_UNPHASED)
DF_METH_FOUNDER_PHASED_ALL_CPGS

chrom,start,end,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,methylation_level_pat,methylation_level_mat,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,is_within_50bp_of_mismatch_site,total_read_count,methylation_level
str,i64,i64,i64,i64,f64,i64,f64,f64,i64,i64,str,str,bool,i64,f64
"""chr1""",10470,10471,,,,,,,,,,,,13,0.878
"""chr1""",10483,10484,,,,,,,,,,,,13,0.929
"""chr1""",10488,10489,,,,,,,,,,,,14,0.95
"""chr1""",10492,10493,,,,,,,,,,,,13,0.964
"""chr1""",10496,10497,,,,,,,,,,,,14,0.954
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chrY""",56887220,56887221,,,,,,,,,,,,38,0.962
"""chrY""",56887399,56887400,,,,,,,,,,,,38,0.942
"""chrY""",56887579,56887580,,,,,,,,,,,,38,0.945
"""chrY""",56887581,56887582,,,,,,,,,,,,,


In [None]:
# Import the writer module, reload it so that edits made since the kernel
# started are picked up, then re-import the name so it is bound to the reloaded
# function.
import write_data 
importlib.reload(write_data)
from write_data import write_dataframe_to_bed

# Persist the expanded dataframe and report where it went.
# NOTE(review): assumes OUTPUT_DIR already exists unless write_dataframe_to_bed
# creates it -- confirm.
write_dataframe_to_bed(DF_METH_FOUNDER_PHASED_ALL_CPGS, BED_METH_FOUNDER_PHASED_ALL_CPGS)
print(f"Wrote expanded methylation dataframe to {BED_METH_FOUNDER_PHASED_ALL_CPGS}")

Wrote expanded methylation dataframe to /scratch/ucgd/lustre-labs/quinlan/data-shared/dna-methylation/CEPH1463.GRCh38.hifi.model.founder-phased.all-cpgs/200081.dna-methylation.founder-phased.all_cpgs.bed


## Some CpG sites are in hap-map blocks but do not have phased methylation levels (though they do have unphased methylation levels)

In [7]:
# Zoom in on a short chr1 window (both bounds exclusive). Each condition is
# named so the window definition reads on its own.
on_chr1 = pl.col('chrom') == 'chr1'
starts_after_window_open = pl.col('start') > 3665300
ends_before_window_close = pl.col('end') < 3665600

DF_METH_FOUNDER_PHASED_ALL_CPGS.filter(
    on_chr1 & starts_after_window_open & ends_before_window_close
)

chrom,start,end,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,methylation_level_pat,methylation_level_mat,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,is_within_50bp_of_mismatch_site,total_read_count,methylation_level
str,i64,i64,i64,i64,f64,i64,f64,f64,i64,i64,str,str,bool,i64,f64
"""chr1""",3665407,3665408,3399126.0,4207029.0,1.0,1114.0,0.783,0.951,21.0,10.0,"""B""","""I""",False,31,0.875
"""chr1""",3665514,3665515,,,,,,,,,,,,29,0.878
"""chr1""",3665526,3665527,,,,,,,,,,,,29,0.833
"""chr1""",3665561,3665562,3399126.0,4207029.0,1.0,1114.0,0.53,0.478,20.0,10.0,"""B""","""I""",False,30,0.487


In [8]:
# Second example window on chr1 (both bounds exclusive). Each condition is
# named so the window definition reads on its own.
on_chr1 = pl.col('chrom') == 'chr1'
starts_after_window_open = pl.col('start') > 36678000
ends_before_window_close = pl.col('end') < 36680000

DF_METH_FOUNDER_PHASED_ALL_CPGS.filter(
    on_chr1 & starts_after_window_open & ends_before_window_close
)

chrom,start,end,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,methylation_level_pat,methylation_level_mat,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,is_within_50bp_of_mismatch_site,total_read_count,methylation_level
str,i64,i64,i64,i64,f64,i64,f64,f64,i64,i64,str,str,bool,i64,f64
"""chr1""",36678211,36678212,36572121.0,36737884.0,1.0,179.0,0.674,0.942,11.0,12.0,"""B""","""K""",False,23,0.837
"""chr1""",36678518,36678519,36572121.0,36737884.0,1.0,179.0,0.916,0.704,11.0,12.0,"""B""","""K""",False,23,0.832
"""chr1""",36678538,36678539,36572121.0,36737884.0,1.0,179.0,0.862,0.055,11.0,12.0,"""B""","""K""",False,23,0.43
"""chr1""",36678689,36678690,36572121.0,36737884.0,1.0,179.0,0.812,0.937,10.0,12.0,"""B""","""K""",False,22,0.924
"""chr1""",36679081,36679082,,,,,,,,,,,,20,0.947
"""chr1""",36679125,36679126,,,,,,,,,,,,20,0.952
"""chr1""",36679168,36679169,,,,,,,,,,,,20,0.907
"""chr1""",36679367,36679368,,,,,,,,,,,,19,0.901
"""chr1""",36679693,36679694,,,,,,,,,,,,19,0.865
"""chr1""",36679896,36679897,,,,,,,,,,,,19,0.934
