## Setup 

In [43]:
import importlib
from pathlib import Path 
import sys
import polars as pl 

OUTPUT_DIR = f"/scratch/ucgd/lustre-labs/quinlan/data-shared/dna-methylation/CEPH1463.GRCh38.hifi.founder-phased.all-cpgs" 
BED_ALL_CPGS_IN_REFERENCE = f"{OUTPUT_DIR}/all_cpg_sites_in_reference.bed" # output of src/write_all_cpgs.py
METH_FOUNDER_PHASED_DIR = f"/scratch/ucgd/lustre-labs/quinlan/data-shared/dna-methylation/CEPH1463.GRCh38.hifi.founder-phased" # output dir of phase_meth_to_founder_haps.py
METH_COUNT_READ_PHASED_DIR = Path(f"/scratch/ucgd/lustre-labs/quinlan/data-shared/dna-methylation/CEPH1463.GRCh38.hifi.count.read-backed-phased") # output dir of aligned_bam_to_cpg_scores (containing count-based unphased meth)
METH_MODEL_READ_PHASED_DIR = Path(f"/scratch/ucgd/lustre-labs/quinlan/data-shared/dna-methylation/CEPH1463.GRCh38.hifi.model.read-backed-phased") # output dir of aligned_bam_to_cpg_scores (containing model-based unphased meth)
UID = '200081' # much of the interpretation in this notebook is specific to this sample
BED_METH_FOUNDER_PHASED = f"{METH_FOUNDER_PHASED_DIR}/{UID}.dna-methylation.founder-phased.bed" # bed file of founder-phased methylation levels from src/phase_meth_to_founder_haps.py
BED_METH_COUNT_UNPHASED = Path(f"{METH_COUNT_READ_PHASED_DIR}/{UID}.GRCh38.haplotagged.combined.bed.gz") # bed file from aligned_bam_to_cpg_scores (unphased count-based meth)
BED_METH_MODEL_UNPHASED = Path(f"{METH_MODEL_READ_PHASED_DIR}/{UID}.GRCh38.haplotagged.combined.bed.gz") # bed file from aligned_bam_to_cpg_scores (unphased model-based meth)
BED_METH_FOUNDER_PHASED_ALL_CPGS = f"{OUTPUT_DIR}/{UID}.dna-methylation.founder-phased.all_cpgs.bed"
BED_HET_SITE_MISMATCHES = f"{METH_FOUNDER_PHASED_DIR}/{UID}.bit-vector-sites-mismatches.bed" # bed file of heterozygous sites at which bit-vectors are mismatched, from src/phase_meth_to_founder_haps.py

VCF_JOINT_CALLED = "/scratch/ucgd/lustre-labs/quinlan/data-shared/datasets/Palladium/deepvariant/CEPH-1463.joint.GRCh38.deepvariant.glnexus.phased.vcf.gz"

REPO_DIR = Path('/scratch/ucgd/lustre-labs/quinlan/u6018199/tapestry')
sys.path.append(f"{REPO_DIR}/src/util") 

## Get all CpG sites in reference genome

In [2]:
import expand_to_all_cpgs
importlib.reload(expand_to_all_cpgs)
from expand_to_all_cpgs import read_all_cpgs_in_reference

DF_ALL_CPGS_IN_REFERENCE = read_all_cpgs_in_reference(BED_ALL_CPGS_IN_REFERENCE)
DF_ALL_CPGS_IN_REFERENCE

chrom,start,end
str,i64,i64
"""chr1""",10468,10469
"""chr1""",10470,10471
"""chr1""",10483,10484
"""chr1""",10488,10489
"""chr1""",10492,10493
…,…,…
"""chrY""",56887220,56887221
"""chrY""",56887399,56887400
"""chrY""",56887579,56887580
"""chrY""",56887581,56887582


## Read in unphased DNA methylation at CpG sites, both those in the reference genome, and those present in the sample but not in the reference genome

In [3]:
importlib.reload(expand_to_all_cpgs)
from expand_to_all_cpgs import read_meth_unphased

DF_METH_UNPHASED = read_meth_unphased(BED_METH_COUNT_UNPHASED, BED_METH_MODEL_UNPHASED) 
DF_METH_UNPHASED

chrom,start,end,total_read_count_count,methylation_level_count,total_read_count_model,methylation_level_model
str,i64,i64,i64,f64,i64,f64
"""chr1""",10468,10469,13,0.769,13,0.868
"""chr1""",10470,10471,13,0.769,13,0.878
"""chr1""",10483,10484,13,0.923,13,0.929
"""chr1""",10488,10489,14,1.0,14,0.95
"""chr1""",10492,10493,13,1.0,13,0.964
…,…,…,…,…,…,…
"""chrY""",56887220,56887221,38,0.947,38,0.962
"""chrY""",56887399,56887400,38,0.737,38,0.942
"""chrY""",56887579,56887580,38,0.895,38,0.945
"""chrY""",56887592,56887593,38,0.579,38,0.721


## Methylation levels are computed at CpG sites observed in the sample (which may or may not be in the reference)

The default value of the `--modsites-mode` argument of `aligned_bam_to_cpg_scores` is `denovo`, meaning that DNA methylation levels are computed at all CG sites in the sample's haplotypes: 

https://github.com/PacificBiosciences/pb-CpG-tools?tab=readme-ov-file#output-modes-and-option-details

https://quinlangroup.slack.com/archives/C0803TM7X0X/p1759348751929209

https://quinlangroup.slack.com/archives/C0803TM7X0X/p1759349045861589

## CpG site creation: Sites that are CpG in at least one haplotype of the sample, but not CpG in the reference sequence

In [4]:
# IGV snapshots: 
# https://quinlangroup.slack.com/archives/C0803TM7X0X/p1759879585412219 
# https://quinlangroup.slack.com/archives/C0803TM7X0X/p1759880211882149
# https://quinlangroup.slack.com/archives/C0803TM7X0X/p1759880434142149 
# https://quinlangroup.slack.com/archives/C0803TM7X0X/p1759880669955469

# These sites also appear as variants in /scratch/ucgd/lustre-labs/quinlan/data-shared/read-backed-phasing/200081.GRCh38.deepvariant.glnexus.phased.vcf.gz
# e.g., 
# $ tabix 200081.GRCh38.deepvariant.glnexus.phased.vcf.gz chr1:10623-10623 
# chr1    10623   chr1_10623_T_C  T       C       36      .       AF=1;AQ=36      GT:DP:AD:GQ:PL:RNC      1/1:23:0,23:22:33,22,0:..

DF_METH_UNPHASED.join(DF_ALL_CPGS_IN_REFERENCE, on=['chrom', 'start', 'end'], how='anti')

chrom,start,end,total_read_count_count,methylation_level_count,total_read_count_model,methylation_level_model
str,i64,i64,i64,f64,i64,f64
"""chr1""",10622,10623,15,0.8,15,0.919
"""chr1""",10804,10805,13,0.692,13,0.708
"""chr1""",10820,10821,13,0.615,13,0.778
"""chr1""",10828,10829,14,0.571,14,0.652
"""chr1""",10925,10926,14,0.786,14,0.949
…,…,…,…,…,…,…
"""chrY""",56885796,56885797,41,0.854,41,0.953
"""chrY""",56885831,56885832,41,0.805,41,0.948
"""chrY""",56885915,56885916,41,0.732,41,0.9
"""chrY""",56886309,56886310,41,0.854,41,0.927


## Sites that are CpG in the reference, but at which unphased DNA methylation is not reported

These sites fall into two classes: 

1. CpG site destruction: A variant destroyed the CpG (relative to the reference sequence), e.g., https://quinlangroup.slack.com/archives/C0803TM7X0X/p1759955795431799 We see these variants in the VCF too. 
2. The site is CpG in the sample, but read coverage was too low to report a reliable estimate of DNA methylation 

In [5]:
DF_ALL_CPGS_IN_REFERENCE.join(DF_METH_UNPHASED, on=['chrom', 'start', 'end'], how='anti')

chrom,start,end
str,i64,i64
"""chr1""",10930,10931
"""chr1""",10933,10934
"""chr1""",11166,11167
"""chr1""",12781,12782
"""chr1""",13301,13302
…,…,…
"""chrY""",56884829,56884830
"""chrY""",56885859,56885860
"""chrY""",56886407,56886408
"""chrY""",56886943,56886944


## Read in founder-phased DNA methylation at CpG sites

In [6]:
importlib.reload(expand_to_all_cpgs)
from expand_to_all_cpgs import read_meth_founder_phased

DF_METH_FOUNDER_PHASED = read_meth_founder_phased(BED_METH_FOUNDER_PHASED)
DF_METH_FOUNDER_PHASED

chrom,start,end,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model
str,i64,i64,i64,i64,f64,i64,f64,f64,str,str,f64,f64,f64,f64
"""chr1""",14061,14062,,,,,,,,,,,,
"""chr1""",14178,14179,,,,,,,,,,,,
"""chr1""",14348,14349,,,,,,,,,,,,
"""chr1""",14353,14354,,,,,,,,,,,,
"""chr1""",14434,14435,,,,,,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chrY""",56887220,56887221,,,,,,,,,,,,
"""chrY""",56887399,56887400,,,,,,,,,,,,
"""chrY""",56887579,56887580,,,,,,,,,,,,
"""chrY""",56887592,56887593,,,,,,,,,,,,


## Expand the dataframe of founder-phased methylation levels to include all CpG sites in reference and sample genome, and unphased methylation levels (where available)

In [7]:
importlib.reload(expand_to_all_cpgs)
from expand_to_all_cpgs import expand_meth_to_all_cpgs

DF_METH_FOUNDER_PHASED_ALL_CPGS = expand_meth_to_all_cpgs(DF_ALL_CPGS_IN_REFERENCE, DF_METH_UNPHASED, DF_METH_FOUNDER_PHASED)
DF_METH_FOUNDER_PHASED_ALL_CPGS

chrom,start,end,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,f64,f64,str,str,f64,f64,f64,f64
"""chr1""",10468,10469,13,0.769,0.868,,,,,,,,,,,,
"""chr1""",10470,10471,13,0.769,0.878,,,,,,,,,,,,
"""chr1""",10483,10484,13,0.923,0.929,,,,,,,,,,,,
"""chr1""",10488,10489,14,1.0,0.95,,,,,,,,,,,,
"""chr1""",10492,10493,13,1.0,0.964,,,,,,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chrY""",56887399,56887400,38,0.737,0.942,,,,,,,,,,,,
"""chrY""",56887579,56887580,38,0.895,0.945,,,,,,,,,,,,
"""chrY""",56887581,56887582,,,,,,,,,,,,,,,
"""chrY""",56887592,56887593,38,0.579,0.721,,,,,,,,,,,,


## Add proximity of each CpG site to heterozygous sites at which bit-vectors are mismatched 

In [8]:
importlib.reload(expand_to_all_cpgs)
from expand_to_all_cpgs import compute_proximity_to_mismatched_heterozygous_sites

DF_METH_FOUNDER_PHASED_ALL_CPGS = compute_proximity_to_mismatched_heterozygous_sites(DF_METH_FOUNDER_PHASED_ALL_CPGS, BED_HET_SITE_MISMATCHES)
DF_METH_FOUNDER_PHASED_ALL_CPGS

chrom,start,end,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,is_within_50bp_of_mismatch_site
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool
"""chr1""",10468,10469,13,0.769,0.868,,,,,,,,,,,,,false
"""chr1""",10470,10471,13,0.769,0.878,,,,,,,,,,,,,false
"""chr1""",10483,10484,13,0.923,0.929,,,,,,,,,,,,,false
"""chr1""",10488,10489,14,1.0,0.95,,,,,,,,,,,,,false
"""chr1""",10492,10493,13,1.0,0.964,,,,,,,,,,,,,false
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chrY""",56887399,56887400,38,0.737,0.942,,,,,,,,,,,,,
"""chrY""",56887579,56887580,38,0.895,0.945,,,,,,,,,,,,,
"""chrY""",56887581,56887582,,,,,,,,,,,,,,,,
"""chrY""",56887592,56887593,38,0.579,0.721,,,,,,,,,,,,,


## We don't phase methylation on chrM and chrY, and a note about gender

In [9]:
# we don't phase methylation on chrM (>2 copies per cell): 
DF_METH_FOUNDER_PHASED_ALL_CPGS.filter(pl.col('chrom') == 'chrM').filter(pl.col('start_hap_map_block').is_not_null())

chrom,start,end,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,is_within_50bp_of_mismatch_site
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool


In [10]:
# we don't phase methylation on chrY (<2 copies per cell): 
DF_METH_FOUNDER_PHASED_ALL_CPGS.filter(pl.col('chrom') == 'chrY').filter(pl.col('start_hap_map_block').is_not_null())

chrom,start,end,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,is_within_50bp_of_mismatch_site
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool


In [11]:
# since we don't phase methylation on chrM (>2 copies per cell) and chrY (<2 copies per cell), there are null values for "is_within_50bp_of_mismatch_site":
DF_METH_FOUNDER_PHASED_ALL_CPGS.filter(pl.col('is_within_50bp_of_mismatch_site').is_null())

chrom,start,end,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,is_within_50bp_of_mismatch_site
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool
"""chrM""",32,33,192,0.25,0.071,,,,,,,,,,,,,
"""chrM""",60,61,190,0.242,0.039,,,,,,,,,,,,,
"""chrM""",77,78,190,0.089,0.029,,,,,,,,,,,,,
"""chrM""",79,80,190,0.121,0.034,,,,,,,,,,,,,
"""chrM""",90,91,190,0.232,0.032,,,,,,,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chrY""",56887399,56887400,38,0.737,0.942,,,,,,,,,,,,,
"""chrY""",56887579,56887580,38,0.895,0.945,,,,,,,,,,,,,
"""chrY""",56887581,56887582,,,,,,,,,,,,,,,,
"""chrY""",56887592,56887593,38,0.579,0.721,,,,,,,,,,,,,


In [12]:
# total read count (about half of 30X) suggests just one copy of chrX, i.e., a male: 
DF_METH_FOUNDER_PHASED_ALL_CPGS.filter(pl.col('chrom') == 'chrX').filter(pl.col('total_read_count').is_not_null())

chrom,start,end,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,is_within_50bp_of_mismatch_site
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool
"""chrX""",25567,25568,10,1.0,0.962,,,,,,,,,,,,,false
"""chrX""",25575,25576,10,0.8,0.956,,,,,,,,,,,,,false
"""chrX""",25631,25632,10,0.7,0.732,,,,,,,,,,,,,false
"""chrX""",25638,25639,10,0.9,0.96,,,,,,,,,,,,,false
"""chrX""",25647,25648,10,1.0,0.969,,,,,,,,,,,,,false
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chrX""",156020271,156020272,11,0.727,0.866,,,,,,,,,,,,,false
"""chrX""",156020287,156020288,11,0.909,0.933,,,,,,,,,,,,,false
"""chrX""",156020376,156020377,10,0.8,0.885,,,,,,,,,,,,,false
"""chrX""",156030064,156030065,15,0.133,0.429,,,,,,,,,,,,,false


In [13]:
# this phasing is probably incorrect due to technical errors, since there is only one X in a male, and it must come from the mother (with the Y coming from the father), 
# whereas these data say that the X comes from the father:
DF_METH_FOUNDER_PHASED_ALL_CPGS.filter(pl.col('chrom') == 'chrX').filter(pl.col('start_hap_map_block').is_not_null())

chrom,start,end,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,is_within_50bp_of_mismatch_site
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool
"""chrX""",2241257,2241258,19,0.895,0.959,2240422,3036819,1.0,193,10,,"""A""","""G""",0.9,,0.962,,false
"""chrX""",2241263,2241264,19,0.632,0.652,2240422,3036819,1.0,193,10,,"""A""","""G""",0.7,,0.875,,false
"""chrX""",2241329,2241330,19,0.632,0.775,2240422,3036819,1.0,193,10,,"""A""","""G""",0.6,,0.872,,false
"""chrX""",2241404,2241405,19,0.421,0.27,2240422,3036819,1.0,193,10,,"""A""","""G""",0.4,,0.295,,false
"""chrX""",2241507,2241508,20,0.9,0.947,2240422,3036819,1.0,193,11,,"""A""","""G""",0.909,,0.951,,false
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chrX""",155658077,155658078,35,0.629,0.854,155616130,155659104,1.0,1,20,15,"""A""","""K""",0.75,0.467,0.956,0.491,false
"""chrX""",155658233,155658234,34,0.588,0.721,155616130,155659104,1.0,1,19,15,"""A""","""K""",0.737,0.4,0.915,0.205,false
"""chrX""",155658380,155658381,34,0.647,0.704,155616130,155659104,1.0,1,19,15,"""A""","""K""",0.737,0.533,0.854,0.574,false
"""chrX""",155658475,155658476,34,0.647,0.821,155616130,155659104,1.0,1,19,15,"""A""","""K""",0.737,0.533,0.939,0.638,false


In [14]:
# TODO: check phasing of methylation on chrX in a female sample (XX karyotype)

## Examples of CpG sites where phasing is partial, even though they are in hap-map blocks 

In [15]:
DF_METH_FOUNDER_PHASED_ALL_CPGS.filter(
    (pl.col('chrom') == 'chr1') &
    (pl.col('start') > 3665300) & 
    (pl.col('end') < 3665600)
)

chrom,start,end,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,is_within_50bp_of_mismatch_site
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool
"""chr1""",3665407,3665408,31,0.742,0.875,3399126,4207029,1.0,1114,21,10.0,"""B""","""I""",0.714,0.8,0.783,0.951,False
"""chr1""",3665514,3665515,29,0.69,0.878,3399126,4207029,1.0,1114,20,,"""B""","""I""",0.6,,0.799,,False
"""chr1""",3665526,3665527,29,0.69,0.833,3399126,4207029,1.0,1114,20,,"""B""","""I""",0.7,,0.774,,False
"""chr1""",3665561,3665562,30,0.433,0.487,3399126,4207029,1.0,1114,20,10.0,"""B""","""I""",0.45,0.4,0.53,0.478,False


In [16]:
DF_METH_FOUNDER_PHASED_ALL_CPGS.filter(
    (pl.col('chrom') == 'chr1') &
    (pl.col('start') > 36678000) & 
    (pl.col('end') < 36680000)
)

chrom,start,end,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,is_within_50bp_of_mismatch_site
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool
"""chr1""",36678211,36678212,23,0.696,0.837,36572121,36737884,1.0,179,11.0,12,"""B""","""K""",0.636,0.75,0.674,0.942,False
"""chr1""",36678518,36678519,23,0.522,0.832,36572121,36737884,1.0,179,11.0,12,"""B""","""K""",0.636,0.417,0.916,0.704,False
"""chr1""",36678538,36678539,23,0.348,0.43,36572121,36737884,1.0,179,11.0,12,"""B""","""K""",0.727,0.0,0.862,0.055,False
"""chr1""",36678689,36678690,22,0.773,0.924,36572121,36737884,1.0,179,10.0,12,"""B""","""K""",0.7,0.833,0.812,0.937,False
"""chr1""",36679081,36679082,20,0.8,0.947,36572121,36737884,1.0,179,,12,"""B""","""K""",,0.917,,0.966,False
"""chr1""",36679125,36679126,20,0.9,0.952,36572121,36737884,1.0,179,,12,"""B""","""K""",,0.917,,0.956,False
"""chr1""",36679168,36679169,20,0.75,0.907,36572121,36737884,1.0,179,,12,"""B""","""K""",,0.833,,0.957,False
"""chr1""",36679367,36679368,19,0.789,0.901,36572121,36737884,1.0,179,,12,"""B""","""K""",,0.833,,0.915,False
"""chr1""",36679693,36679694,19,0.684,0.865,36572121,36737884,1.0,179,,12,"""B""","""K""",,0.75,,0.89,False
"""chr1""",36679896,36679897,19,0.737,0.934,36572121,36737884,1.0,179,,12,"""B""","""K""",,0.667,,0.943,False


## QC Statistics 

In [17]:
importlib.reload(expand_to_all_cpgs)
from expand_to_all_cpgs import compute_fraction_of_cpgs_that_are_close_to_mismatches

compute_fraction_of_cpgs_that_are_close_to_mismatches(DF_METH_FOUNDER_PHASED_ALL_CPGS)

Percentage of CpG sites (in reference and sample genome, and on phasable chroms) that are within 50bp of a heterozygous mismatch site: 0.173%


In [18]:
importlib.reload(expand_to_all_cpgs)
from expand_to_all_cpgs import compute_fraction_of_cpgs_at_which_meth_is_phased_wrapper

compute_fraction_of_cpgs_at_which_meth_is_phased_wrapper(DF_METH_FOUNDER_PHASED_ALL_CPGS)

Percentage of CpG sites (in reference and sample genomes, and on phasable chroms) at which count-based methylation is phased to pat haplotype: 80.58%
Percentage of CpG sites (in reference and sample genomes, and on phasable chroms) at which count-based methylation is phased to mat haplotype: 80.53%
Percentage of CpG sites (in reference and sample genomes, and on phasable chroms) at which count-based methylation is phased to at least one parental haplotype: 84.25%
Percentage of CpG sites (in reference and sample genomes, and on phasable chroms) at which count-based methylation is phased to both parental haplotypes: 76.86%
Percentage of CpG sites (in reference and sample genomes, and on phasable chroms) at which count-based unphased methylation is reported: 97.45%
Percentage of CpG sites (in reference and sample genomes, and on phasable chroms) at which model-based methylation is phased to pat haplotype: 80.58%
Percentage of CpG sites (in reference and sample genomes, and on phasable chr

## Overlap CpGs with joint-called SNVs 

In [19]:
# Motivation: 
# slides: https://docs.google.com/presentation/d/11Pfax0wXh0E68C287lMaPoPvhq-OrFGxFOKE1gWOkDI/edit?slide=id.g39893c07c75_0_0#slide=id.g39893c07c75_0_0 
# slack thread: https://quinlangroup.slack.com/archives/C0803TM7X0X/p1762565840460019?thread_ts=1759348751.929209&cid=C0803TM7X0X 

In [46]:
importlib.reload(expand_to_all_cpgs)
from expand_to_all_cpgs import get_joint_called_variants

DF_JOINT_CALLED_VARIANTS = get_joint_called_variants(UID, VCF_JOINT_CALLED)
DF_JOINT_CALLED_VARIANTS

chrom,start,end,REF,ALT,allele_1,allele_2,phased
str,i64,i64,str,list[str],str,str,bool
"""chr1""",10290,10291,"""C""","[""T""]","""0""","""0""",false
"""chr1""",10296,10297,"""C""","[""T""]",""".""",""".""",false
"""chr1""",10302,10303,"""C""","[""T""]","""0""","""0""",false
"""chr1""",10308,10309,"""C""","[""T""]","""0""","""0""",false
"""chr1""",10314,10315,"""C""","[""T""]",""".""",""".""",false
"""chr1""",10449,10450,"""T""","[""G""]","""0""","""0""",false
"""chr1""",10491,10492,"""C""","[""T""]",""".""",""".""",false
"""chr1""",10531,10532,"""A""","[""G""]",""".""",""".""",false
"""chr1""",10591,10592,"""G""","[""A""]","""0""","""0""",false
"""chr1""",10602,10603,"""G""","[""A""]","""0""","""0""",false


In [21]:
# An example of CpG site creation 
# A site that is CpG in only one haplotype of the sample, and not CpG in the reference sequence

# IGV snapshot: 
# https://quinlangroup.slack.com/archives/C0803TM7X0X/p1759880434142149

In [48]:
# variant at this CpG site: 

DF_JOINT_CALLED_VARIANTS.filter(
    (pl.col("chrom") == 'chr20') & 
    (pl.col("start") == 101340)
)

chrom,start,end,REF,ALT,allele_1,allele_2,phased
str,i64,i64,str,list[str],str,str,bool
"""chr20""",101340,101341,"""A""","[""G""]","""0""","""1""",True


In [49]:
# Methylation at this CpG site on one haplotype is 0.0
# It should be None as there is no CpG on that haplotype

DF_METH_FOUNDER_PHASED_ALL_CPGS.filter(
    (pl.col("chrom") == 'chr20') & 
    (pl.col("start") == 101339)
)

chrom,start,end,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,is_within_50bp_of_mismatch_site
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool
"""chr20""",101339,101340,45,0.4,0.478,67743,236639,1.0,258,23,22,"""A""","""I""",0.783,0.0,0.89,0.06,False


In [63]:
importlib.reload(expand_to_all_cpgs)
from expand_to_all_cpgs import label_with_variants

DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL = label_with_variants(DF_METH_FOUNDER_PHASED_ALL_CPGS, DF_JOINT_CALLED_VARIANTS)
DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL

chrom,start_cpg,end_cpg,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,cpg_is_within_50bp_of_mismatch_site,start_variant,end_variant,REF,ALT,allele_1,allele_2,snv_phased,num_SNVs_overlapping_CG
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool,i64,i64,str,list[str],str,str,bool,u32
"""chr1""",10468,10470,13,0.769,0.868,,,,,,,,,,,,,false,,,,,,,,0
"""chr1""",10470,10472,13,0.769,0.878,,,,,,,,,,,,,false,,,,,,,,0
"""chr1""",10483,10485,13,0.923,0.929,,,,,,,,,,,,,false,,,,,,,,0
"""chr1""",10488,10490,14,1.0,0.95,,,,,,,,,,,,,false,,,,,,,,0
"""chr1""",10492,10494,13,1.0,0.964,,,,,,,,,,,,,false,,,,,,,,0
"""chr1""",10496,10498,14,0.857,0.954,,,,,,,,,,,,,false,,,,,,,,0
"""chr1""",10524,10526,14,0.643,0.951,,,,,,,,,,,,,false,,,,,,,,0
"""chr1""",10541,10543,14,0.857,0.956,,,,,,,,,,,,,false,,,,,,,,0
"""chr1""",10562,10564,14,0.857,0.932,,,,,,,,,,,,,false,,,,,,,,0
"""chr1""",10570,10572,14,0.786,0.877,,,,,,,,,,,,,false,,,,,,,,0


## SNVs with unknown genotypes overlap some CpGs, and therefore DO need to be acounted for when identifying allele-specific CpGs

In [64]:
DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL.filter(
    (pl.col('allele_1') == '.') | 
    (pl.col('allele_2') == '.') 
)

chrom,start_cpg,end_cpg,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,cpg_is_within_50bp_of_mismatch_site,start_variant,end_variant,REF,ALT,allele_1,allele_2,snv_phased,num_SNVs_overlapping_CG
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool,i64,i64,str,list[str],str,str,bool,u32
"""chr1""",10860,10862,14,0.643,0.712,,,,,,,,,,,,,false,10861,10862,"""G""","[""A""]",""".""",""".""",false,1
"""chr1""",12505,12507,18,0.5,0.574,,,,,,,,,,,,,false,12505,12506,"""C""","[""G""]",""".""",""".""",false,1
"""chr1""",13078,13080,20,0.35,0.489,,,,,,,,,,,,,false,13078,13079,"""C""","[""G""]",""".""",""".""",false,1
"""chr1""",16242,16244,26,0.462,0.449,,,,,,,,,,,,,false,16242,16243,"""C""","[""T""]",""".""",""".""",false,1
"""chr1""",16618,16620,27,0.63,0.805,,,,,,,,,,,,,false,16618,16619,"""C""","[""T""]",""".""",""".""",false,1
"""chr1""",16973,16975,31,0.839,0.876,,,,,,,,,,,,,false,16973,16974,"""C""","[""T""]",""".""",""".""",false,1
"""chr1""",17902,17904,35,0.829,0.76,,,,,,,,,,,,,false,17903,17904,"""G""","[""A""]",""".""",""".""",false,1
"""chr1""",18872,18874,38,0.579,0.581,,,,,,,,,,,,,false,18872,18873,"""C""","[""T""]",""".""",""".""",false,1
"""chr1""",21092,21094,41,0.585,0.581,,,,,,,,,,,,,false,21092,21093,"""C""","[""T""]",""".""",""".""",false,1
"""chr1""",22181,22183,41,0.22,0.269,,,,,,,,,,,,,false,22181,22182,"""T""","[""C""]",""".""",""".""",false,1


## CpG sites that each overlap a single SNV: When scanning for imprinting, exclude such CpG sites if they overlap heterozygous (but not homozygous) SNVs 

In [114]:
def add_locus_cpg(df):
    return df.with_columns(
        locus_cpg = pl.format(
            "{}:{}-{}", 
            pl.col("chrom"),
            pl.col("start_cpg"),
            pl.col("end_cpg")
        )
    )

def subset_cpgs_at_variants(df, allele_1, allele_2, num_SNVs_overlapping_CG):
    df = df.filter(
        (pl.col('allele_1') == str(allele_1)) & 
        (pl.col('allele_2') == str(allele_2)) & 
        (pl.col('num_SNVs_overlapping_CG') == num_SNVs_overlapping_CG)
    )
    return add_locus_cpg(df)

### There are many CpG sites that overlap a single SNV 

In [115]:
print(f"Number of CpG sites that overlap a single SNV: {len(DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL.filter(pl.col('num_SNVs_overlapping_CG') == 1))}")

Number of CpG sites that overlap a single SNV: 1911602


### Homozygous CpG sites should be included in imprinting scans

In [116]:
# HOM REF 
# Both haplotypes must be CpG, by construction 
# Therefore both haplotypes must exhibit methylation 
# These sites could, in principle, be imprinted, in this particular sample, and therefore should be included in scans for imprinting, in this particular sample

subset_cpgs_at_variants(
    DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL, 
    allele_1=0, 
    allele_2=0,
    num_SNVs_overlapping_CG=1
).sample(5, seed=42)

chrom,start_cpg,end_cpg,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,cpg_is_within_50bp_of_mismatch_site,start_variant,end_variant,REF,ALT,allele_1,allele_2,snv_phased,num_SNVs_overlapping_CG,locus_cpg
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool,i64,i64,str,list[str],str,str,bool,u32,str
"""chr3""",37992024,37992026,30,0.867,0.961,37697307.0,38763383.0,1.0,629.0,16.0,14.0,"""A""","""E""",0.813,0.929,0.948,0.959,False,37992024,37992025,"""C""","[""T""]","""0""","""0""",False,1,"""chr3:37992024-37992026"""
"""chr5""",96816966,96816968,29,0.138,0.026,96282519.0,96979616.0,1.0,801.0,15.0,14.0,"""A""","""G""",0.067,0.214,0.026,0.07,False,96816967,96816968,"""G""","[""A""]","""0""","""0""",False,1,"""chr5:96816966-96816968"""
"""chr14""",67271809,67271811,31,0.742,0.938,,,,,,,,,,,,,False,67271809,67271810,"""C""","[""T""]","""0""","""0""",False,1,"""chr14:67271809-67271811"""
"""chr10""",14725985,14725987,39,0.897,0.956,14088369.0,14869232.0,1.0,1005.0,25.0,14.0,"""A""","""K""",0.88,0.929,0.963,0.936,False,14725985,14725986,"""C""","[""T""]","""0""","""0""",False,1,"""chr10:14725985-14725987"""
"""chr3""",9780229,9780231,41,0.244,0.029,9481678.0,10081061.0,1.0,330.0,21.0,20.0,"""A""","""E""",0.19,0.3,0.033,0.041,False,9780229,9780230,"""C""","[""T""]","""0""","""0""",False,1,"""chr3:9780229-9780231"""


In [29]:
# Hom ALT sites fall into two classes: either the site is CpG (creation) on both haplotypes, or not CpG on both haplotypes (destruction)
# CpG creation sites have methylation and could be imprinted; These should be included in scans for imprinting 
# CpG destruction sites do not have methylation and therefore are ascribed "None" for their methylation values 
# Including these sites in imprinting scans doesn't hurt: In Polars, the result of any arithmetic operation where one or both operands are null is always null

In [117]:
subset_cpgs_at_variants(
    DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL, 
    allele_1=1, 
    allele_2=1,
    num_SNVs_overlapping_CG=1
).sample(10, seed=42)

chrom,start_cpg,end_cpg,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,cpg_is_within_50bp_of_mismatch_site,start_variant,end_variant,REF,ALT,allele_1,allele_2,snv_phased,num_SNVs_overlapping_CG,locus_cpg
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool,i64,i64,str,list[str],str,str,bool,u32,str
"""chr3""",8137111,8137113,32.0,0.781,0.911,7516164.0,8181501.0,1.0,630.0,13.0,15.0,"""A""","""E""",0.692,0.867,0.885,0.866,False,8137112,8137113,"""A""","[""G""]","""1""","""1""",False,1,"""chr3:8137111-8137113"""
"""chr5""",49973538,49973540,22.0,0.955,0.948,,,,,,,,,,,,,False,49973539,49973540,"""A""","[""G""]","""1""","""1""",False,1,"""chr5:49973538-49973540"""
"""chr5""",26264297,26264299,50.0,0.58,0.727,,,,,,,,,,,,,False,26264298,26264299,"""A""","[""G"", ""T""]","""1""","""1""",False,1,"""chr5:26264297-26264299"""
"""chr13""",105734558,105734560,45.0,0.822,0.962,105549181.0,106313309.0,1.0,669.0,22.0,23.0,"""B""","""E""",0.727,0.913,0.961,0.958,False,105734559,105734560,"""T""","[""G""]","""1""","""1""",False,1,"""chr13:105734558-105734560"""
"""chr10""",2522382,2522384,,,,,,,,,,,,,,,,False,2522382,2522383,"""C""","[""G""]","""1""","""1""",False,1,"""chr10:2522382-2522384"""
"""chr2""",227308360,227308362,42.0,0.69,0.914,227118924.0,228651679.0,1.0,1765.0,26.0,16.0,"""A""","""E""",0.731,0.625,0.951,0.633,False,227308361,227308362,"""A""","[""G""]","""1""","""1""",False,1,"""chr2:227308360-227308362"""
"""chr5""",110065262,110065264,,,,,,,,,,,,,,,,False,110065263,110065264,"""G""","[""T""]","""1""","""1""",False,1,"""chr5:110065262-110065264"""
"""chr8""",25628140,25628142,,,,,,,,,,,,,,,,False,25628141,25628142,"""G""","[""A""]","""1""","""1""",False,1,"""chr8:25628140-25628142"""
"""chr1""",31443423,31443425,30.0,0.733,0.945,31384925.0,31840697.0,0.98806,335.0,13.0,17.0,"""B""","""K""",0.769,0.706,0.961,0.89,False,31443424,31443425,"""A""","[""G""]","""1""","""1""",False,1,"""chr1:31443423-31443425"""
"""chr4""",31670928,31670930,38.0,0.658,0.77,31666612.0,32192249.0,1.0,584.0,17.0,21.0,"""B""","""G""",0.588,0.714,0.654,0.807,False,31670928,31670929,"""T""","[""C""]","""1""","""1""",False,1,"""chr4:31670928-31670930"""


In [118]:
subset_cpgs_at_variants(
    DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL, 
    allele_1=2, 
    allele_2=2,
    num_SNVs_overlapping_CG=1
).sample(5, seed=42)

chrom,start_cpg,end_cpg,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,cpg_is_within_50bp_of_mismatch_site,start_variant,end_variant,REF,ALT,allele_1,allele_2,snv_phased,num_SNVs_overlapping_CG,locus_cpg
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool,i64,i64,str,list[str],str,str,bool,u32,str
"""chrX""",76600888,76600890,36.0,0.833,0.905,76548737.0,76756745.0,1.0,21.0,20.0,16.0,"""A""","""K""",0.9,0.75,0.924,0.839,False,76600888,76600889,"""T""","[""A"", ""C""]","""2""","""2""",False,1,"""chrX:76600888-76600890"""
"""chr4""",8538880,8538882,,,,,,,,,,,,,,,,False,8538880,8538881,"""C""","[""A"", ""T""]","""2""","""2""",False,1,"""chr4:8538880-8538882"""
"""chr5""",181451010,181451012,21.0,0.571,0.572,,,,,,,,,,,,,False,181451011,181451012,"""A""","[""T"", ""G""]","""2""","""2""",False,1,"""chr5:181451010-181451012"""
"""chr12""",116855004,116855006,22.0,0.636,0.738,116777542.0,117236310.0,0.987179,312.0,,10.0,"""A""","""E""",,0.7,,0.933,False,116855004,116855005,"""T""","[""A"", ""C""]","""2""","""2""",False,1,"""chr12:116855004-116855006"""
"""chr10""",106257370,106257372,,,,,,,,,,,,,,,,False,106257371,106257372,"""G""","[""A"", ""T""]","""2""","""2""",False,1,"""chr10:106257370-106257372"""


### Heterozygous CpG sites should be excluded from imprinting scans

In [119]:
# [ALT=1] cpg sites harboring an ALT allele on one haplotype, create or destroy a cpg site on that haplotype
# This could potentially lead to false calls of imprinting (subject to depth constraints), and therefore such sites should be excluded in scans for imprinting 

subset_cpgs_at_variants(
    DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL, 
    allele_1=0, 
    allele_2=1,
    num_SNVs_overlapping_CG=1
).sample(5, seed=42)

chrom,start_cpg,end_cpg,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,cpg_is_within_50bp_of_mismatch_site,start_variant,end_variant,REF,ALT,allele_1,allele_2,snv_phased,num_SNVs_overlapping_CG,locus_cpg
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool,i64,i64,str,list[str],str,str,bool,u32,str
"""chr3""",34416663,34416665,48,0.458,0.502,34361702,34844223,1.0,376,26.0,21,"""A""","""E""",0.846,0.0,0.943,0.052,False,34416663,34416664,"""T""","[""C""]","""0""","""1""",True,1,"""chr3:34416663-34416665"""
"""chr5""",112255385,112255387,47,0.362,0.488,111490587,112717908,1.0,1504,24.0,23,"""A""","""G""",0.0,0.739,0.045,0.936,False,112255386,112255387,"""G""","[""A""]","""0""","""1""",True,1,"""chr5:112255385-112255387"""
"""chr5""",78221537,78221539,34,0.206,0.405,77994687,79044915,1.0,922,,25,"""A""","""G""",,0.0,,0.063,False,78221537,78221538,"""T""","[""C""]","""0""","""1""",True,1,"""chr5:78221537-78221539"""
"""chr14""",56653503,56653505,34,0.471,0.522,56163814,56835658,1.0,987,23.0,11,"""A""","""I""",0.696,0.0,0.924,0.06,False,56653504,56653505,"""G""","[""A""]","""0""","""1""",True,1,"""chr14:56653503-56653505"""
"""chr10""",2384531,2384533,30,0.3,0.546,1927294,3354263,1.0,1771,17.0,13,"""A""","""K""",0.0,0.692,0.059,0.87,False,2384532,2384533,"""A""","[""G""]","""0""","""1""",True,1,"""chr10:2384531-2384533"""


In [120]:
# [ALT=2] cpg sites harboring an ALT allele on one haplotype, create or destroy a cpg site on that haplotype
# This could potentially lead to false calls of imprinting (subject to depth constraints), and therefore such sites should be excluded in scans for imprinting 

subset_cpgs_at_variants(
    DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL, 
    allele_1=0, 
    allele_2=2,
    num_SNVs_overlapping_CG=1
).sample(5, seed=42)

chrom,start_cpg,end_cpg,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,cpg_is_within_50bp_of_mismatch_site,start_variant,end_variant,REF,ALT,allele_1,allele_2,snv_phased,num_SNVs_overlapping_CG,locus_cpg
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool,i64,i64,str,list[str],str,str,bool,u32,str
"""chr15""",101753936,101753938,27,0.407,0.48,101107421,101881930,1.0,1006,15,12.0,"""A""","""I""",0.533,0.25,0.568,0.325,False,101753936,101753937,"""G""","[""A"", ""C""]","""0""","""2""",False,1,"""chr15:101753936-101753938"""
"""chr10""",98714246,98714248,20,0.25,0.488,98321410,99965381,1.0,1443,10,10.0,"""B""","""E""",0.0,0.5,0.084,0.847,False,98714246,98714247,"""A""","[""T"", ""C""]","""0""","""2""",True,1,"""chr10:98714246-98714248"""
"""chr3""",195783859,195783861,14,0.286,0.56,195598494,196879669,0.968792,1474,10,,"""B""","""I""",0.1,,0.404,,False,195783860,195783861,"""G""","[""A"", ""C""]","""0""","""2""",False,1,"""chr3:195783859-195783861"""
"""chr1""",5602826,5602828,41,0.293,0.411,5097819,6082247,1.0,763,22,19.0,"""B""","""I""",0.545,0.0,0.508,0.049,False,5602826,5602827,"""C""","[""G"", ""A""]","""0""","""2""",True,1,"""chr1:5602826-5602828"""
"""chr4""",178490537,178490539,36,0.417,0.52,178475830,178799948,1.0,540,16,20.0,"""B""","""K""",0.938,0.0,0.939,0.045,False,178490538,178490539,"""G""","[""T"", ""A""]","""0""","""2""",True,1,"""chr4:178490537-178490539"""


In [121]:
# [ALT=1,2]
# Either site is CpG in reference or not 
# If YES, then it is not CpG in both haplotypes, and therefore ascribed None as methylation
# Such sites cannot be imprinted, and therefore may be excluded from imprinting scans
# If NO, then it must be CpG in one haplotype, and not in the other other
# The corresponding methylation levels will be a FLOAT > 0 and FLOAT = 0.0, yielding a false imprinting call 
# Therefore such sites must be excluded from imprinting scans

subset_cpgs_at_variants(
    DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL, 
    allele_1=1, 
    allele_2=2,
    num_SNVs_overlapping_CG=1
).sample(10, seed=42)

chrom,start_cpg,end_cpg,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,cpg_is_within_50bp_of_mismatch_site,start_variant,end_variant,REF,ALT,allele_1,allele_2,snv_phased,num_SNVs_overlapping_CG,locus_cpg
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool,i64,i64,str,list[str],str,str,bool,u32,str
"""chr3""",137735834,137735836,39.0,0.385,0.542,137630747.0,137890915.0,1.0,169.0,19.0,20.0,"""B""","""E""",0.0,0.75,0.056,0.926,False,137735834,137735835,"""A""","[""C"", ""T""]","""1""","""2""",True,1,"""chr3:137735834-137735836"""
"""chr6""",170087956,170087958,30.0,0.267,0.031,169969426.0,170305792.0,1.0,387.0,11.0,19.0,"""B""","""G""",0.0,0.421,0.02,0.096,False,170087956,170087957,"""G""","[""T"", ""C""]","""1""","""2""",True,1,"""chr6:170087956-170087958"""
"""chr6""",137643791,137643793,33.0,0.455,0.563,137638097.0,138215928.0,1.0,503.0,15.0,18.0,"""A""","""G""",0.0,0.833,0.065,0.934,False,137643792,137643793,"""T""","[""G"", ""A""]","""1""","""2""",True,1,"""chr6:137643791-137643793"""
"""chr15""",92278096,92278098,,,,,,,,,,,,,,,,False,92278097,92278098,"""G""","[""C"", ""A""]","""1""","""2""",True,1,"""chr15:92278096-92278098"""
"""chr11""",82315631,82315633,34.0,0.118,0.108,82272616.0,82315633.0,1.0,2.0,14.0,20.0,"""B""","""E""",0.286,0.0,0.353,0.057,False,82315632,82315633,"""A""","[""G"", ""T""]","""1""","""2""",True,1,"""chr11:82315631-82315633"""
"""chr7""",45226941,45226943,30.0,0.333,0.498,44547000.0,45391684.0,1.0,642.0,10.0,20.0,"""A""","""G""",1.0,0.0,0.96,0.045,False,45226941,45226942,"""A""","[""T"", ""C""]","""1""","""2""",True,1,"""chr7:45226941-45226943"""
"""chr10""",43528671,43528673,34.0,0.324,0.477,43301344.0,43729972.0,1.0,410.0,15.0,19.0,"""A""","""E""",0.733,0.0,0.95,0.064,False,43528671,43528672,"""T""","[""C"", ""A""]","""1""","""2""",True,1,"""chr10:43528671-43528673"""
"""chr1""",45419921,45419923,32.0,0.25,0.395,44559606.0,46202920.0,1.0,1566.0,16.0,16.0,"""B""","""I""",0.5,0.0,0.801,0.057,False,45419921,45419922,"""A""","[""C"", ""T""]","""1""","""2""",True,1,"""chr1:45419921-45419923"""
"""chr5""",5286713,5286715,33.0,0.485,0.526,4992392.0,5434493.0,1.0,441.0,17.0,16.0,"""A""","""G""",0.0,1.0,0.059,0.953,False,5286714,5286715,"""A""","[""G"", ""T""]","""1""","""2""",True,1,"""chr5:5286713-5286715"""
"""chr5""",49975686,49975688,,,,,,,,,,,,,,,,False,49975687,49975688,"""G""","[""A"", ""C""]","""1""","""2""",False,1,"""chr5:49975686-49975688"""


## CpG sites that each overlap 2 SNVs: When scanning for imprinting, exclude these sites if at least one of the SNVs is heterozygous 

### There are very few CpG sites that overlap 2 SNVs 

In [122]:
# CpG sites that overlap 2 SNVs are rare among the full set of CpG sites in the reference (and sample) genome:
print(f"number of CpGs that overlap 2 SNVs: {len(DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL.filter((pl.col('num_SNVs_overlapping_CG') == 2)))}")

number of CpGs that overlap 2 SNVs: 57164


In [123]:
# Since CpG sites are 2 bases long, it is impossible for a CpG site to overlap more than 2 SNVs: 
print(f"number of CpGs that overlap more than 2 SNVs: {len(DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL.filter((pl.col('num_SNVs_overlapping_CG') > 2)))}")

number of CpGs that overlap more than 2 SNVs: 0


### CpG sites in which at least one of the 2 overlapping SNVs is heterozygous should be excluded prior to scanning for imprinting

In [127]:
# LOGIC: 
# 1. If the two SNVs are each homozygous, then the haplotypes are the same (either CpG or not), and therefore it is impossible to generate a false example of imprinting. 
# 2. If one SNV is homozygous and the other heterozygous, then the haplotypes are different. 
#   i. If one of those haplotypes is CpG, false imprinting is possible. 
#   ii. It neither haplotye is CpG, then throwing it out doesn't matter. 
# 3. If both SNVs are heterozygous, then there are two ways that the ALT alleles could segregate among the haplotypes: 
#   i. One ALT is on hap1; the second ALT is on hap2. 
#   ii. Both ALTs are on the same haplotype. 
#    Either way, the haplotypes are different, and we are back to case 2. 

def sample_cpgs_at_double_variants(df, sample_size, seed):
    number_variants = 2 
    coord_cols = ['chrom', 'start_cpg', 'end_cpg'] 

    # 1. Filter to find the valid rows
    # 2. Select ONLY the coord cols and get UNIQUE combinations
    # 3. Sample from those unique records
    sample_coordinates = (
        df
        .filter(pl.col('num_SNVs_overlapping_CG') == number_variants)
        .select(coord_cols)
        .unique()
        .sort(coord_cols)
        .sample(sample_size, seed=seed) 
    )

    # 4. Join the distinct coordinates back to the full dataframe
    result = df.join(
        sample_coordinates, 
        on=coord_cols, 
        how='inner'
    ).sort(coord_cols)

    pl.Config.set_tbl_rows(number_variants*sample_size)

    return add_locus_cpg(result)

# Visual inspection of many loci, including the following, in IGV confirmed the LOGIC presented above
sample_cpgs_at_double_variants(DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL, sample_size=10, seed=39)

chrom,start_cpg,end_cpg,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,cpg_is_within_50bp_of_mismatch_site,start_variant,end_variant,REF,ALT,allele_1,allele_2,snv_phased,num_SNVs_overlapping_CG,locus_cpg
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool,i64,i64,str,list[str],str,str,bool,u32,str
"""chr1""",183998410,183998412,21,0.762,0.951,,,,,,,,,,,,,False,183998410,183998411,"""C""","[""T""]","""0""","""0""",False,2,"""chr1:183998410-183998412"""
"""chr1""",183998410,183998412,21,0.762,0.951,,,,,,,,,,,,,False,183998411,183998412,"""G""","[""A""]","""0""","""0""",False,2,"""chr1:183998410-183998412"""
"""chr12""",40614749,40614751,39,0.513,0.456,39760953.0,41776488.0,1.0,2726.0,17.0,22.0,"""A""","""G""",0.412,0.591,0.418,0.51,False,40614749,40614750,"""C""","[""T""]","""0""","""0""",False,2,"""chr12:40614749-40614751"""
"""chr12""",40614749,40614751,39,0.513,0.456,39760953.0,41776488.0,1.0,2726.0,17.0,22.0,"""A""","""G""",0.412,0.591,0.418,0.51,False,40614750,40614751,"""G""","[""A""]","""0""","""0""",False,2,"""chr12:40614749-40614751"""
"""chr16""",11785704,11785706,36,0.861,0.971,,,,,,,,,,,,,False,11785704,11785705,"""C""","[""T""]","""0""","""0""",False,2,"""chr16:11785704-11785706"""
"""chr16""",11785704,11785706,36,0.861,0.971,,,,,,,,,,,,,False,11785705,11785706,"""G""","[""A""]","""0""","""0""",False,2,"""chr16:11785704-11785706"""
"""chr17""",70074391,70074393,38,0.395,0.512,69819388.0,70104847.0,1.0,171.0,19.0,19.0,"""A""","""E""",0.0,0.789,0.05,0.955,False,70074391,70074392,"""C""","[""T""]","""0""","""0""",False,2,"""chr17:70074391-70074393"""
"""chr17""",70074391,70074393,38,0.395,0.512,69819388.0,70104847.0,1.0,171.0,19.0,19.0,"""A""","""E""",0.0,0.789,0.05,0.955,False,70074392,70074393,"""A""","[""G""]","""1""","""0""",True,2,"""chr17:70074391-70074393"""
"""chr22""",10764336,10764338,79,0.241,0.342,10743856.0,10784638.0,1.0,64.0,17.0,22.0,"""A""","""I""",0.0,0.318,0.036,0.691,False,10764336,10764337,"""C""","[""T""]","""0""","""0""",False,2,"""chr22:10764336-10764338"""
"""chr22""",10764336,10764338,79,0.241,0.342,10743856.0,10784638.0,1.0,64.0,17.0,22.0,"""A""","""I""",0.0,0.318,0.036,0.691,False,10764337,10764338,"""A""","[""G""]","""0""","""0""",False,2,"""chr22:10764336-10764338"""


In [None]:
# the identity of the ALT allele doesn't matter to the logic above: 
subset_cpgs_at_variants(
    DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL, 
    allele_1=0, 
    allele_2=2,
    num_SNVs_overlapping_CG=2
)
# Note that those CpGs that have non-zero count-based methylation on both haplotypes should probably have zero methylation on one of them (based on IGV inspection)

chrom,start_cpg,end_cpg,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,cpg_is_within_50bp_of_mismatch_site,start_variant,end_variant,REF,ALT,allele_1,allele_2,snv_phased,num_SNVs_overlapping_CG,locus_cpg
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool,i64,i64,str,list[str],str,str,bool,u32,str
"""chr1""",34187773,34187775,48,0.25,0.392,33202440,34366901,1.0,1244,23,25,"""B""","""K""",0.522,0.0,0.59,0.048,false,34187774,34187775,"""C""","[""A"", ""G""]","""0""","""2""",true,2,"""chr1:34187773-34187775"""
"""chr3""",195713096,195713098,52,0.538,0.752,195598494,196879669,0.968792,1474,28,24,"""B""","""I""",0.429,0.667,0.666,0.833,false,195713097,195713098,"""G""","[""C"", ""A""]","""0""","""2""",true,2,"""chr3:195713096-195713098"""
"""chr3""",195714190,195714192,52,0.327,0.534,195598494,196879669,0.968792,1474,28,24,"""B""","""I""",0.214,0.458,0.557,0.511,false,195714190,195714191,"""C""","[""A"", ""G""]","""0""","""2""",true,2,"""chr3:195714190-195714192"""
"""chr5""",149655218,149655220,57,0.228,0.522,149626241,149690396,1.0,51,29,18,"""B""","""G""",0.0,0.722,0.142,0.849,false,149655219,149655220,"""C""","[""A"", ""G""]","""0""","""2""",true,2,"""chr5:149655218-149655220"""
"""chr6""",31340849,31340851,27,0.556,0.74,30317747,31517324,1.0,4447,18,,"""B""","""G""",0.833,,0.864,,false,31340850,31340851,"""G""","[""A"", ""C""]","""0""","""2""",true,2,"""chr6:31340849-31340851"""
"""chr6""",31356246,31356248,,,,,,,,,,,,,,,,false,31356247,31356248,"""G""","[""C"", ""A""]","""0""","""2""",true,2,"""chr6:31356246-31356248"""
"""chr6""",32540277,32540279,22,0.409,0.613,31645794,32555831,0.730584,1867,12,10,"""B""","""G""",0.0,0.9,0.038,0.963,true,32540278,32540279,"""G""","[""C"", ""T""]","""0""","""2""",true,2,"""chr6:32540277-32540279"""
"""chr6""",32663300,32663302,43,0.419,0.765,32583991,33246527,1.0,4619,19,24,"""B""","""G""",0.0,0.75,0.064,0.882,false,32663300,32663301,"""G""","[""T"", ""C""]","""0""","""2""",true,2,"""chr6:32663300-32663302"""
"""chr6""",32664302,32664304,41,0.341,0.44,32583991,33246527,1.0,4619,19,22,"""B""","""G""",0.737,0.0,0.817,0.055,false,32664302,32664303,"""C""","[""T"", ""G""]","""0""","""2""",true,2,"""chr6:32664302-32664304"""
"""chr6""",32672078,32672080,48,0.208,0.471,32583991,33246527,1.0,4619,19,29,"""B""","""G""",0.526,0.0,0.641,0.064,false,32672078,32672079,"""C""","[""T"", ""A""]","""0""","""2""",true,2,"""chr6:32672078-32672080"""


## Label each unique CpG record with a flag indicating whether it is allele-specific

In [102]:
importlib.reload(expand_to_all_cpgs)
from expand_to_all_cpgs import label_cpgs_as_allele_specific

DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_ALLELE_SPECIFIC_FLAG = label_cpgs_as_allele_specific(DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL) 
DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_ALLELE_SPECIFIC_FLAG.filter(
    pl.col('cpg_overlaps_at_least_one_snv') > 0
)

chrom,start_cpg,end_cpg,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,cpg_is_within_50bp_of_mismatch_site,cpg_overlaps_at_least_one_snv,snv_genotypes,cpg_is_allele_specific
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool,bool,str,bool
"""chr1""",10622,10624,15,0.8,0.919,,,,,,,,,,,,,false,true,"""hom""",false
"""chr1""",10748,10750,15,0.6,0.534,,,,,,,,,,,,,false,true,"""hom""",false
"""chr1""",10925,10927,14,0.786,0.949,,,,,,,,,,,,,false,true,"""hom""",false
"""chr1""",10930,10932,,,,,,,,,,,,,,,,false,true,"""hom""",false
"""chr1""",10933,10935,,,,,,,,,,,,,,,,false,true,"""hom""",false
"""chr1""",11001,11003,15,0.667,0.721,,,,,,,,,,,,,false,true,"""hom""",false
"""chr1""",11112,11114,15,0.733,0.905,,,,,,,,,,,,,false,true,"""hom""",false
"""chr1""",11153,11155,15,0.8,0.934,,,,,,,,,,,,,false,true,"""hom""",false
"""chr1""",11166,11168,,,,,,,,,,,,,,,,false,true,"""hom""",false
"""chr1""",11407,11409,15,0.933,0.925,,,,,,,,,,,,,false,true,"""hom""",false


### Sanity checking 

In [113]:
# CGs that overlap 1 SNV that is het indeed have zero methylation on one haplotype, and can be flagged for exclusion in imprinting scans: 
DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_ALLELE_SPECIFIC_FLAG.filter(pl.col("snv_genotypes") == "het").sample(10, seed=42)

chrom,start_cpg,end_cpg,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,cpg_is_within_50bp_of_mismatch_site,cpg_overlaps_at_least_one_snv,snv_genotypes,cpg_is_allele_specific
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool,bool,str,bool
"""chr11""",77099928,77099930,20,0.3,0.388,76043576.0,77560644.0,1.0,1250.0,10.0,10.0,"""B""","""E""",0.0,0.6,0.056,0.788,False,True,"""het""",True
"""chr15""",33587102,33587104,21,0.143,0.297,33236226.0,33844930.0,1.0,921.0,,14.0,"""B""","""G""",,0.0,,0.057,False,True,"""het""",True
"""chr20""",63937230,63937232,31,0.419,0.513,62686578.0,64293542.0,0.725935,1631.0,13.0,18.0,"""A""","""G""",1.0,0.0,0.962,0.054,False,True,"""het""",True
"""chr11""",54406640,54406642,78,0.167,0.116,,,,,,,,,,,,,False,True,"""het""",True
"""chr16""",1231632,1231634,17,0.235,0.331,954061.0,1626910.0,1.0,746.0,11.0,,"""B""","""G""",0.0,,0.116,,False,True,"""het""",True
"""chr1""",32596743,32596745,28,0.321,0.514,32405534.0,33175001.0,1.0,556.0,18.0,10.0,"""B""","""K""",0.0,0.9,0.061,0.961,False,True,"""het""",True
"""chr13""",72599365,72599367,40,0.45,0.539,71829043.0,73312576.0,0.997899,1428.0,20.0,20.0,"""B""","""E""",0.0,0.9,0.059,0.956,False,True,"""het""",True
"""chr10""",41342333,41342335,33,0.697,0.826,,,,,,,,,,,,,False,True,"""het""",True
"""chr6""",36509900,36509902,26,0.423,0.547,36262285.0,37059181.0,1.0,887.0,13.0,13.0,"""B""","""G""",0.0,0.846,0.058,0.953,False,True,"""het""",True
"""chr21""",46081829,46081831,38,0.5,0.594,45495607.0,46689414.0,0.929038,1071.0,23.0,15.0,"""A""","""G""",0.826,0.0,0.948,0.057,False,True,"""het""",True


In [110]:
# CGs that overlap 2 SNVs, the first of which is het, indeed have zero methylation on one haplotype, and can be flagged for exclusion in imprinting scans: 
DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_ALLELE_SPECIFIC_FLAG.filter(pl.col("snv_genotypes").str.contains("het,")).sample(10, seed=42)

chrom,start_cpg,end_cpg,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,cpg_is_within_50bp_of_mismatch_site,cpg_overlaps_at_least_one_snv,snv_genotypes,cpg_is_allele_specific
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool,bool,str,bool
"""chr11""",116806653,116806655,40.0,0.45,0.517,116383126.0,117881278.0,1.0,1568.0,21.0,19.0,"""A""","""K""",0.857,0.0,0.959,0.055,False,True,"""het,hom""",True
"""chr15""",64833389,64833391,33.0,0.485,0.529,64137819.0,64874657.0,1.0,99.0,16.0,17.0,"""B""","""E""",0.0,0.941,0.052,0.958,False,True,"""het,hom""",True
"""chr15""",26781662,26781664,39.0,0.513,0.528,26732564.0,26874104.0,1.0,135.0,17.0,22.0,"""B""","""G""",0.0,0.909,0.051,0.951,False,True,"""het,hom""",True
"""chr5""",170058970,170058972,45.0,0.378,0.527,169532595.0,170465741.0,1.0,762.0,21.0,24.0,"""B""","""G""",0.0,0.708,0.051,0.931,False,True,"""het,hom""",True
"""chr21""",17807958,17807960,,,,,,,,,,,,,,,,True,True,"""het,het""",True
"""chr11""",80913914,80913916,38.0,0.526,0.618,80636411.0,81530255.0,0.704475,961.0,23.0,15.0,"""B""","""E""",0.87,0.0,0.939,0.07,False,True,"""het,het""",True
"""chr16""",8583639,8583641,42.0,0.476,0.505,8049954.0,9009074.0,0.992962,1705.0,26.0,16.0,"""B""","""E""",0.769,0.0,0.94,0.046,False,True,"""het,hom""",True
"""chr2""",63978779,63978781,,,,,,,,,,,,,,,,False,True,"""het,het""",True
"""chr1""",18873861,18873863,,,,,,,,,,,,,,,,False,True,"""het,het""",True
"""chr13""",35577599,35577601,38.0,0.395,0.527,35445782.0,35889991.0,1.0,539.0,19.0,19.0,"""B""","""E""",0.789,0.0,0.959,0.058,False,True,"""het,hom""",True


In [112]:
# CGs that overlap 2 SNVs, the second of which is het, indeed have zero methylation on one haplotype, and can be flagged for exclusion in imprinting scans: 
DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_ALLELE_SPECIFIC_FLAG.filter(pl.col("snv_genotypes").str.contains(",het")).sample(10, seed=42)

chrom,start_cpg,end_cpg,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,cpg_is_within_50bp_of_mismatch_site,cpg_overlaps_at_least_one_snv,snv_genotypes,cpg_is_allele_specific
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool,bool,str,bool
"""chr11""",133643987,133643989,42,0.429,0.604,133427993,133917821,1.0,523,15.0,17,"""A""","""K""",0.8,0.0,0.935,0.043,False,True,"""hom,het""",True
"""chr15""",93356653,93356655,36,0.389,0.515,92738748,93483598,1.0,1015,18.0,18,"""B""","""I""",0.0,0.778,0.06,0.958,False,True,"""hom,het""",True
"""chr15""",55209351,55209353,37,0.378,0.699,55169940,55576760,1.0,441,17.0,20,"""B""","""G""",0.824,0.0,0.956,0.083,False,True,"""hom,het""",True
"""chr5""",165043734,165043736,35,0.486,0.729,164979390,165524681,1.0,640,15.0,20,"""B""","""G""",0.0,0.85,0.112,0.938,False,True,"""hom,het""",True
"""chr21""",27043995,27043997,39,0.462,0.524,27018003,27196946,1.0,195,21.0,18,"""A""","""I""",0.857,0.0,0.935,0.049,False,True,"""het,het""",True
"""chr11""",99617118,99617120,41,0.22,0.292,99428338,101734380,1.0,2952,19.0,22,"""B""","""E""",0.474,0.0,0.555,0.049,False,True,"""hom,het""",True
"""chr16""",25871823,25871825,45,0.444,0.513,24938773,25901584,1.0,1224,24.0,21,"""B""","""E""",0.833,0.0,0.941,0.061,False,True,"""het,het""",True
"""chr2""",102222995,102222997,39,0.359,0.461,101445111,102671355,1.0,1297,19.0,20,"""B""","""I""",0.737,0.0,0.947,0.055,False,True,"""hom,het""",True
"""chr1""",23880468,23880470,27,0.259,0.505,23110629,23930386,1.0,321,,19,"""B""","""K""",,0.0,,0.057,False,True,"""het,het""",True
"""chr13""",53412001,53412003,36,0.361,0.45,52563299,53516934,1.0,970,21.0,15,"""B""","""E""",0.619,0.0,0.695,0.04,False,True,"""hom,het""",True
