## Setup 

In [1]:
import importlib
from pathlib import Path 
import sys
import polars as pl 

OUTPUT_DIR = f"/scratch/ucgd/lustre-labs/quinlan/data-shared/dna-methylation/CEPH1463.GRCh38.hifi.founder-phased.all-cpgs" 
BED_ALL_CPGS_IN_REFERENCE = f"{OUTPUT_DIR}/all_cpg_sites_in_reference.bed" # output of src/write_all_cpgs.py
METH_FOUNDER_PHASED_DIR = f"/scratch/ucgd/lustre-labs/quinlan/data-shared/dna-methylation/CEPH1463.GRCh38.hifi.founder-phased" # output dir of phase_meth_to_founder_haps.py
METH_COUNT_READ_PHASED_DIR = Path(f"/scratch/ucgd/lustre-labs/quinlan/data-shared/dna-methylation/CEPH1463.GRCh38.hifi.count.read-backed-phased") # output dir of aligned_bam_to_cpg_scores (containing count-based unphased meth)
METH_MODEL_READ_PHASED_DIR = Path(f"/scratch/ucgd/lustre-labs/quinlan/data-shared/dna-methylation/CEPH1463.GRCh38.hifi.model.read-backed-phased") # output dir of aligned_bam_to_cpg_scores (containing model-based unphased meth)
UID = '200081' # much of the interpretation in this notebook is specific to this sample
BED_METH_FOUNDER_PHASED = f"{METH_FOUNDER_PHASED_DIR}/{UID}.dna-methylation.founder-phased.bed" # bed file of founder-phased methylation levels from src/phase_meth_to_founder_haps.py
BED_METH_COUNT_UNPHASED = Path(f"{METH_COUNT_READ_PHASED_DIR}/{UID}.GRCh38.haplotagged.combined.bed.gz") # bed file from aligned_bam_to_cpg_scores (unphased count-based meth)
BED_METH_MODEL_UNPHASED = Path(f"{METH_MODEL_READ_PHASED_DIR}/{UID}.GRCh38.haplotagged.combined.bed.gz") # bed file from aligned_bam_to_cpg_scores (unphased model-based meth)
BED_METH_FOUNDER_PHASED_ALL_CPGS = f"{OUTPUT_DIR}/{UID}.dna-methylation.founder-phased.all_cpgs.bed"
BED_HET_SITE_MISMATCHES = f"{METH_FOUNDER_PHASED_DIR}/{UID}.bit-vector-sites-mismatches.bed" # bed file of heterozygous sites at which bit-vectors are mismatched, from src/phase_meth_to_founder_haps.py

IHT_PHASED_DIR = Path('/scratch/ucgd/lustre-labs/quinlan/data-shared/haplotype-maps/CEPH1463.GRCh38')
VCF_IHT_PHASED = f"{IHT_PHASED_DIR}/CEPH1463.GRCh38.pass.sorted.vcf.gz" # joint-called multi-sample vcf from gtg-ped-map/gtg-concordance

REPO_DIR = Path('/scratch/ucgd/lustre-labs/quinlan/u6018199/tapestry')
sys.path.append(f"{REPO_DIR}/src/util") 

## Get all CpG sites in reference genome

In [2]:
import expand_to_all_cpgs
importlib.reload(expand_to_all_cpgs)
from expand_to_all_cpgs import read_all_cpgs_in_reference

DF_ALL_CPGS_IN_REFERENCE = read_all_cpgs_in_reference(BED_ALL_CPGS_IN_REFERENCE)
DF_ALL_CPGS_IN_REFERENCE

chrom,start,end
str,i64,i64
"""chr1""",10468,10469
"""chr1""",10470,10471
"""chr1""",10483,10484
"""chr1""",10488,10489
"""chr1""",10492,10493
…,…,…
"""chrY""",56887220,56887221
"""chrY""",56887399,56887400
"""chrY""",56887579,56887580
"""chrY""",56887581,56887582


## Read in unphased DNA methylation at CpG sites, both those in the reference genome, and those present in the sample but not in the reference genome

In [3]:
importlib.reload(expand_to_all_cpgs)
from expand_to_all_cpgs import read_meth_unphased

DF_METH_UNPHASED = read_meth_unphased(BED_METH_COUNT_UNPHASED, BED_METH_MODEL_UNPHASED) 
DF_METH_UNPHASED

chrom,start,end,total_read_count_count,methylation_level_count,total_read_count_model,methylation_level_model
str,i64,i64,i64,f64,i64,f64
"""chr1""",10468,10469,13,0.769,13,0.868
"""chr1""",10470,10471,13,0.769,13,0.878
"""chr1""",10483,10484,13,0.923,13,0.929
"""chr1""",10488,10489,14,1.0,14,0.95
"""chr1""",10492,10493,13,1.0,13,0.964
…,…,…,…,…,…,…
"""chrY""",56887220,56887221,38,0.947,38,0.962
"""chrY""",56887399,56887400,38,0.737,38,0.942
"""chrY""",56887579,56887580,38,0.895,38,0.945
"""chrY""",56887592,56887593,38,0.579,38,0.721


## Methylation levels are computed at CpG sites observed in the sample (which may or may not be in the reference)

The default value of the `--modsites-mode` argument of `aligned_bam_to_cpg_scores` is `denovo`, meaning that DNA methylation levels are computed at all CG sites in the sample's haplotypes: 

https://github.com/PacificBiosciences/pb-CpG-tools?tab=readme-ov-file#output-modes-and-option-details

https://quinlangroup.slack.com/archives/C0803TM7X0X/p1759348751929209

https://quinlangroup.slack.com/archives/C0803TM7X0X/p1759349045861589

## CpG site creation: Sites that are CpG in at least one haplotype of the sample, but not CpG in the reference sequence

In [4]:
# IGV snapshots: 
# https://quinlangroup.slack.com/archives/C0803TM7X0X/p1759879585412219 
# https://quinlangroup.slack.com/archives/C0803TM7X0X/p1759880211882149
# https://quinlangroup.slack.com/archives/C0803TM7X0X/p1759880434142149 
# https://quinlangroup.slack.com/archives/C0803TM7X0X/p1759880669955469

# These sites also appear as variants in /scratch/ucgd/lustre-labs/quinlan/data-shared/read-backed-phasing/200081.GRCh38.deepvariant.glnexus.phased.vcf.gz
# e.g., 
# $ tabix 200081.GRCh38.deepvariant.glnexus.phased.vcf.gz chr1:10623-10623 
# chr1    10623   chr1_10623_T_C  T       C       36      .       AF=1;AQ=36      GT:DP:AD:GQ:PL:RNC      1/1:23:0,23:22:33,22,0:..

DF_METH_UNPHASED.join(DF_ALL_CPGS_IN_REFERENCE, on=['chrom', 'start', 'end'], how='anti')

chrom,start,end,total_read_count_count,methylation_level_count,total_read_count_model,methylation_level_model
str,i64,i64,i64,f64,i64,f64
"""chr1""",10622,10623,15,0.8,15,0.919
"""chr1""",10804,10805,13,0.692,13,0.708
"""chr1""",10820,10821,13,0.615,13,0.778
"""chr1""",10828,10829,14,0.571,14,0.652
"""chr1""",10925,10926,14,0.786,14,0.949
…,…,…,…,…,…,…
"""chrY""",56885796,56885797,41,0.854,41,0.953
"""chrY""",56885831,56885832,41,0.805,41,0.948
"""chrY""",56885915,56885916,41,0.732,41,0.9
"""chrY""",56886309,56886310,41,0.854,41,0.927


## Sites that are CpG in the reference, but at which unphased DNA methylation is not reported

These sites fall into two classes: 

1. CpG site destruction: A variant destroyed the CpG (relative to the reference sequence), e.g., https://quinlangroup.slack.com/archives/C0803TM7X0X/p1759955795431799 We see these variants in the VCF too. 
2. The site is CpG in the sample, but read coverage was too low to report a reliable estimate of DNA methylation 

In [5]:
DF_ALL_CPGS_IN_REFERENCE.join(DF_METH_UNPHASED, on=['chrom', 'start', 'end'], how='anti')

chrom,start,end
str,i64,i64
"""chr1""",10930,10931
"""chr1""",10933,10934
"""chr1""",11166,11167
"""chr1""",12781,12782
"""chr1""",13301,13302
…,…,…
"""chrY""",56884829,56884830
"""chrY""",56885859,56885860
"""chrY""",56886407,56886408
"""chrY""",56886943,56886944


## Read in founder-phased DNA methylation at CpG sites

In [6]:
importlib.reload(expand_to_all_cpgs)
from expand_to_all_cpgs import read_meth_founder_phased

DF_METH_FOUNDER_PHASED = read_meth_founder_phased(BED_METH_FOUNDER_PHASED)
DF_METH_FOUNDER_PHASED

chrom,start,end,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model
str,i64,i64,i64,i64,f64,i64,f64,f64,str,str,f64,f64,f64,f64
"""chr1""",14061,14062,,,,,,,,,,,,
"""chr1""",14178,14179,,,,,,,,,,,,
"""chr1""",14348,14349,,,,,,,,,,,,
"""chr1""",14353,14354,,,,,,,,,,,,
"""chr1""",14434,14435,,,,,,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chrY""",56887220,56887221,,,,,,,,,,,,
"""chrY""",56887399,56887400,,,,,,,,,,,,
"""chrY""",56887579,56887580,,,,,,,,,,,,
"""chrY""",56887592,56887593,,,,,,,,,,,,


## Expand the dataframe of founder-phased methylation levels to include all CpG sites in reference and sample genome, and unphased methylation levels (where available)

In [7]:
importlib.reload(expand_to_all_cpgs)
from expand_to_all_cpgs import expand_meth_to_all_cpgs

DF_METH_FOUNDER_PHASED_ALL_CPGS = expand_meth_to_all_cpgs(DF_ALL_CPGS_IN_REFERENCE, DF_METH_UNPHASED, DF_METH_FOUNDER_PHASED)
DF_METH_FOUNDER_PHASED_ALL_CPGS

chrom,start,end,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,f64,f64,str,str,f64,f64,f64,f64
"""chr1""",10468,10469,13,0.769,0.868,,,,,,,,,,,,
"""chr1""",10470,10471,13,0.769,0.878,,,,,,,,,,,,
"""chr1""",10483,10484,13,0.923,0.929,,,,,,,,,,,,
"""chr1""",10488,10489,14,1.0,0.95,,,,,,,,,,,,
"""chr1""",10492,10493,13,1.0,0.964,,,,,,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chrY""",56887399,56887400,38,0.737,0.942,,,,,,,,,,,,
"""chrY""",56887579,56887580,38,0.895,0.945,,,,,,,,,,,,
"""chrY""",56887581,56887582,,,,,,,,,,,,,,,
"""chrY""",56887592,56887593,38,0.579,0.721,,,,,,,,,,,,


## Add proximity of each CpG site to heterozygous sites at which bit-vectors are mismatched 

In [8]:
importlib.reload(expand_to_all_cpgs)
from expand_to_all_cpgs import compute_proximity_to_mismatched_heterozygous_sites

DF_METH_FOUNDER_PHASED_ALL_CPGS = compute_proximity_to_mismatched_heterozygous_sites(DF_METH_FOUNDER_PHASED_ALL_CPGS, BED_HET_SITE_MISMATCHES)
DF_METH_FOUNDER_PHASED_ALL_CPGS

chrom,start,end,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,is_within_50bp_of_mismatch_site
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool
"""chr1""",10468,10469,13,0.769,0.868,,,,,,,,,,,,,false
"""chr1""",10470,10471,13,0.769,0.878,,,,,,,,,,,,,false
"""chr1""",10483,10484,13,0.923,0.929,,,,,,,,,,,,,false
"""chr1""",10488,10489,14,1.0,0.95,,,,,,,,,,,,,false
"""chr1""",10492,10493,13,1.0,0.964,,,,,,,,,,,,,false
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chrY""",56887399,56887400,38,0.737,0.942,,,,,,,,,,,,,
"""chrY""",56887579,56887580,38,0.895,0.945,,,,,,,,,,,,,
"""chrY""",56887581,56887582,,,,,,,,,,,,,,,,
"""chrY""",56887592,56887593,38,0.579,0.721,,,,,,,,,,,,,


## We don't phase methylation on chrM and chrY, and a note about gender

In [9]:
# we don't phase methylation on chrM (>2 copies per cell): 
DF_METH_FOUNDER_PHASED_ALL_CPGS.filter(pl.col('chrom') == 'chrM').filter(pl.col('start_hap_map_block').is_not_null())

chrom,start,end,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,is_within_50bp_of_mismatch_site
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool


In [10]:
# we don't phase methylation on chrY (<2 copies per cell): 
DF_METH_FOUNDER_PHASED_ALL_CPGS.filter(pl.col('chrom') == 'chrY').filter(pl.col('start_hap_map_block').is_not_null())

chrom,start,end,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,is_within_50bp_of_mismatch_site
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool


In [11]:
# since we don't phase methylation on chrM (>2 copies per cell) and chrY (<2 copies per cell), there are null values for "is_within_50bp_of_mismatch_site":
DF_METH_FOUNDER_PHASED_ALL_CPGS.filter(pl.col('is_within_50bp_of_mismatch_site').is_null())

chrom,start,end,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,is_within_50bp_of_mismatch_site
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool
"""chrM""",32,33,192,0.25,0.071,,,,,,,,,,,,,
"""chrM""",60,61,190,0.242,0.039,,,,,,,,,,,,,
"""chrM""",77,78,190,0.089,0.029,,,,,,,,,,,,,
"""chrM""",79,80,190,0.121,0.034,,,,,,,,,,,,,
"""chrM""",90,91,190,0.232,0.032,,,,,,,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chrY""",56887399,56887400,38,0.737,0.942,,,,,,,,,,,,,
"""chrY""",56887579,56887580,38,0.895,0.945,,,,,,,,,,,,,
"""chrY""",56887581,56887582,,,,,,,,,,,,,,,,
"""chrY""",56887592,56887593,38,0.579,0.721,,,,,,,,,,,,,


In [12]:
# total read count (about half of 30X) suggests just one copy of chrX, i.e., a male: 
DF_METH_FOUNDER_PHASED_ALL_CPGS.filter(pl.col('chrom') == 'chrX').filter(pl.col('total_read_count').is_not_null())

chrom,start,end,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,is_within_50bp_of_mismatch_site
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool
"""chrX""",25567,25568,10,1.0,0.962,,,,,,,,,,,,,false
"""chrX""",25575,25576,10,0.8,0.956,,,,,,,,,,,,,false
"""chrX""",25631,25632,10,0.7,0.732,,,,,,,,,,,,,false
"""chrX""",25638,25639,10,0.9,0.96,,,,,,,,,,,,,false
"""chrX""",25647,25648,10,1.0,0.969,,,,,,,,,,,,,false
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chrX""",156020271,156020272,11,0.727,0.866,,,,,,,,,,,,,false
"""chrX""",156020287,156020288,11,0.909,0.933,,,,,,,,,,,,,false
"""chrX""",156020376,156020377,10,0.8,0.885,,,,,,,,,,,,,false
"""chrX""",156030064,156030065,15,0.133,0.429,,,,,,,,,,,,,false


In [13]:
# this phasing is probably incorrect due to technical errors, since there is only one X in a male, and it must come from the mother (with the Y coming from the father), 
# whereas these data say that the X comes from the father:
DF_METH_FOUNDER_PHASED_ALL_CPGS.filter(pl.col('chrom') == 'chrX').filter(pl.col('start_hap_map_block').is_not_null())

chrom,start,end,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,is_within_50bp_of_mismatch_site
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool
"""chrX""",2241257,2241258,19,0.895,0.959,2240422,3036819,1.0,193,10,,"""A""","""G""",0.9,,0.962,,false
"""chrX""",2241263,2241264,19,0.632,0.652,2240422,3036819,1.0,193,10,,"""A""","""G""",0.7,,0.875,,false
"""chrX""",2241329,2241330,19,0.632,0.775,2240422,3036819,1.0,193,10,,"""A""","""G""",0.6,,0.872,,false
"""chrX""",2241404,2241405,19,0.421,0.27,2240422,3036819,1.0,193,10,,"""A""","""G""",0.4,,0.295,,false
"""chrX""",2241507,2241508,20,0.9,0.947,2240422,3036819,1.0,193,11,,"""A""","""G""",0.909,,0.951,,false
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chrX""",155658077,155658078,35,0.629,0.854,155616130,155659104,1.0,1,20,15,"""A""","""K""",0.75,0.467,0.956,0.491,false
"""chrX""",155658233,155658234,34,0.588,0.721,155616130,155659104,1.0,1,19,15,"""A""","""K""",0.737,0.4,0.915,0.205,false
"""chrX""",155658380,155658381,34,0.647,0.704,155616130,155659104,1.0,1,19,15,"""A""","""K""",0.737,0.533,0.854,0.574,false
"""chrX""",155658475,155658476,34,0.647,0.821,155616130,155659104,1.0,1,19,15,"""A""","""K""",0.737,0.533,0.939,0.638,false


In [14]:
# TODO: check phasing of methylation on chrX in a female sample (XX karyotype)

## Examples of CpG sites where phasing is partial, even though they are in hap-map blocks 

In [15]:
DF_METH_FOUNDER_PHASED_ALL_CPGS.filter(
    (pl.col('chrom') == 'chr1') &
    (pl.col('start') > 3665300) & 
    (pl.col('end') < 3665600)
)

chrom,start,end,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,is_within_50bp_of_mismatch_site
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool
"""chr1""",3665407,3665408,31,0.742,0.875,3399126,4207029,1.0,1114,21,10.0,"""B""","""I""",0.714,0.8,0.783,0.951,False
"""chr1""",3665514,3665515,29,0.69,0.878,3399126,4207029,1.0,1114,20,,"""B""","""I""",0.6,,0.799,,False
"""chr1""",3665526,3665527,29,0.69,0.833,3399126,4207029,1.0,1114,20,,"""B""","""I""",0.7,,0.774,,False
"""chr1""",3665561,3665562,30,0.433,0.487,3399126,4207029,1.0,1114,20,10.0,"""B""","""I""",0.45,0.4,0.53,0.478,False


In [16]:
DF_METH_FOUNDER_PHASED_ALL_CPGS.filter(
    (pl.col('chrom') == 'chr1') &
    (pl.col('start') > 36678000) & 
    (pl.col('end') < 36680000)
)

chrom,start,end,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,is_within_50bp_of_mismatch_site
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool
"""chr1""",36678211,36678212,23,0.696,0.837,36572121,36737884,1.0,179,11.0,12,"""B""","""K""",0.636,0.75,0.674,0.942,False
"""chr1""",36678518,36678519,23,0.522,0.832,36572121,36737884,1.0,179,11.0,12,"""B""","""K""",0.636,0.417,0.916,0.704,False
"""chr1""",36678538,36678539,23,0.348,0.43,36572121,36737884,1.0,179,11.0,12,"""B""","""K""",0.727,0.0,0.862,0.055,False
"""chr1""",36678689,36678690,22,0.773,0.924,36572121,36737884,1.0,179,10.0,12,"""B""","""K""",0.7,0.833,0.812,0.937,False
"""chr1""",36679081,36679082,20,0.8,0.947,36572121,36737884,1.0,179,,12,"""B""","""K""",,0.917,,0.966,False
"""chr1""",36679125,36679126,20,0.9,0.952,36572121,36737884,1.0,179,,12,"""B""","""K""",,0.917,,0.956,False
"""chr1""",36679168,36679169,20,0.75,0.907,36572121,36737884,1.0,179,,12,"""B""","""K""",,0.833,,0.957,False
"""chr1""",36679367,36679368,19,0.789,0.901,36572121,36737884,1.0,179,,12,"""B""","""K""",,0.833,,0.915,False
"""chr1""",36679693,36679694,19,0.684,0.865,36572121,36737884,1.0,179,,12,"""B""","""K""",,0.75,,0.89,False
"""chr1""",36679896,36679897,19,0.737,0.934,36572121,36737884,1.0,179,,12,"""B""","""K""",,0.667,,0.943,False


## QC Statistics 

In [17]:
importlib.reload(expand_to_all_cpgs)
from expand_to_all_cpgs import compute_fraction_of_cpgs_that_are_close_to_mismatches

compute_fraction_of_cpgs_that_are_close_to_mismatches(DF_METH_FOUNDER_PHASED_ALL_CPGS)

Percentage of CpG sites (in reference and sample genome, and on phasable chroms) that are within 50bp of a heterozygous mismatch site: 0.173%


In [18]:
importlib.reload(expand_to_all_cpgs)
from expand_to_all_cpgs import compute_fraction_of_cpgs_at_which_meth_is_phased_wrapper

compute_fraction_of_cpgs_at_which_meth_is_phased_wrapper(DF_METH_FOUNDER_PHASED_ALL_CPGS)

Percentage of CpG sites (in reference and sample genomes, and on phasable chroms) at which count-based methylation is phased to pat haplotype: 80.58%
Percentage of CpG sites (in reference and sample genomes, and on phasable chroms) at which count-based methylation is phased to mat haplotype: 80.53%
Percentage of CpG sites (in reference and sample genomes, and on phasable chroms) at which count-based methylation is phased to at least one parental haplotype: 84.25%
Percentage of CpG sites (in reference and sample genomes, and on phasable chroms) at which count-based methylation is phased to both parental haplotypes: 76.86%
Percentage of CpG sites (in reference and sample genomes, and on phasable chroms) at which count-based unphased methylation is reported: 97.45%
Percentage of CpG sites (in reference and sample genomes, and on phasable chroms) at which model-based methylation is phased to pat haplotype: 80.58%
Percentage of CpG sites (in reference and sample genomes, and on phasable chr

## Use joint-called SNVs to flag CpG sites that have been created or destroyed, e.g., for use in scanning the genome for imprinted loci across a pedigree

In [None]:
# Motivation: 
# slides: https://docs.google.com/presentation/d/11Pfax0wXh0E68C287lMaPoPvhq-OrFGxFOKE1gWOkDI/edit?slide=id.g39893c07c75_0_0#slide=id.g39893c07c75_0_0 
# slack thread: https://quinlangroup.slack.com/archives/C0803TM7X0X/p1762565840460019?thread_ts=1759348751.929209&cid=C0803TM7X0X 

In [19]:
importlib.reload(expand_to_all_cpgs)
from expand_to_all_cpgs import get_iht_phased_variants

DF_IHT_PHASED_VARIANTS = get_iht_phased_variants(UID, VCF_IHT_PHASED)
DF_IHT_PHASED_VARIANTS

chrom,start,end,REF,ALT,allele_pat,allele_mat
str,i64,i64,str,list[str],str,str
"""chr1""",13979,13980,"""T""","[""C""]","""1""","""1"""
"""chr1""",18848,18849,"""C""","[""G""]","""1""","""1"""
"""chr1""",26995,26996,"""A""","[""G""]","""1""","""1"""
"""chr1""",29442,29443,"""A""","[""G""]","""1""","""1"""
"""chr1""",33410,33411,"""A""","[""C""]","""1""","""1"""
…,…,…,…,…,…,…
"""chrX""",155697537,155697538,"""C""","[""T""]","""0""","""0"""
"""chrX""",155697630,155697631,"""G""","[""A""]","""0""","""0"""
"""chrX""",155697919,155697920,"""G""","[""A""]","""0""","""0"""
"""chrX""",155699750,155699751,"""C""","[""T""]","""0""","""0"""


In [20]:
# An example of CpG site creation 
# A site that is CpG in only one haplotype of the sample, and not CpG in the reference sequence

# IGV snapshot: 
# https://quinlangroup.slack.com/archives/C0803TM7X0X/p1759880434142149

In [21]:
# variant at this CpG site: 

DF_IHT_PHASED_VARIANTS.filter(
    (pl.col("chrom") == 'chr20') & 
    (pl.col("start") == 101340)
)

chrom,start,end,REF,ALT,allele_pat,allele_mat
str,i64,i64,str,list[str],str,str
"""chr20""",101340,101341,"""A""","[""G""]","""1""","""0"""


In [22]:
# Methylation at this CpG site on one haplotype is 0.0
# It should be None as there is no CpG on that haplotype

DF_METH_FOUNDER_PHASED_ALL_CPGS.filter(
    (pl.col("chrom") == 'chr20') & 
    (pl.col("start") == 101339)
)

chrom,start,end,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,is_within_50bp_of_mismatch_site
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool
"""chr20""",101339,101340,45,0.4,0.478,67743,236639,1.0,258,23,22,"""A""","""I""",0.783,0.0,0.89,0.06,False


In [23]:
importlib.reload(expand_to_all_cpgs)
from expand_to_all_cpgs import label_with_variants

DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL = label_with_variants(DF_METH_FOUNDER_PHASED_ALL_CPGS, DF_IHT_PHASED_VARIANTS)
DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL

chrom,start_cpg,end_cpg,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,cpg_is_within_50bp_of_mismatch_site,start_variant,end_variant,REF,ALT,allele_pat,allele_mat,num_SNVs_overlapping_CG
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool,i64,i64,str,list[str],str,str,u32
"""chr1""",10468,10470,13,0.769,0.868,,,,,,,,,,,,,false,,,,,,,0
"""chr1""",10470,10472,13,0.769,0.878,,,,,,,,,,,,,false,,,,,,,0
"""chr1""",10483,10485,13,0.923,0.929,,,,,,,,,,,,,false,,,,,,,0
"""chr1""",10488,10490,14,1.0,0.95,,,,,,,,,,,,,false,,,,,,,0
"""chr1""",10492,10494,13,1.0,0.964,,,,,,,,,,,,,false,,,,,,,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chrY""",56887399,56887401,38,0.737,0.942,,,,,,,,,,,,,,,,,,,,0
"""chrY""",56887579,56887581,38,0.895,0.945,,,,,,,,,,,,,,,,,,,,0
"""chrY""",56887581,56887583,,,,,,,,,,,,,,,,,,,,,,,0
"""chrY""",56887592,56887594,38,0.579,0.721,,,,,,,,,,,,,,,,,,,,0


## CpG sites with unknown genotypes are absent, and therefore don't need to be filtered prior to imprinting scans

In [24]:
DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL.filter(
    (pl.col('allele_pat') == '.') | 
    (pl.col('allele_mat') == '.') 
)

chrom,start_cpg,end_cpg,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,cpg_is_within_50bp_of_mismatch_site,start_variant,end_variant,REF,ALT,allele_pat,allele_mat,num_SNVs_overlapping_CG
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool,i64,i64,str,list[str],str,str,u32


## CpG sites that each overlap a single SNV: When scanning for imprinting, exclude such CpG sites if they overlap heterozygous (but not homozygous) SNVs 

In [25]:
def add_locus_cpg(df):
    return df.with_columns(
        locus_cpg = pl.format(
            "{}:{}-{}", 
            pl.col("chrom"),
            pl.col("start_cpg"),
            pl.col("end_cpg")
        )
    )

def subset_cpgs_at_variants(df, allele_pat, allele_mat, num_SNVs_overlapping_CG):
    df = df.filter(
        (pl.col('allele_pat') == str(allele_pat)) & 
        (pl.col('allele_mat') == str(allele_mat)) & 
        (pl.col('num_SNVs_overlapping_CG') == num_SNVs_overlapping_CG)
    )
    return add_locus_cpg(df)

### There are many CpG sites that overlap a single SNV 

In [26]:
f"Number of CpG sites that overlap a single SNV: {len(DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL.filter(pl.col('num_SNVs_overlapping_CG') == 1))}"

'Number of CpG sites that overlap a single SNV: 1614111'

### Homozygous CpG sites should be included in imprinting scans

In [27]:
# HOM REF 
# Both haplotypes must be CpG, by construction 
# Therefore both haplotypes must exhibit methylation 
# These sites could, in principle, be imprinted, in this particular sample, and therefore should be included in scans for imprinting, in this particular sample

subset_cpgs_at_variants(
    DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL, 
    allele_pat=0, 
    allele_mat=0,
    num_SNVs_overlapping_CG=1
).sample(5, seed=42)

chrom,start_cpg,end_cpg,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,cpg_is_within_50bp_of_mismatch_site,start_variant,end_variant,REF,ALT,allele_pat,allele_mat,num_SNVs_overlapping_CG,locus_cpg
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool,i64,i64,str,list[str],str,str,u32,str
"""chr3""",7501441,7501443,29,0.621,0.841,,,,,,,,,,,,,False,7501441,7501442,"""C""","[""T""]","""0""","""0""",1,"""chr3:7501441-7501443"""
"""chr5""",67580330,67580332,29,0.69,0.815,67568630.0,67609634.0,1.0,6.0,13.0,16.0,"""A""","""G""",0.692,0.688,0.893,0.767,False,67580330,67580331,"""C""","[""T""]","""0""","""0""",1,"""chr5:67580330-67580332"""
"""chr5""",34803512,34803514,31,0.968,0.973,34795294.0,34831740.0,1.0,19.0,15.0,16.0,"""A""","""G""",1.0,0.938,0.98,0.957,False,34803513,34803514,"""G""","[""A""]","""0""","""0""",1,"""chr5:34803512-34803514"""
"""chr2""",233197885,233197887,33,0.848,0.96,233054853.0,233232036.0,1.0,267.0,20.0,13.0,"""A""","""G""",0.85,0.846,0.967,0.928,False,233197886,233197887,"""G""","[""C""]","""0""","""0""",1,"""chr2:233197885-233197887"""
"""chr7""",155396763,155396765,32,0.844,0.965,155222320.0,155615738.0,1.0,547.0,16.0,16.0,"""B""","""I""",0.813,0.875,0.965,0.965,False,155396764,155396765,"""G""","[""A""]","""0""","""0""",1,"""chr7:155396763-155396765"""


In [28]:
# Hom ALT sites fall into two classes: either the site is CpG (creation) on both haplotypes, or not CpG on both haplotypes (destruction)
# CpG creation sites have methylation and could be imprinted; These should be included in scans for imprinting 
# CpG destruction sites do not have methylation and therefore are ascribed "None" for their methylation values 
# Including these sites in imprinting scans doesn't hurt: In Polars, the result of any arithmetic operation where one or both operands are null is always null

In [29]:
subset_cpgs_at_variants(
    DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL, 
    allele_pat=1, 
    allele_mat=1,
    num_SNVs_overlapping_CG=1
).sample(10, seed=42)

chrom,start_cpg,end_cpg,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,cpg_is_within_50bp_of_mismatch_site,start_variant,end_variant,REF,ALT,allele_pat,allele_mat,num_SNVs_overlapping_CG,locus_cpg
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool,i64,i64,str,list[str],str,str,u32,str
"""chr3""",474781,474783,38.0,0.684,0.808,422385.0,587090.0,1.0,166.0,20.0,18.0,"""A""","""G""",0.65,0.722,0.782,0.84,False,474782,474783,"""C""","[""G""]","""1""","""1""",1,"""chr3:474781-474783"""
"""chr5""",26176636,26176638,22.0,0.773,0.94,,,,,,,,,,,,,False,26176636,26176637,"""T""","[""C""]","""1""","""1""",1,"""chr5:26176636-26176638"""
"""chr5""",4284728,4284730,,,,,,,,,,,,,,,,False,4284729,4284730,"""G""","[""A""]","""1""","""1""",1,"""chr5:4284728-4284730"""
"""chr13""",74980891,74980893,31.0,0.935,0.96,74367678.0,74987921.0,1.0,622.0,11.0,20.0,"""B""","""E""",1.0,0.9,0.963,0.954,False,74980892,74980893,"""A""","[""G""]","""1""","""1""",1,"""chr13:74980891-74980893"""
"""chr9""",109772989,109772991,,,,,,,,,,,,,,,,False,109772990,109772991,"""G""","[""A""]","""1""","""1""",1,"""chr9:109772989-109772991"""
"""chr2""",219058316,219058318,,,,,,,,,,,,,,,,False,219058316,219058317,"""C""","[""T""]","""1""","""1""",1,"""chr2:219058316-219058318"""
"""chr5""",69077023,69077025,35.0,0.686,0.939,68584858.0,69658609.0,1.0,881.0,13.0,22.0,"""A""","""G""",0.769,0.636,0.953,0.934,False,69077023,69077024,"""T""","[""C""]","""1""","""1""",1,"""chr5:69077023-69077025"""
"""chr7""",159240463,159240465,35.0,0.4,0.361,159231015.0,159323885.0,1.0,6.0,14.0,21.0,"""B""","""I""",0.429,0.381,0.416,0.386,False,159240464,159240465,"""A""","[""G""]","""1""","""1""",1,"""chr7:159240463-159240465"""
"""chr1""",31254562,31254564,,,,,,,,,,,,,,,,False,31254563,31254564,"""G""","[""A""]","""1""","""1""",1,"""chr1:31254562-31254564"""
"""chr4""",47495968,47495970,31.0,0.839,0.959,47211148.0,47600358.0,1.0,368.0,17.0,14.0,"""B""","""K""",0.882,0.786,0.958,0.961,False,47495968,47495969,"""T""","[""C""]","""1""","""1""",1,"""chr4:47495968-47495970"""


In [30]:
subset_cpgs_at_variants(
    DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL, 
    allele_pat=2, 
    allele_mat=2,
    num_SNVs_overlapping_CG=1
).sample(5, seed=42)

chrom,start_cpg,end_cpg,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,cpg_is_within_50bp_of_mismatch_site,start_variant,end_variant,REF,ALT,allele_pat,allele_mat,num_SNVs_overlapping_CG,locus_cpg
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool,i64,i64,str,list[str],str,str,u32,str
"""chr3""",27836755,27836757,,,,,,,,,,,,,,,,False,27836756,27836757,"""G""","[""T"", ""A""]","""2""","""2""",1,"""chr3:27836755-27836757"""
"""chr22""",34797009,34797011,42.0,0.833,0.937,34742157.0,35158111.0,1.0,578.0,12.0,30.0,"""A""","""G""",0.917,0.8,0.93,0.929,False,34797010,34797011,"""A""","[""C"", ""G""]","""2""","""2""",1,"""chr22:34797009-34797011"""
"""chr5""",156821662,156821664,,,,,,,,,,,,,,,,False,156821662,156821663,"""C""","[""A"", ""T""]","""2""","""2""",1,"""chr5:156821662-156821664"""
"""chr12""",42402302,42402304,27.0,0.852,0.957,42215927.0,42434932.0,1.0,20.0,14.0,12.0,"""A""","""G""",0.786,0.917,0.952,0.957,False,42402303,42402304,"""T""","[""C"", ""G""]","""2""","""2""",1,"""chr12:42402302-42402304"""
"""chr10""",106257370,106257372,,,,,,,,,,,,,,,,False,106257371,106257372,"""G""","[""A"", ""T""]","""2""","""2""",1,"""chr10:106257370-106257372"""


### Heterozygous CpG sites should be excluded from imprinting scans

In [31]:
# [ALT=1] cpg sites harboring an ALT allele on one haplotype, create or destroy a cpg site on that haplotype
# This could potentially lead to false calls of imprinting (subject to depth constraints), and therefore such sites should be excluded in scans for imprinting 

subset_cpgs_at_variants(
    DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL, 
    allele_pat=0, 
    allele_mat=1,
    num_SNVs_overlapping_CG=1
).sample(5, seed=42)

chrom,start_cpg,end_cpg,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,cpg_is_within_50bp_of_mismatch_site,start_variant,end_variant,REF,ALT,allele_pat,allele_mat,num_SNVs_overlapping_CG,locus_cpg
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool,i64,i64,str,list[str],str,str,u32,str
"""chr3""",3939055,3939057,29,0.448,0.491,3860042,4130309,1.0,437,14,15,"""A""","""E""",0.0,0.867,0.065,0.911,False,3939056,3939057,"""A""","[""G""]","""0""","""1""",1,"""chr3:3939055-3939057"""
"""chr5""",16590244,16590246,46,0.348,0.542,16488178,17686660,0.995803,953,24,22,"""A""","""G""",0.0,0.727,0.053,0.957,False,16590245,16590246,"""T""","[""G""]","""0""","""1""",1,"""chr5:16590244-16590246"""
"""chr9""",90569506,90569508,35,0.143,0.039,90535977,90696067,1.0,184,15,20,"""B""","""I""",0.0,0.25,0.024,0.093,False,90569506,90569507,"""T""","[""C""]","""0""","""1""",1,"""chr9:90569506-90569508"""
"""chr2""",230636751,230636753,27,0.074,0.044,229516785,231037600,1.0,1676,10,17,"""A""","""E""",0.2,0.0,0.089,0.043,False,230636751,230636752,"""C""","[""T""]","""0""","""1""",1,"""chr2:230636751-230636753"""
"""chr7""",148407187,148407189,35,0.429,0.625,148298342,148563070,1.0,310,18,17,"""B""","""I""",0.833,0.0,0.937,0.06,False,148407188,148407189,"""G""","[""A""]","""0""","""1""",1,"""chr7:148407187-148407189"""


In [32]:
# [ALT=2] cpg sites harboring an ALT allele on one haplotype, create or destroy a cpg site on that haplotype
# This could potentially lead to false calls of imprinting (subject to depth constraints), and therefore such sites should be excluded in scans for imprinting 

subset_cpgs_at_variants(
    DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL, 
    allele_pat=0, 
    allele_mat=2,
    num_SNVs_overlapping_CG=1
).sample(5, seed=42)

chrom,start_cpg,end_cpg,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,cpg_is_within_50bp_of_mismatch_site,start_variant,end_variant,REF,ALT,allele_pat,allele_mat,num_SNVs_overlapping_CG,locus_cpg
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool,i64,i64,str,list[str],str,str,u32,str
"""chrX""",5088960,5088962,36,0.389,0.476,4506917,5185199,1.0,730,17,19,"""A""","""G""",0.0,0.737,0.05,0.911,False,5088961,5088962,"""A""","[""C"", ""G""]","""0""","""2""",1,"""chrX:5088960-5088962"""
"""chr4""",9527876,9527878,44,0.477,0.58,9345092,10421738,1.0,1474,25,19,"""B""","""E""",0.84,0.0,0.942,0.053,False,9527877,9527878,"""G""","[""A"", ""C""]","""0""","""2""",1,"""chr4:9527876-9527878"""
"""chr5""",149655218,149655220,57,0.228,0.522,149626241,149690396,1.0,51,29,18,"""B""","""G""",0.0,0.722,0.142,0.849,False,149655219,149655220,"""C""","[""A"", ""G""]","""0""","""2""",1,"""chr5:149655218-149655220"""
"""chr11""",55986503,55986505,32,0.375,0.591,54393461,58007713,0.999864,7377,19,13,"""B""","""G""",0.0,0.923,0.05,0.937,False,55986503,55986504,"""G""","[""T"", ""C""]","""0""","""2""",1,"""chr11:55986503-55986505"""
"""chr8""",5166623,5166625,43,0.349,0.493,4913211,5289143,1.0,499,20,23,"""B""","""E""",0.75,0.0,0.933,0.058,False,5166624,5166625,"""G""","[""C"", ""A""]","""0""","""2""",1,"""chr8:5166623-5166625"""


In [33]:
# [ALT=1,2]
# Either site is CpG in reference or not 
# If YES, then it is not CpG in both haplotypes, and therefore ascribed None as methylation
# Such sites cannot be imprinted, and therefore may be excluded from imprinting scans
# If NO, then it must be CpG in one haplotype, and not in the other other
# The corresponding methylation levels will be a FLOAT > 0 and FLOAT = 0.0, yielding a false imprinting call 
# Therefore such sites must be excluded from imprinting scans

subset_cpgs_at_variants(
    DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL, 
    allele_pat=1, 
    allele_mat=2,
    num_SNVs_overlapping_CG=1
).sample(10, seed=42)

chrom,start_cpg,end_cpg,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,cpg_is_within_50bp_of_mismatch_site,start_variant,end_variant,REF,ALT,allele_pat,allele_mat,num_SNVs_overlapping_CG,locus_cpg
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool,i64,i64,str,list[str],str,str,u32,str
"""chr5""",27815038,27815040,39.0,0.333,0.477,27183813.0,28737129.0,1.0,1615.0,20.0,19.0,"""A""","""G""",0.65,0.0,0.869,0.057,False,27815039,27815040,"""T""","[""G"", ""A""]","""1""","""2""",1,"""chr5:27815038-27815040"""
"""chr12""",94498047,94498049,38.0,0.579,0.544,94459970.0,94651098.0,1.0,280.0,24.0,14.0,"""B""","""G""",0.917,0.0,0.962,0.056,False,94498047,94498048,"""T""","[""C"", ""G""]","""1""","""2""",1,"""chr12:94498047-94498049"""
"""chr9""",22570483,22570485,35.0,0.686,0.746,22492619.0,23241000.0,1.0,699.0,,26.0,"""A""","""K""",,0.923,,0.93,False,22570483,22570484,"""T""","[""G"", ""C""]","""1""","""2""",1,"""chr9:22570483-22570485"""
"""chr1""",41810695,41810697,50.0,0.12,0.125,40554249.0,42047893.0,1.0,1257.0,21.0,29.0,"""B""","""K""",0.286,0.0,0.317,0.042,False,41810695,41810696,"""A""","[""C"", ""T""]","""1""","""2""",1,"""chr1:41810695-41810697"""
"""chr6""",117100700,117100702,41.0,0.415,0.581,117017593.0,117401180.0,0.995098,204.0,19.0,22.0,"""B""","""E""",0.895,0.0,0.962,0.069,False,117100701,117100702,"""T""","[""G"", ""A""]","""1""","""2""",1,"""chr6:117100700-117100702"""
"""chr13""",63452556,63452558,41.0,0.39,0.473,63393940.0,63487796.0,1.0,11.0,23.0,18.0,"""B""","""E""",0.696,0.0,0.807,0.053,False,63452556,63452557,"""G""","[""C"", ""T""]","""1""","""2""",1,"""chr13:63452556-63452558"""
"""chr9""",133318056,133318058,41.0,0.39,0.533,133102198.0,133322198.0,1.0,282.0,23.0,18.0,"""B""","""K""",0.696,0.0,0.816,0.055,False,133318056,133318057,"""A""","[""C"", ""T""]","""1""","""2""",1,"""chr9:133318056-133318058"""
"""chr1""",194017482,194017484,,,,,,,,,,,,,,,,False,194017483,194017484,"""G""","[""T"", ""A""]","""1""","""2""",1,"""chr1:194017482-194017484"""
"""chr4""",165365173,165365175,,,,,,,,,,,,,,,,False,165365173,165365174,"""C""","[""G"", ""A""]","""1""","""2""",1,"""chr4:165365173-165365175"""
"""chr11""",55132855,55132857,,,,,,,,,,,,,,,,False,55132855,55132856,"""C""","[""A"", ""G""]","""1""","""2""",1,"""chr11:55132855-55132857"""


## CpG sites that each overlap 2 SNVs: When scanning for imprinting, exclude these sites if at least one of the SNVs is heterozygous 

### There are very few CpG sites that overlap 2 SNVs 

In [34]:
# CpG sites that overlap 2 SNVs are rare among the full set of CpG sites in the reference (and sample) genome:
print(f"number of CpGs that overlap 2 SNVs: {len(DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL.filter((pl.col('num_SNVs_overlapping_CG') == 2)))}")

number of CpGs that overlap 2 SNVs: 30168


In [35]:
# Since CpG sites are 2 bases long, it is impossible for a CpG site to overlap more than 2 SNVs: 
print(f"number of CpGs that overlap more than 2 SNVs: {len(DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL.filter((pl.col('num_SNVs_overlapping_CG') > 2)))}")

number of CpGs that overlap more than 2 SNVs: 0


### CpG sites in which at least one of the 2 overlapping SNVs is heterozygous should be excluded prior to scanning for imprinting

In [36]:
# LOGIC: 
# 1. If the two SNVs are each homozygous, then the haplotypes are the same (either CpG or not), and therefore it is impossible to generate a false example of imprinting. 
# 2. If one SNV is homozygous and the other heterozygous, then the haplotypes are different. 
#   i. If one of those haplotypes is CpG, false imprinting is possible. 
#   ii. It neither haplotye is CpG, then throwing it out doesn't matter. 
# 3. If both SNVs are heterozygous, then there are two ways that the ALT alleles could segregate among the haplotypes: 
#   i. One ALT is on hap1; the second ALT is on hap2. 
#   ii. Both ALTs are on the same haplotype. 
#    Either way, the haplotypes are different, and we are back to case 2. 

def sample_cpgs_at_double_variants(df, sample_size, seed):
    number_variants = 2 
    coord_cols = ['chrom', 'start_cpg', 'end_cpg'] 

    # 1. Filter to find the valid rows
    # 2. Select ONLY the coord cols and get UNIQUE combinations
    # 3. Sample from those unique records
    sample_coordinates = (
        df
        .filter(pl.col('num_SNVs_overlapping_CG') == number_variants)
        .select(coord_cols)
        .unique()
        .sort(coord_cols)
        .sample(sample_size, seed=seed) 
    )

    # 4. Join the distinct coordinates back to the full dataframe
    result = df.join(
        sample_coordinates, 
        on=coord_cols, 
        how='inner'
    ).sort(coord_cols)

    pl.Config.set_tbl_rows(number_variants*sample_size)

    return add_locus_cpg(result)

# Visual inspection of many loci, including the following, in IGV confirmed the LOGIC presented above
sample_cpgs_at_double_variants(DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL, sample_size=10, seed=39)

chrom,start_cpg,end_cpg,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,cpg_is_within_50bp_of_mismatch_site,start_variant,end_variant,REF,ALT,allele_pat,allele_mat,num_SNVs_overlapping_CG,locus_cpg
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool,i64,i64,str,list[str],str,str,u32,str
"""chr1""",199504475,199504477,39.0,0.385,0.477,,,,,,,,,,,,,False,199504475,199504476,"""T""","[""C""]","""1""","""1""",2,"""chr1:199504475-199504477"""
"""chr1""",199504475,199504477,39.0,0.385,0.477,,,,,,,,,,,,,False,199504476,199504477,"""G""","[""A""]","""0""","""0""",2,"""chr1:199504475-199504477"""
"""chr12""",76304662,76304664,,,,,,,,,,,,,,,,False,76304662,76304663,"""C""","[""T""]","""1""","""1""",2,"""chr12:76304662-76304664"""
"""chr12""",76304662,76304664,,,,,,,,,,,,,,,,False,76304663,76304664,"""G""","[""A""]","""0""","""0""",2,"""chr12:76304662-76304664"""
"""chr15""",22613259,22613261,,,,,,,,,,,,,,,,False,22613259,22613260,"""C""","[""T""]","""1""","""0""",2,"""chr15:22613259-22613261"""
"""chr15""",22613259,22613261,,,,,,,,,,,,,,,,False,22613260,22613261,"""G""","[""A""]","""0""","""1""",2,"""chr15:22613259-22613261"""
"""chr16""",20400082,20400084,28.0,0.714,0.889,20396177.0,21251958.0,1.0,559.0,13.0,15.0,"""B""","""E""",0.769,0.667,0.942,0.718,False,20400082,20400083,"""C""","[""T""]","""0""","""0""",2,"""chr16:20400082-20400084"""
"""chr16""",20400082,20400084,28.0,0.714,0.889,20396177.0,21251958.0,1.0,559.0,13.0,15.0,"""B""","""E""",0.769,0.667,0.942,0.718,False,20400083,20400084,"""G""","[""A""]","""0""","""0""",2,"""chr16:20400082-20400084"""
"""chr18""",1641597,1641599,34.0,0.441,0.53,1518117.0,2028753.0,1.0,843.0,16.0,18.0,"""A""","""I""",0.938,0.0,0.955,0.058,False,1641597,1641598,"""T""","[""C""]","""1""","""0""",2,"""chr18:1641597-1641599"""
"""chr18""",1641597,1641599,34.0,0.441,0.53,1518117.0,2028753.0,1.0,843.0,16.0,18.0,"""A""","""I""",0.938,0.0,0.955,0.058,False,1641598,1641599,"""G""","[""A""]","""0""","""0""",2,"""chr18:1641597-1641599"""


In [37]:
# the identity of the ALT allele doesn't matter to the logic above, e.g., 
subset_cpgs_at_variants(
    DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL, 
    allele_pat=0, 
    allele_mat=2,
    num_SNVs_overlapping_CG=2
)

chrom,start_cpg,end_cpg,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,cpg_is_within_50bp_of_mismatch_site,start_variant,end_variant,REF,ALT,allele_pat,allele_mat,num_SNVs_overlapping_CG,locus_cpg
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool,i64,i64,str,list[str],str,str,u32,str
"""chr3""",192576768,192576770,32,0.344,0.481,191966863,192682426,1.0,770,14,18.0,"""B""","""I""",0.786,0.0,0.901,0.043,False,192576768,192576769,"""C""","[""T"", ""A""]","""0""","""2""",2,"""chr3:192576768-192576770"""
"""chr6""",31340849,31340851,27,0.556,0.74,30317747,31517324,1.0,4447,18,,"""B""","""G""",0.833,,0.864,,False,31340850,31340851,"""G""","[""A"", ""C""]","""0""","""2""",2,"""chr6:31340849-31340851"""
"""chr6""",32663300,32663302,43,0.419,0.765,32583991,33246527,1.0,4619,19,24.0,"""B""","""G""",0.0,0.75,0.064,0.882,False,32663300,32663301,"""G""","[""T"", ""C""]","""0""","""2""",2,"""chr6:32663300-32663302"""
"""chr6""",32672078,32672080,48,0.208,0.471,32583991,33246527,1.0,4619,19,29.0,"""B""","""G""",0.526,0.0,0.641,0.064,False,32672078,32672079,"""C""","[""T"", ""A""]","""0""","""2""",2,"""chr6:32672078-32672080"""
"""chr8""",128717466,128717468,33,0.152,0.349,128195729,129219897,1.0,1238,12,21.0,"""A""","""K""",0.417,0.0,0.622,0.055,False,128717467,128717468,"""G""","[""C"", ""T""]","""0""","""2""",2,"""chr8:128717466-128717468"""
"""chr12""",30388120,30388122,38,0.553,0.546,30183674,30926426,1.0,1094,15,23.0,"""A""","""G""",0.0,0.913,0.056,0.944,False,30388121,30388122,"""A""","[""T"", ""G""]","""0""","""2""",2,"""chr12:30388120-30388122"""


## Label each unique CpG record with a flag indicating whether the record should be included in scans for imprinting 

In [44]:
importlib.reload(expand_to_all_cpgs)
from expand_to_all_cpgs import label_cpgs_as_allele_specific

DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_ALLELE_SPECIFIC_FLAG = label_cpgs_as_allele_specific(DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL) 
DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_ALLELE_SPECIFIC_FLAG

chrom,start_cpg,end_cpg,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,cpg_is_within_50bp_of_mismatch_site,cpg_overlaps_at_least_one_snv,snv_genotypes,cpg_is_allele_specific
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool,bool,str,bool
"""chr1""",10468,10470,13,0.769,0.868,,,,,,,,,,,,,false,false,""".""",false
"""chr1""",10470,10472,13,0.769,0.878,,,,,,,,,,,,,false,false,""".""",false
"""chr1""",10483,10485,13,0.923,0.929,,,,,,,,,,,,,false,false,""".""",false
"""chr1""",10488,10490,14,1.0,0.95,,,,,,,,,,,,,false,false,""".""",false
"""chr1""",10492,10494,13,1.0,0.964,,,,,,,,,,,,,false,false,""".""",false
"""chr1""",10496,10498,14,0.857,0.954,,,,,,,,,,,,,false,false,""".""",false
"""chr1""",10524,10526,14,0.643,0.951,,,,,,,,,,,,,false,false,""".""",false
"""chr1""",10541,10543,14,0.857,0.956,,,,,,,,,,,,,false,false,""".""",false
"""chr1""",10562,10564,14,0.857,0.932,,,,,,,,,,,,,false,false,""".""",false
"""chr1""",10570,10572,14,0.786,0.877,,,,,,,,,,,,,false,false,""".""",false


### Sanity checking 

In [None]:
# CGs that overlap 1 SNV that is het indeed have zero methylation on one haplotype, and can be flagged for exclusion in imprinting scans: 
DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_ALLELE_SPECIFIC_FLAG.filter(pl.col("snv_genotypes").str.contains("het"))

chrom,start_cpg,end_cpg,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,cpg_is_within_50bp_of_mismatch_site,cpg_overlaps_at_least_one_snv,snv_genotypes,cpg_is_allele_specific
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool,bool,str,bool
"""chr1""",504316,504318,48,0.458,0.477,492339,532812,1.0,16,18,27,"""B""","""I""",0.0,0.778,0.063,0.935,false,true,"""het""",true
"""chr1""",516229,516231,58,0.466,0.602,492339,532812,1.0,16,25,33,"""B""","""I""",0.0,0.818,0.049,0.926,false,true,"""het""",true
"""chr1""",516337,516339,58,0.345,0.533,492339,532812,1.0,16,25,33,"""B""","""I""",0.8,0.0,0.934,0.057,false,true,"""het""",true
"""chr1""",699537,699539,45,0.311,0.457,593123,1314109,0.993119,436,19,16,"""B""","""I""",0.737,0.0,0.932,0.057,false,true,"""het""",true
"""chr1""",700917,700919,43,0.233,0.267,593123,1314109,0.993119,436,19,14,"""B""","""I""",0.526,0.0,0.67,0.055,false,true,"""het""",true
"""chr1""",749854,749856,54,0.426,0.524,593123,1314109,0.993119,436,40,14,"""B""","""I""",0.275,0.857,0.471,0.912,false,true,"""het""",true
"""chr1""",770500,770502,35,0.486,0.535,593123,1314109,0.993119,436,18,17,"""B""","""I""",0.944,0.0,0.972,0.049,false,true,"""het""",true
"""chr1""",770986,770988,37,0.378,0.521,593123,1314109,0.993119,436,19,18,"""B""","""I""",0.0,0.778,0.069,0.948,false,true,"""het""",true
"""chr1""",771396,771398,35,0.4,0.532,593123,1314109,0.993119,436,19,16,"""B""","""I""",0.737,0.0,0.971,0.056,false,true,"""het""",true
"""chr1""",802842,802844,25,0.32,0.472,593123,1314109,0.993119,436,12,13,"""B""","""I""",0.667,0.0,0.91,0.055,false,true,"""het""",true


In [None]:
# CGs that overlap 2 SNVs, the first of which is het, indeed have zero methylation on one haplotype, and can be flagged for exclusion in imprinting scans: 
DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_ALLELE_SPECIFIC_FLAG.filter(pl.col("snv_genotypes").str.contains("het,"))

chrom,start_cpg,end_cpg,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,cpg_is_within_50bp_of_mismatch_site,cpg_overlaps_at_least_one_snv,snv_genotypes,cpg_is_allele_specific
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool,bool,str,bool
"""chr1""",1134061,1134063,,,,,,,,,,,,,,,,false,true,"""het,het""",true
"""chr1""",1134069,1134071,,,,,,,,,,,,,,,,false,true,"""het,het""",true
"""chr1""",3082637,3082639,32,0.344,0.489,2961801,3184712,1.0,392,13,19,"""B""","""I""",0.846,0.0,0.944,0.055,false,true,"""het,hom""",true
"""chr1""",3083592,3083594,31,0.387,0.538,2961801,3184712,1.0,392,12,19,"""B""","""I""",1.0,0.0,0.959,0.057,false,true,"""het,het""",true
"""chr1""",3163418,3163420,23,0.391,0.463,2961801,3184712,1.0,392,13,10,"""B""","""I""",0.0,0.9,0.048,0.929,false,true,"""het,hom""",true
"""chr1""",3609074,3609076,,,,,,,,,,,,,,,,false,true,"""het,het""",true
"""chr1""",3778230,3778232,36,0.5,0.513,3399126,4207029,1.0,1114,15,21,"""B""","""I""",0.0,0.857,0.062,0.916,false,true,"""het,hom""",true
"""chr1""",3801504,3801506,26,0.038,0.033,3399126,4207029,1.0,1114,17,,"""B""","""I""",0.0,,0.046,,false,true,"""het,hom""",true
"""chr1""",3805920,3805922,32,0.531,0.567,3399126,4207029,1.0,1114,21,11,"""B""","""I""",0.81,0.0,0.943,0.053,false,true,"""het,hom""",true
"""chr1""",3833082,3833084,35,0.514,0.538,3399126,4207029,1.0,1114,21,14,"""B""","""I""",0.857,0.0,0.965,0.064,false,true,"""het,het""",true


In [None]:
# CGs that overlap 2 SNVs, the second of which is het, indeed have zero methylation on one haplotype, and can be flagged for exclusion in imprinting scans: 
DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_ALLELE_SPECIFIC_FLAG.filter(pl.col("snv_genotypes").str.contains(",het"))

chrom,start_cpg,end_cpg,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,cpg_is_within_50bp_of_mismatch_site,cpg_overlaps_at_least_one_snv,snv_genotypes,cpg_is_allele_specific
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool,bool,str,bool
"""chr1""",1134061,1134063,,,,,,,,,,,,,,,,false,true,"""het,het""",true
"""chr1""",1134069,1134071,,,,,,,,,,,,,,,,false,true,"""het,het""",true
"""chr1""",2364483,2364485,25,0.4,0.561,1613420,2931691,1.0,1201,10,15,"""B""","""I""",0.0,0.667,0.06,0.943,false,true,"""hom,het""",true
"""chr1""",3083592,3083594,31,0.387,0.538,2961801,3184712,1.0,392,12,19,"""B""","""I""",1.0,0.0,0.959,0.057,false,true,"""het,het""",true
"""chr1""",3304518,3304520,39,0.436,0.51,3203589,3398794,1.0,243,20,19,"""B""","""I""",0.0,0.895,0.057,0.965,false,true,"""hom,het""",true
"""chr1""",3435662,3435664,25,0.44,0.511,3399126,4207029,1.0,1114,13,12,"""B""","""I""",0.846,0.0,0.939,0.077,false,true,"""hom,het""",true
"""chr1""",3609074,3609076,,,,,,,,,,,,,,,,false,true,"""het,het""",true
"""chr1""",3784925,3784927,29,0.345,0.522,3399126,4207029,1.0,1114,12,17,"""B""","""I""",0.833,0.0,0.965,0.057,false,true,"""hom,het""",true
"""chr1""",3828567,3828569,38,0.553,0.533,3399126,4207029,1.0,1114,24,14,"""B""","""I""",0.875,0.0,0.958,0.049,false,true,"""hom,het""",true
"""chr1""",3833082,3833084,35,0.514,0.538,3399126,4207029,1.0,1114,21,14,"""B""","""I""",0.857,0.0,0.965,0.064,false,true,"""het,het""",true
