## Setup 

In [2]:
import importlib
from pathlib import Path 
import sys
import polars as pl

# Root of the shared DNA-methylation data; every data directory used below lives under it.
METH_DATA_ROOT = "/scratch/ucgd/lustre-labs/quinlan/data-shared/dna-methylation"

# ID of the sample analysed in this notebook (it prefixes every per-sample file name below).
UID = 200081

# --- Directories -------------------------------------------------------------------------
# NOTE: OUTPUT_DIR and METH_FOUNDER_PHASED_DIR are plain strings, while the two
# read-backed-phased directories are Path objects; the types are kept as they are
# because the reader functions that consume them are not visible from this notebook.
OUTPUT_DIR = f"{METH_DATA_ROOT}/CEPH1463.GRCh38.hifi.founder-phased.all-cpgs"
METH_FOUNDER_PHASED_DIR = f"{METH_DATA_ROOT}/CEPH1463.GRCh38.hifi.founder-phased"  # output dir of phase_meth_to_founder_haps.py
METH_COUNT_READ_PHASED_DIR = Path(f"{METH_DATA_ROOT}/CEPH1463.GRCh38.hifi.count.read-backed-phased")  # output dir of aligned_bam_to_cpg_scores (containing count-based unphased meth)
METH_MODEL_READ_PHASED_DIR = Path(f"{METH_DATA_ROOT}/CEPH1463.GRCh38.hifi.model.read-backed-phased")  # output dir of aligned_bam_to_cpg_scores (containing model-based unphased meth)

# --- Input files -------------------------------------------------------------------------
BED_ALL_CPGS_IN_REFERENCE = f"{OUTPUT_DIR}/all_cpg_sites_in_reference.bed"  # output of src/write_all_cpgs.py
BED_METH_FOUNDER_PHASED = f"{METH_FOUNDER_PHASED_DIR}/{UID}.dna-methylation.founder-phased.bed"  # bed file of founder-phased methylation levels from src/phase_meth_to_founder_haps.py
BED_METH_COUNT_UNPHASED = METH_COUNT_READ_PHASED_DIR / f"{UID}.GRCh38.haplotagged.combined.bed.gz"  # bed file from aligned_bam_to_cpg_scores (unphased count-based meth)
BED_METH_MODEL_UNPHASED = METH_MODEL_READ_PHASED_DIR / f"{UID}.GRCh38.haplotagged.combined.bed.gz"  # bed file from aligned_bam_to_cpg_scores (unphased model-based meth)
BED_HET_SITE_MISMATCHES = f"{METH_FOUNDER_PHASED_DIR}/{UID}.bit-vector-sites-mismatches.bed"  # bed file of heterozygous sites at which bit-vectors are mismatched, from src/phase_meth_to_founder_haps.py

# --- Output file -------------------------------------------------------------------------
BED_METH_FOUNDER_PHASED_ALL_CPGS = f"{OUTPUT_DIR}/{UID}.dna-methylation.founder-phased.all_cpgs.bed"

# --- Project code ------------------------------------------------------------------------
REPO_DIR = Path('/scratch/ucgd/lustre-labs/quinlan/u6018199/tapestry')

# Make the helper modules under src/util importable. The membership check keeps
# sys.path free of duplicate entries when this cell is executed more than once.
_UTIL_DIR = f"{REPO_DIR}/src/util"
if _UTIL_DIR not in sys.path:
    sys.path.append(_UTIL_DIR)

## Get all CpG sites in reference genome

In [3]:
# Import the helper module, then reload it so that edits made to it since the
# kernel started are picked up; importlib.reload returns the reloaded module.
import expand_to_all_cpgs
expand_to_all_cpgs = importlib.reload(expand_to_all_cpgs)
read_all_cpgs_in_reference = expand_to_all_cpgs.read_all_cpgs_in_reference

# One row per CpG site in the reference genome (columns: chrom, start, end).
DF_ALL_CPGS_IN_REFERENCE = read_all_cpgs_in_reference(
    BED_ALL_CPGS_IN_REFERENCE,
)
DF_ALL_CPGS_IN_REFERENCE

chrom,start,end
str,i64,i64
"""chr1""",10468,10469
"""chr1""",10470,10471
"""chr1""",10483,10484
"""chr1""",10488,10489
"""chr1""",10492,10493
…,…,…
"""chrY""",56887220,56887221
"""chrY""",56887399,56887400
"""chrY""",56887579,56887580
"""chrY""",56887581,56887582


## Read in unphased DNA methylation at CpG sites, both those in the reference genome, and those present in the sample but not in the reference genome

In [4]:
# Reload the helper module (picks up edits without a kernel restart) and bind the reader.
expand_to_all_cpgs = importlib.reload(expand_to_all_cpgs)
read_meth_unphased = expand_to_all_cpgs.read_meth_unphased

# Unphased methylation per CpG site, built from the count-based and the model-based bed files.
DF_METH_UNPHASED = read_meth_unphased(
    BED_METH_COUNT_UNPHASED,
    BED_METH_MODEL_UNPHASED,
)
DF_METH_UNPHASED

chrom,start,end,total_read_count_count,methylation_level_count,total_read_count_model,methylation_level_model
str,i64,i64,i64,f64,i64,f64
"""chr1""",10468,10469,13,0.769,13,0.868
"""chr1""",10470,10471,13,0.769,13,0.878
"""chr1""",10483,10484,13,0.923,13,0.929
"""chr1""",10488,10489,14,1.0,14,0.95
"""chr1""",10492,10493,13,1.0,13,0.964
…,…,…,…,…,…,…
"""chrY""",56887220,56887221,38,0.947,38,0.962
"""chrY""",56887399,56887400,38,0.737,38,0.942
"""chrY""",56887579,56887580,38,0.895,38,0.945
"""chrY""",56887592,56887593,38,0.579,38,0.721


## Methylation levels are computed at CpG sites observed in the sample (which may or may not be in the reference)

The default value of the `--modsites-mode` argument of `aligned_bam_to_cpg_scores` is `denovo`, meaning that DNA methylation levels are computed at all CG sites in the sample's haplotypes: 

https://github.com/PacificBiosciences/pb-CpG-tools?tab=readme-ov-file#output-modes-and-option-details

https://quinlangroup.slack.com/archives/C0803TM7X0X/p1759348751929209

https://quinlangroup.slack.com/archives/C0803TM7X0X/p1759349045861589

## CpG site creation: Sites that are CpG in at least one haplotype of the sample, but not CpG in the reference sequence

In [5]:
# IGV snapshots of example sites: 
# https://quinlangroup.slack.com/archives/C0803TM7X0X/p1759879585412219 
# https://quinlangroup.slack.com/archives/C0803TM7X0X/p1759880211882149
# https://quinlangroup.slack.com/archives/C0803TM7X0X/p1759880434142149 
# https://quinlangroup.slack.com/archives/C0803TM7X0X/p1759880669955469

# These sites also appear as variants in /scratch/ucgd/lustre-labs/quinlan/data-shared/read-backed-phasing/200081.GRCh38.deepvariant.glnexus.phased.vcf.gz
# e.g., 
# $ tabix 200081.GRCh38.deepvariant.glnexus.phased.vcf.gz chr1:10623-10623 
# chr1    10623   chr1_10623_T_C  T       C       36      .       AF=1;AQ=36      GT:DP:AD:GQ:PL:RNC      1/1:23:0,23:22:33,22,0:..

# Anti-join: keep the rows of DF_METH_UNPHASED whose (chrom, start, end) has no match in
# DF_ALL_CPGS_IN_REFERENCE, i.e. sites with reported methylation that are absent from the reference CpG list.
DF_METH_UNPHASED.join(DF_ALL_CPGS_IN_REFERENCE, on=['chrom', 'start', 'end'], how='anti')

chrom,start,end,total_read_count_count,methylation_level_count,total_read_count_model,methylation_level_model
str,i64,i64,i64,f64,i64,f64
"""chr1""",10622,10623,15,0.8,15,0.919
"""chr1""",10804,10805,13,0.692,13,0.708
"""chr1""",10820,10821,13,0.615,13,0.778
"""chr1""",10828,10829,14,0.571,14,0.652
"""chr1""",10925,10926,14,0.786,14,0.949
…,…,…,…,…,…,…
"""chrY""",56885796,56885797,41,0.854,41,0.953
"""chrY""",56885831,56885832,41,0.805,41,0.948
"""chrY""",56885915,56885916,41,0.732,41,0.9
"""chrY""",56886309,56886310,41,0.854,41,0.927


## Sites that are CpG in the reference, but at which unphased DNA methylation is not reported

These sites fall into two classes: 

1. CpG site destruction: A variant destroyed the CpG (relative to the reference sequence), e.g., <https://quinlangroup.slack.com/archives/C0803TM7X0X/p1759955795431799>. These variants also appear in the VCF.
2. The site is CpG in the sample, but read coverage was too low to report a reliable estimate of DNA methylation.

In [6]:
# Anti-join: keep the reference CpG sites whose (chrom, start, end) has no match in
# DF_METH_UNPHASED, i.e. reference CpGs for which no unphased methylation is reported.
DF_ALL_CPGS_IN_REFERENCE.join(DF_METH_UNPHASED, on=['chrom', 'start', 'end'], how='anti')

chrom,start,end
str,i64,i64
"""chr1""",10930,10931
"""chr1""",10933,10934
"""chr1""",11166,11167
"""chr1""",12781,12782
"""chr1""",13301,13302
…,…,…
"""chrY""",56884829,56884830
"""chrY""",56885859,56885860
"""chrY""",56886407,56886408
"""chrY""",56886943,56886944


## Read in founder-phased DNA methylation at CpG sites

In [7]:
# Reload the helper module (picks up edits without a kernel restart) and bind the reader.
expand_to_all_cpgs = importlib.reload(expand_to_all_cpgs)
read_meth_founder_phased = expand_to_all_cpgs.read_meth_founder_phased

# Founder-phased methylation per CpG site for this sample.
DF_METH_FOUNDER_PHASED = read_meth_founder_phased(
    BED_METH_FOUNDER_PHASED,
)
DF_METH_FOUNDER_PHASED

chrom,start,end,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model
str,i64,i64,i64,i64,f64,i64,f64,f64,str,str,f64,f64,f64,f64
"""chr1""",14061,14062,,,,,,,,,,,,
"""chr1""",14178,14179,,,,,,,,,,,,
"""chr1""",14348,14349,,,,,,,,,,,,
"""chr1""",14353,14354,,,,,,,,,,,,
"""chr1""",14434,14435,,,,,,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chrY""",56887220,56887221,,,,,,,,,,,,
"""chrY""",56887399,56887400,,,,,,,,,,,,
"""chrY""",56887579,56887580,,,,,,,,,,,,
"""chrY""",56887592,56887593,,,,,,,,,,,,


## Expand the dataframe of founder-phased methylation levels to include all CpG sites in the reference and sample genomes, along with unphased methylation levels (where available)

In [8]:
# Reload the helper module (picks up edits without a kernel restart) and bind the function.
expand_to_all_cpgs = importlib.reload(expand_to_all_cpgs)
expand_meth_to_all_cpgs = expand_to_all_cpgs.expand_meth_to_all_cpgs

# Combine the three frames built above: reference CpG sites, unphased methylation,
# and founder-phased methylation.
DF_METH_FOUNDER_PHASED_ALL_CPGS = expand_meth_to_all_cpgs(
    DF_ALL_CPGS_IN_REFERENCE,
    DF_METH_UNPHASED,
    DF_METH_FOUNDER_PHASED,
)
DF_METH_FOUNDER_PHASED_ALL_CPGS

chrom,start,end,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,f64,f64,str,str,f64,f64,f64,f64
"""chr1""",10468,10469,13,0.769,0.868,,,,,,,,,,,,
"""chr1""",10470,10471,13,0.769,0.878,,,,,,,,,,,,
"""chr1""",10483,10484,13,0.923,0.929,,,,,,,,,,,,
"""chr1""",10488,10489,14,1.0,0.95,,,,,,,,,,,,
"""chr1""",10492,10493,13,1.0,0.964,,,,,,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chrY""",56887399,56887400,38,0.737,0.942,,,,,,,,,,,,
"""chrY""",56887579,56887580,38,0.895,0.945,,,,,,,,,,,,
"""chrY""",56887581,56887582,,,,,,,,,,,,,,,
"""chrY""",56887592,56887593,38,0.579,0.721,,,,,,,,,,,,


## Add proximity of each CpG site to heterozygous sites at which bit-vectors are mismatched 

In [9]:
importlib.reload(expand_to_all_cpgs)
from expand_to_all_cpgs import compute_proximity_to_mismatched_heterozygous_sites

# This cell rebinds DF_METH_FOUNDER_PHASED_ALL_CPGS to the helper's result, so a second
# execution would otherwise feed the already-augmented frame back into the helper.
# Dropping the flag column from the input (a no-op on the first run, when the column
# does not exist yet) makes the cell safe to re-run on its own.
_MISMATCH_FLAG_COLUMN = 'is_within_50bp_of_mismatch_site'
_columns_without_flag = [
    name for name in DF_METH_FOUNDER_PHASED_ALL_CPGS.columns if name != _MISMATCH_FLAG_COLUMN
]

# Adds the boolean column "is_within_50bp_of_mismatch_site" using the bed file of
# heterozygous sites at which bit-vectors are mismatched.
DF_METH_FOUNDER_PHASED_ALL_CPGS = compute_proximity_to_mismatched_heterozygous_sites(
    DF_METH_FOUNDER_PHASED_ALL_CPGS.select(_columns_without_flag),
    BED_HET_SITE_MISMATCHES,
)
DF_METH_FOUNDER_PHASED_ALL_CPGS

chrom,start,end,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,is_within_50bp_of_mismatch_site
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool
"""chr1""",10468,10469,13,0.769,0.868,,,,,,,,,,,,,false
"""chr1""",10470,10471,13,0.769,0.878,,,,,,,,,,,,,false
"""chr1""",10483,10484,13,0.923,0.929,,,,,,,,,,,,,false
"""chr1""",10488,10489,14,1.0,0.95,,,,,,,,,,,,,false
"""chr1""",10492,10493,13,1.0,0.964,,,,,,,,,,,,,false
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chrY""",56887399,56887400,38,0.737,0.942,,,,,,,,,,,,,
"""chrY""",56887579,56887580,38,0.895,0.945,,,,,,,,,,,,,
"""chrY""",56887581,56887582,,,,,,,,,,,,,,,,
"""chrY""",56887592,56887593,38,0.579,0.721,,,,,,,,,,,,,


In [10]:
# Sanity check: methylation is not phased on chrM (>2 copies per cell),
# so no chrM row should be assigned to a hap-map block, i.e. the result should be empty.
DF_METH_FOUNDER_PHASED_ALL_CPGS.filter(
    (pl.col('chrom') == 'chrM') &
    (pl.col('start_hap_map_block').is_not_null())
)

chrom,start,end,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,is_within_50bp_of_mismatch_site
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool


In [11]:
# Sanity check: methylation is not phased on chrY (<2 copies per cell),
# so no chrY row should be assigned to a hap-map block, i.e. the result should be empty.
DF_METH_FOUNDER_PHASED_ALL_CPGS.filter(
    (pl.col('chrom') == 'chrY') &
    (pl.col('start_hap_map_block').is_not_null())
)

chrom,start,end,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,is_within_50bp_of_mismatch_site
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool


In [12]:
# Rows in which "is_within_50bp_of_mismatch_site" is null.
# Since we don't phase methylation on chrM (>2 copies per cell) and chrY (<2 copies per cell), there are null values in this column.
# NOTE(review): the displayed preview is truncated and shows only chrM and chrY rows — confirm that no other chromosome contributes nulls.
DF_METH_FOUNDER_PHASED_ALL_CPGS.filter(pl.col('is_within_50bp_of_mismatch_site').is_null())

chrom,start,end,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,is_within_50bp_of_mismatch_site
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool
"""chrM""",32,33,192,0.25,0.071,,,,,,,,,,,,,
"""chrM""",60,61,190,0.242,0.039,,,,,,,,,,,,,
"""chrM""",77,78,190,0.089,0.029,,,,,,,,,,,,,
"""chrM""",79,80,190,0.121,0.034,,,,,,,,,,,,,
"""chrM""",90,91,190,0.232,0.032,,,,,,,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chrY""",56887399,56887400,38,0.737,0.942,,,,,,,,,,,,,
"""chrY""",56887579,56887580,38,0.895,0.945,,,,,,,,,,,,,
"""chrY""",56887581,56887582,,,,,,,,,,,,,,,,
"""chrY""",56887592,56887593,38,0.579,0.721,,,,,,,,,,,,,


In [13]:
# chrX sites with a reported total read count.
# The counts are about half of 30X, which suggests a single copy of chrX, i.e., a male sample.
DF_METH_FOUNDER_PHASED_ALL_CPGS.filter(
    (pl.col('chrom') == 'chrX') &
    (pl.col('total_read_count').is_not_null())
)

chrom,start,end,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,is_within_50bp_of_mismatch_site
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool
"""chrX""",25567,25568,10,1.0,0.962,,,,,,,,,,,,,false
"""chrX""",25575,25576,10,0.8,0.956,,,,,,,,,,,,,false
"""chrX""",25631,25632,10,0.7,0.732,,,,,,,,,,,,,false
"""chrX""",25638,25639,10,0.9,0.96,,,,,,,,,,,,,false
"""chrX""",25647,25648,10,1.0,0.969,,,,,,,,,,,,,false
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chrX""",156020271,156020272,11,0.727,0.866,,,,,,,,,,,,,false
"""chrX""",156020287,156020288,11,0.909,0.933,,,,,,,,,,,,,false
"""chrX""",156020376,156020377,10,0.8,0.885,,,,,,,,,,,,,false
"""chrX""",156030064,156030065,15,0.133,0.429,,,,,,,,,,,,,false


In [14]:
# chrX sites that were assigned to a hap-map block.
# A male has a single X, inherited from the mother (the Y comes from the father), yet these rows
# attribute reads to the paternal haplotype, so this phasing is probably incorrect due to technical errors.
# NOTE(review): some of these blocks may overlap pseudoautosomal regions, where a male carries
# two copies — confirm before treating every row as a phasing error.
DF_METH_FOUNDER_PHASED_ALL_CPGS.filter(
    (pl.col('chrom') == 'chrX') &
    (pl.col('start_hap_map_block').is_not_null())
)

chrom,start,end,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,is_within_50bp_of_mismatch_site
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool
"""chrX""",2241257,2241258,19,0.895,0.959,2240422,3036819,1.0,193,10,,"""A""","""G""",0.9,,0.962,,false
"""chrX""",2241263,2241264,19,0.632,0.652,2240422,3036819,1.0,193,10,,"""A""","""G""",0.7,,0.875,,false
"""chrX""",2241329,2241330,19,0.632,0.775,2240422,3036819,1.0,193,10,,"""A""","""G""",0.6,,0.872,,false
"""chrX""",2241404,2241405,19,0.421,0.27,2240422,3036819,1.0,193,10,,"""A""","""G""",0.4,,0.295,,false
"""chrX""",2241507,2241508,20,0.9,0.947,2240422,3036819,1.0,193,11,,"""A""","""G""",0.909,,0.951,,false
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chrX""",155658077,155658078,35,0.629,0.854,155616130,155659104,1.0,1,20,15,"""A""","""K""",0.75,0.467,0.956,0.491,false
"""chrX""",155658233,155658234,34,0.588,0.721,155616130,155659104,1.0,1,19,15,"""A""","""K""",0.737,0.4,0.915,0.205,false
"""chrX""",155658380,155658381,34,0.647,0.704,155616130,155659104,1.0,1,19,15,"""A""","""K""",0.737,0.533,0.854,0.574,false
"""chrX""",155658475,155658476,34,0.647,0.821,155616130,155659104,1.0,1,19,15,"""A""","""K""",0.737,0.533,0.939,0.638,false


In [15]:
# TODO: check phasing of methylation on chrX in a female sample (XX karyotype)

## An example of CpG site creation (see above)

A site that is CpG in only one haplotype of the sample, and not CpG in the reference sequence



In [16]:
# IGV snapshot of this site:
# https://quinlangroup.slack.com/archives/C0803TM7X0X/p1759880434142149

# Look up the record(s) whose interval starts at chr20:101339, one condition per filter step.
(
    DF_METH_FOUNDER_PHASED_ALL_CPGS
    .filter(pl.col("chrom") == 'chr20')
    .filter(pl.col("start") == 101339)
)

chrom,start,end,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,is_within_50bp_of_mismatch_site
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool
"""chr20""",101339,101340,45,0.4,0.478,67743,236639,1.0,258,23,22,"""A""","""I""",0.783,0.0,0.89,0.06,False


## Examples of CpG sites where phasing is partial, even though they are in hap-map blocks 

In [17]:
# chr1 sites lying strictly inside the window (3665300, 3665600):
# start > 3665300 and end < 3665600, one condition per filter step.
(
    DF_METH_FOUNDER_PHASED_ALL_CPGS
    .filter(pl.col('chrom') == 'chr1')
    .filter(pl.col('start') > 3665300)
    .filter(pl.col('end') < 3665600)
)

chrom,start,end,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,is_within_50bp_of_mismatch_site
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool
"""chr1""",3665407,3665408,31,0.742,0.875,3399126,4207029,1.0,1114,21,10.0,"""B""","""I""",0.714,0.8,0.783,0.951,False
"""chr1""",3665514,3665515,29,0.69,0.878,3399126,4207029,1.0,1114,20,,"""B""","""I""",0.6,,0.799,,False
"""chr1""",3665526,3665527,29,0.69,0.833,3399126,4207029,1.0,1114,20,,"""B""","""I""",0.7,,0.774,,False
"""chr1""",3665561,3665562,30,0.433,0.487,3399126,4207029,1.0,1114,20,10.0,"""B""","""I""",0.45,0.4,0.53,0.478,False


In [18]:
# chr1 sites lying strictly inside the window (36678000, 36680000):
# start > 36678000 and end < 36680000, one condition per filter step.
(
    DF_METH_FOUNDER_PHASED_ALL_CPGS
    .filter(pl.col('chrom') == 'chr1')
    .filter(pl.col('start') > 36678000)
    .filter(pl.col('end') < 36680000)
)

chrom,start,end,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,is_within_50bp_of_mismatch_site
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool
"""chr1""",36678211,36678212,23,0.696,0.837,36572121,36737884,1.0,179,11.0,12,"""B""","""K""",0.636,0.75,0.674,0.942,False
"""chr1""",36678518,36678519,23,0.522,0.832,36572121,36737884,1.0,179,11.0,12,"""B""","""K""",0.636,0.417,0.916,0.704,False
"""chr1""",36678538,36678539,23,0.348,0.43,36572121,36737884,1.0,179,11.0,12,"""B""","""K""",0.727,0.0,0.862,0.055,False
"""chr1""",36678689,36678690,22,0.773,0.924,36572121,36737884,1.0,179,10.0,12,"""B""","""K""",0.7,0.833,0.812,0.937,False
"""chr1""",36679081,36679082,20,0.8,0.947,36572121,36737884,1.0,179,,12,"""B""","""K""",,0.917,,0.966,False
"""chr1""",36679125,36679126,20,0.9,0.952,36572121,36737884,1.0,179,,12,"""B""","""K""",,0.917,,0.956,False
"""chr1""",36679168,36679169,20,0.75,0.907,36572121,36737884,1.0,179,,12,"""B""","""K""",,0.833,,0.957,False
"""chr1""",36679367,36679368,19,0.789,0.901,36572121,36737884,1.0,179,,12,"""B""","""K""",,0.833,,0.915,False
"""chr1""",36679693,36679694,19,0.684,0.865,36572121,36737884,1.0,179,,12,"""B""","""K""",,0.75,,0.89,False
"""chr1""",36679896,36679897,19,0.737,0.934,36572121,36737884,1.0,179,,12,"""B""","""K""",,0.667,,0.943,False


## QC Statistics 

In [26]:
# Reload the helper module (picks up edits without a kernel restart) and bind the QC function.
expand_to_all_cpgs = importlib.reload(expand_to_all_cpgs)
compute_fraction_of_cpgs_that_are_close_to_mismatches = (
    expand_to_all_cpgs.compute_fraction_of_cpgs_that_are_close_to_mismatches
)

# Reports the percentage of CpG sites that lie within 50bp of a heterozygous mismatch site.
compute_fraction_of_cpgs_that_are_close_to_mismatches(
    DF_METH_FOUNDER_PHASED_ALL_CPGS,
)

Percentage of CpG sites (in reference and sample genome, and on phasable chroms) that are within 50bp of a heterozygous mismatch site: 0.167%


In [None]:
# Reload the helper module (picks up edits without a kernel restart) and bind the QC function.
expand_to_all_cpgs = importlib.reload(expand_to_all_cpgs)
compute_fraction_of_cpgs_at_which_meth_is_phased = (
    expand_to_all_cpgs.compute_fraction_of_cpgs_at_which_meth_is_phased
)

# One report per (parental haplotype, methylation-calling mode) pair,
# visited in the order pat/count, pat/model, mat/count, mat/model.
for parental, mode in [
    ('pat', 'count'),
    ('pat', 'model'),
    ('mat', 'count'),
    ('mat', 'model'),
]:
    compute_fraction_of_cpgs_at_which_meth_is_phased(
        DF_METH_FOUNDER_PHASED_ALL_CPGS, parental, mode
    )

Percentage of CpG sites (in reference and sample genomes, and on phasable chroms) at which count-based methylation is phased to pat haplotype: 79.97%
Percentage of CpG sites (in reference and sample genomes, and on phasable chroms) at which model-based methylation is phased to pat haplotype: 79.97%
Percentage of CpG sites (in reference and sample genomes, and on phasable chroms) at which count-based methylation is phased to mat haplotype: 79.90%
Percentage of CpG sites (in reference and sample genomes, and on phasable chroms) at which model-based methylation is phased to mat haplotype: 79.90%
