## Setup 

In [1]:
import importlib
from pathlib import Path 
import sys
import polars as pl 

# Root of the shared DNA-methylation data; every methylation input/output directory below lives under it.
METH_ROOT_DIR = "/scratch/ucgd/lustre-labs/quinlan/data-shared/dna-methylation"

OUTPUT_DIR = f"{METH_ROOT_DIR}/CEPH1463.GRCh38.hifi.founder-phased.all-cpgs"
BED_ALL_CPGS_IN_REFERENCE = f"{OUTPUT_DIR}/all_cpg_sites_in_reference.bed" # output of src/write_all_cpgs.py
METH_FOUNDER_PHASED_DIR = f"{METH_ROOT_DIR}/CEPH1463.GRCh38.hifi.founder-phased" # output dir of phase_meth_to_founder_haps.py
METH_COUNT_READ_PHASED_DIR = Path(f"{METH_ROOT_DIR}/CEPH1463.GRCh38.hifi.count.read-backed-phased") # output dir of aligned_bam_to_cpg_scores (containing count-based unphased meth)
METH_MODEL_READ_PHASED_DIR = Path(f"{METH_ROOT_DIR}/CEPH1463.GRCh38.hifi.model.read-backed-phased") # output dir of aligned_bam_to_cpg_scores (containing model-based unphased meth)
UID = '200081' # much of the interpretation in this notebook is specific to this sample
BED_METH_FOUNDER_PHASED = f"{METH_FOUNDER_PHASED_DIR}/{UID}.dna-methylation.founder-phased.bed" # bed file of founder-phased methylation levels from src/phase_meth_to_founder_haps.py
BED_METH_COUNT_UNPHASED = METH_COUNT_READ_PHASED_DIR / f"{UID}.GRCh38.haplotagged.combined.bed.gz" # bed file from aligned_bam_to_cpg_scores (unphased count-based meth)
BED_METH_MODEL_UNPHASED = METH_MODEL_READ_PHASED_DIR / f"{UID}.GRCh38.haplotagged.combined.bed.gz" # bed file from aligned_bam_to_cpg_scores (unphased model-based meth)
BED_METH_FOUNDER_PHASED_ALL_CPGS = f"{OUTPUT_DIR}/{UID}.dna-methylation.founder-phased.all_cpgs.bed"
BED_HET_SITE_MISMATCHES = f"{METH_FOUNDER_PHASED_DIR}/{UID}.bit-vector-sites-mismatches.bed" # bed file of heterozygous sites at which bit-vectors are mismatched, from src/phase_meth_to_founder_haps.py

VCF_JOINT_CALLED = "/scratch/ucgd/lustre-labs/quinlan/data-shared/datasets/Palladium/deepvariant/CEPH-1463.joint.GRCh38.deepvariant.glnexus.phased.vcf.gz"

REPO_DIR = Path('/scratch/ucgd/lustre-labs/quinlan/u6018199/tapestry')
UTIL_DIR = f"{REPO_DIR}/src/util" # directory added to the import path so the notebook can import the repo's helper modules
# Guard the append so that re-running this cell does not pile up duplicate sys.path entries.
if UTIL_DIR not in sys.path:
    sys.path.append(UTIL_DIR)

## Get all CpG sites in reference genome

In [2]:
# Project helper module; presumably resolved through the src/util entry added to sys.path in the setup cell.
import expand_to_all_cpgs
# Re-execute the module so that edits made on disk are picked up without restarting the kernel.
importlib.reload(expand_to_all_cpgs)
# Must follow the reload so that the name binds to the freshly reloaded function.
from expand_to_all_cpgs import read_all_cpgs_in_reference

# Load the BED of CpG sites in the reference genome (the output of src/write_all_cpgs.py, per the setup cell).
DF_ALL_CPGS_IN_REFERENCE = read_all_cpgs_in_reference(BED_ALL_CPGS_IN_REFERENCE)
# Bare last expression: rendered as the cell output (columns chrom, start, end).
DF_ALL_CPGS_IN_REFERENCE

chrom,start,end
str,i64,i64
"""chr1""",10468,10469
"""chr1""",10470,10471
"""chr1""",10483,10484
"""chr1""",10488,10489
"""chr1""",10492,10493
…,…,…
"""chrM""",16448,16449
"""chrM""",16453,16454
"""chrM""",16494,16495
"""chrM""",16541,16542


## Read in unphased DNA methylation at CpG sites, both those in the reference genome, and those present in the sample but not in the reference genome

In [3]:
# Relies on `expand_to_all_cpgs` having been imported by an earlier cell; reload to pick up on-disk edits.
importlib.reload(expand_to_all_cpgs)
# Must follow the reload so that the name binds to the freshly reloaded function.
from expand_to_all_cpgs import read_meth_unphased

# Read the count-based and model-based unphased methylation BEDs into a single frame;
# the displayed result carries a read count and a methylation level for each of the two methods.
DF_METH_UNPHASED = read_meth_unphased(BED_METH_COUNT_UNPHASED, BED_METH_MODEL_UNPHASED) 
DF_METH_UNPHASED

chrom,start,end,total_read_count_count,methylation_level_count,total_read_count_model,methylation_level_model
str,i64,i64,i64,f64,i64,f64
"""chr1""",10468,10469,13,0.769,13,0.868
"""chr1""",10470,10471,13,0.769,13,0.878
"""chr1""",10483,10484,13,0.923,13,0.929
"""chr1""",10488,10489,14,1.0,14,0.95
"""chr1""",10492,10493,13,1.0,13,0.964
…,…,…,…,…,…,…
"""chrM""",16426,16427,192,0.276,192,0.037
"""chrM""",16448,16449,192,0.161,192,0.029
"""chrM""",16453,16454,192,0.146,192,0.032
"""chrM""",16494,16495,192,0.083,192,0.043


## Methylation levels are computed at CpG sites observed in the sample (which may or may not be in the reference)

The default value of the `--modsites-mode` argument of `aligned_bam_to_cpg_scores` is `denovo`, meaning that DNA methylation levels are computed at all CG sites in the sample's haplotypes: 

https://github.com/PacificBiosciences/pb-CpG-tools?tab=readme-ov-file#output-modes-and-option-details

https://quinlangroup.slack.com/archives/C0803TM7X0X/p1759348751929209

https://quinlangroup.slack.com/archives/C0803TM7X0X/p1759349045861589

## CpG site creation: Sites that are CpG in at least one haplotype of the sample, but not CpG in the reference sequence

In [5]:
# IGV snapshots: 
# https://quinlangroup.slack.com/archives/C0803TM7X0X/p1759879585412219 
# https://quinlangroup.slack.com/archives/C0803TM7X0X/p1759880211882149
# https://quinlangroup.slack.com/archives/C0803TM7X0X/p1759880434142149 
# https://quinlangroup.slack.com/archives/C0803TM7X0X/p1759880669955469

# These sites also appear as variants in /scratch/ucgd/lustre-labs/quinlan/data-shared/read-backed-phasing/200081.GRCh38.deepvariant.glnexus.phased.vcf.gz
# e.g., 
# $ tabix 200081.GRCh38.deepvariant.glnexus.phased.vcf.gz chr1:10623-10623 
# chr1    10623   chr1_10623_T_C  T       C       36      .       AF=1;AQ=36      GT:DP:AD:GQ:PL:RNC      1/1:23:0,23:22:33,22,0:..

# Anti-join: keep the sites with reported unphased methylation that have NO matching (chrom, start, end)
# among the reference CpG sites, i.e., sites that are CpG in the sample but not in the reference.
DF_METH_UNPHASED.join(DF_ALL_CPGS_IN_REFERENCE, on=['chrom', 'start', 'end'], how='anti')

chrom,start,end,total_read_count_count,methylation_level_count,total_read_count_model,methylation_level_model
str,i64,i64,i64,f64,i64,f64
"""chr1""",10622,10623,15,0.8,15,0.919
"""chr1""",10804,10805,13,0.692,13,0.708
"""chr1""",10820,10821,13,0.615,13,0.778
"""chr1""",10828,10829,14,0.571,14,0.652
"""chr1""",10925,10926,14,0.786,14,0.949
…,…,…,…,…,…,…
"""chrY""",56885915,56885916,41,0.732,41,0.9
"""chrY""",56886309,56886310,41,0.854,41,0.927
"""chrY""",56887592,56887593,38,0.579,38,0.721
"""chrM""",261,262,188,0.154,188,0.024


## Sites that are CpG in the reference, but at which unphased DNA methylation is not reported

These sites fall into two classes: 

1. CpG site destruction: A variant destroyed the CpG (relative to the reference sequence), e.g., https://quinlangroup.slack.com/archives/C0803TM7X0X/p1759955795431799 We see these variants in the VCF too. 
2. The site is CpG in the sample, but read coverage was too low to report a reliable estimate of DNA methylation 

In [6]:
# Anti-join in the opposite direction: keep the reference CpG sites that have NO matching (chrom, start, end)
# row in the unphased methylation frame, i.e., reference CpGs at which no unphased methylation is reported.
DF_ALL_CPGS_IN_REFERENCE.join(DF_METH_UNPHASED, on=['chrom', 'start', 'end'], how='anti')

chrom,start,end
str,i64,i64
"""chr1""",10930,10931
"""chr1""",10933,10934
"""chr1""",11166,11167
"""chr1""",12781,12782
"""chr1""",13301,13302
…,…,…
"""chrY""",56886943,56886944
"""chrY""",56887581,56887582
"""chrM""",7335,7336
"""chrM""",14829,14830


## Read in founder-phased DNA methylation at CpG sites

In [7]:
# Relies on `expand_to_all_cpgs` having been imported by an earlier cell; reload to pick up on-disk edits.
importlib.reload(expand_to_all_cpgs)
# Must follow the reload so that the name binds to the freshly reloaded function.
from expand_to_all_cpgs import read_meth_founder_phased

# Read the founder-phased methylation BED (produced by src/phase_meth_to_founder_haps.py, per the setup cell).
# In the displayed rows, the hap-map-block and pat/mat columns are null where a site was not phased.
DF_METH_FOUNDER_PHASED = read_meth_founder_phased(BED_METH_FOUNDER_PHASED)
DF_METH_FOUNDER_PHASED

chrom,start,end,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model
str,i64,i64,i64,i64,f64,i64,f64,f64,str,str,f64,f64,f64,f64
"""chr1""",14061,14062,,,,,,,,,,,,
"""chr1""",14178,14179,,,,,,,,,,,,
"""chr1""",14348,14349,,,,,,,,,,,,
"""chr1""",14353,14354,,,,,,,,,,,,
"""chr1""",14434,14435,,,,,,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chrY""",56887220,56887221,,,,,,,,,,,,
"""chrY""",56887399,56887400,,,,,,,,,,,,
"""chrY""",56887579,56887580,,,,,,,,,,,,
"""chrY""",56887592,56887593,,,,,,,,,,,,


## Expand the dataframe of founder-phased methylation levels to include all CpG sites in reference and sample genome, and unphased methylation levels (where available)

In [8]:
# Relies on `expand_to_all_cpgs` having been imported by an earlier cell; reload to pick up on-disk edits.
importlib.reload(expand_to_all_cpgs)
# Must follow the reload so that the name binds to the freshly reloaded function.
from expand_to_all_cpgs import expand_meth_to_all_cpgs

# Combine the three frames built above (reference CpGs, unphased methylation, founder-phased methylation).
# The displayed result has one row per CpG site with a single `total_read_count` column, the unphased
# count/model methylation levels, and the founder-phased columns (null where unavailable).
DF_METH_FOUNDER_PHASED_ALL_CPGS = expand_meth_to_all_cpgs(DF_ALL_CPGS_IN_REFERENCE, DF_METH_UNPHASED, DF_METH_FOUNDER_PHASED)
DF_METH_FOUNDER_PHASED_ALL_CPGS

chrom,start,end,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,f64,f64,str,str,f64,f64,f64,f64
"""chr1""",10468,10469,13,0.769,0.868,,,,,,,,,,,,
"""chr1""",10470,10471,13,0.769,0.878,,,,,,,,,,,,
"""chr1""",10483,10484,13,0.923,0.929,,,,,,,,,,,,
"""chr1""",10488,10489,14,1.0,0.95,,,,,,,,,,,,
"""chr1""",10492,10493,13,1.0,0.964,,,,,,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chr3""",93218694,93218695,83,0.566,0.558,,,,,,,,,,,,
"""chr1""",124175332,124175333,13,0.308,0.379,,,,,,,,,,,,
"""chrX""",62257330,62257331,17,0.765,0.892,,,,,,,,,,,,
"""chr2""",201063760,201063761,38,0.816,0.951,200913994,201867283,1.0,486,16.0,22.0,"""B""","""I""",0.875,0.773,0.961,0.931


## Add proximity of each CpG site to heterozygous sites at which bit-vectors are mismatched 

In [9]:
# Relies on `expand_to_all_cpgs` having been imported by an earlier cell; reload to pick up on-disk edits.
importlib.reload(expand_to_all_cpgs)
# Must follow the reload so that the name binds to the freshly reloaded function.
from expand_to_all_cpgs import compute_proximity_to_mismatched_heterozygous_sites

# Annotate each CpG site using the BED of heterozygous sites with mismatched bit-vectors;
# the displayed result gains a boolean `is_within_50bp_of_mismatch_site` column.
# NOTE(review): this rebinds DF_METH_FOUNDER_PHASED_ALL_CPGS to a function of itself, so re-running
# this cell alone feeds the already-annotated frame back in. Whether that is harmless depends on the
# helper's implementation (not visible here) - re-run the previous cell first, or confirm idempotence.
DF_METH_FOUNDER_PHASED_ALL_CPGS = compute_proximity_to_mismatched_heterozygous_sites(DF_METH_FOUNDER_PHASED_ALL_CPGS, BED_HET_SITE_MISMATCHES)
DF_METH_FOUNDER_PHASED_ALL_CPGS

chrom,start,end,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,is_within_50bp_of_mismatch_site
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool
"""chr21""",5010008,5010009,,,,,,,,,,,,,,,,false
"""chr21""",5010053,5010054,,,,,,,,,,,,,,,,false
"""chr21""",5010215,5010216,,,,,,,,,,,,,,,,false
"""chr21""",5010331,5010332,,,,,,,,,,,,,,,,false
"""chr21""",5010335,5010336,,,,,,,,,,,,,,,,false
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chr6""",168379625,168379626,31,0.323,0.523,168157897,169033370,1.0,1676,14,17,"""B""","""G""",0.714,0.0,0.835,0.044,false
"""chr6""",59046365,59046366,551,0.751,0.887,,,,,,,,,,,,,false
"""chr6""",32885726,32885727,43,0.512,0.498,32583991,33246527,1.0,4619,20,23,"""B""","""G""",0.4,0.609,0.384,0.674,false
"""chr6""",131727479,131727480,35,0.257,0.464,131339161,132225418,1.0,974,12,23,"""A""","""E""",0.75,0.0,0.944,0.054,false


## We don't phase methylation on chrM and chrY, and a note about the sample's sex

In [10]:
# chrM is present at >2 copies per cell, so we don't phase methylation on it:
# no chrM row should carry a hap-map block, i.e., this selection is expected to be empty.
DF_METH_FOUNDER_PHASED_ALL_CPGS.filter(
    (pl.col('chrom') == 'chrM') & pl.col('start_hap_map_block').is_not_null()
)

chrom,start,end,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,is_within_50bp_of_mismatch_site
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool


In [11]:
# chrY is present at <2 copies per cell, so we don't phase methylation on it:
# no chrY row should carry a hap-map block, i.e., this selection is expected to be empty.
DF_METH_FOUNDER_PHASED_ALL_CPGS.filter(
    (pl.col('chrom') == 'chrY') & pl.col('start_hap_map_block').is_not_null()
)

chrom,start,end,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,is_within_50bp_of_mismatch_site
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool


In [12]:
# Rows whose "is_within_50bp_of_mismatch_site" flag is null.
# Since we don't phase methylation on chrM (>2 copies per cell) and chrY (<2 copies per cell),
# the flag is left null on those chromosomes:
DF_METH_FOUNDER_PHASED_ALL_CPGS.filter(pl.col('is_within_50bp_of_mismatch_site').is_null())

chrom,start,end,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,is_within_50bp_of_mismatch_site
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool
"""chrM""",32,33,192,0.25,0.071,,,,,,,,,,,,,
"""chrM""",60,61,190,0.242,0.039,,,,,,,,,,,,,
"""chrM""",77,78,190,0.089,0.029,,,,,,,,,,,,,
"""chrM""",79,80,190,0.121,0.034,,,,,,,,,,,,,
"""chrM""",90,91,190,0.232,0.032,,,,,,,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chrY""",56826241,56826242,526,0.394,0.636,,,,,,,,,,,,,
"""chrY""",11316800,11316801,1650,0.433,0.663,,,,,,,,,,,,,
"""chrY""",56838569,56838570,223,0.0,0.146,,,,,,,,,,,,,
"""chrY""",10769316,10769317,25,0.36,0.664,,,,,,,,,,,,,


In [13]:
# chrX sites at which an unphased read count is reported.
# The total read count (about half of 30X) suggests just one copy of chrX, i.e., a male:
DF_METH_FOUNDER_PHASED_ALL_CPGS.filter(
    (pl.col('chrom') == 'chrX') & pl.col('total_read_count').is_not_null()
)

chrom,start,end,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,is_within_50bp_of_mismatch_site
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool
"""chrX""",25567,25568,10,1.0,0.962,,,,,,,,,,,,,false
"""chrX""",25575,25576,10,0.8,0.956,,,,,,,,,,,,,false
"""chrX""",25631,25632,10,0.7,0.732,,,,,,,,,,,,,false
"""chrX""",25638,25639,10,0.9,0.96,,,,,,,,,,,,,false
"""chrX""",25647,25648,10,1.0,0.969,,,,,,,,,,,,,false
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chrX""",101367903,101367904,34,0.412,0.473,101280328,101405223,1.0,21,16,18,"""A""","""K""",0.0,0.778,0.056,0.867,false
"""chrX""",36854166,36854167,42,0.286,0.332,36628813,37290261,1.0,272,17,25,"""A""","""K""",0.706,0.0,0.939,0.047,false
"""chrX""",59925528,59925529,15,0.533,0.583,,,,,,,,,,,,,false
"""chrX""",69057989,69057990,24,0.333,0.499,68700102,69322531,1.0,219,14,10,"""A""","""K""",0.0,0.8,0.063,0.948,false


In [14]:
# chrX sites that were assigned to a hap-map block.
# This phasing is probably incorrect due to technical errors: a male has only one X, which must come
# from the mother (the Y comes from the father), whereas these data say that the X comes from the father.
# NOTE(review): the first rows shown (chrX:~2.24Mb) may lie in the pseudoautosomal region, where a
# paternal copy is expected - TODO confirm before treating those rows as phasing errors.
DF_METH_FOUNDER_PHASED_ALL_CPGS.filter(
    (pl.col('chrom') == 'chrX') & pl.col('start_hap_map_block').is_not_null()
)

chrom,start,end,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,is_within_50bp_of_mismatch_site
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool
"""chrX""",2241257,2241258,19,0.895,0.959,2240422,3036819,1.0,193,10,,"""A""","""G""",0.9,,0.962,,false
"""chrX""",2241263,2241264,19,0.632,0.652,2240422,3036819,1.0,193,10,,"""A""","""G""",0.7,,0.875,,false
"""chrX""",2241329,2241330,19,0.632,0.775,2240422,3036819,1.0,193,10,,"""A""","""G""",0.6,,0.872,,false
"""chrX""",2241404,2241405,19,0.421,0.27,2240422,3036819,1.0,193,10,,"""A""","""G""",0.4,,0.295,,false
"""chrX""",2241507,2241508,20,0.9,0.947,2240422,3036819,1.0,193,11,,"""A""","""G""",0.909,,0.951,,false
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chrX""",115081911,115081912,40,0.325,0.475,114871636,115273961,1.0,134,16,24,"""A""","""K""",0.0,0.542,0.064,0.853,false
"""chrX""",23518727,23518728,33,0.394,0.528,23469232,24624260,1.0,924,16,17,"""A""","""K""",0.813,0.0,0.962,0.043,false
"""chrX""",101367903,101367904,34,0.412,0.473,101280328,101405223,1.0,21,16,18,"""A""","""K""",0.0,0.778,0.056,0.867,false
"""chrX""",36854166,36854167,42,0.286,0.332,36628813,37290261,1.0,272,17,25,"""A""","""K""",0.706,0.0,0.939,0.047,false


In [15]:
# TODO: check phasing of methylation on chrX in a female sample (XX karyotype)

## Examples of CpG sites where phasing is partial, even though they are in hap-map blocks 

In [16]:
# CpG sites lying strictly inside the window chr1:3665300-3665600 (both bounds exclusive).
(
    DF_METH_FOUNDER_PHASED_ALL_CPGS
    .filter(pl.col('chrom') == 'chr1')
    .filter(pl.col('start') > 3665300)
    .filter(pl.col('end') < 3665600)
)

chrom,start,end,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,is_within_50bp_of_mismatch_site
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool
"""chr1""",3665407,3665408,31,0.742,0.875,3399126,4207029,1.0,1114,21,10.0,"""B""","""I""",0.714,0.8,0.783,0.951,False
"""chr1""",3665514,3665515,29,0.69,0.878,3399126,4207029,1.0,1114,20,,"""B""","""I""",0.6,,0.799,,False
"""chr1""",3665526,3665527,29,0.69,0.833,3399126,4207029,1.0,1114,20,,"""B""","""I""",0.7,,0.774,,False
"""chr1""",3665561,3665562,30,0.433,0.487,3399126,4207029,1.0,1114,20,10.0,"""B""","""I""",0.45,0.4,0.53,0.478,False


In [17]:
# CpG sites lying strictly inside the window chr1:36678000-36680000 (both bounds exclusive).
(
    DF_METH_FOUNDER_PHASED_ALL_CPGS
    .filter(pl.col('chrom') == 'chr1')
    .filter(pl.col('start') > 36678000)
    .filter(pl.col('end') < 36680000)
)

chrom,start,end,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,is_within_50bp_of_mismatch_site
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool
"""chr1""",36678211,36678212,23,0.696,0.837,36572121,36737884,1.0,179,11.0,12,"""B""","""K""",0.636,0.75,0.674,0.942,False
"""chr1""",36678518,36678519,23,0.522,0.832,36572121,36737884,1.0,179,11.0,12,"""B""","""K""",0.636,0.417,0.916,0.704,False
"""chr1""",36678538,36678539,23,0.348,0.43,36572121,36737884,1.0,179,11.0,12,"""B""","""K""",0.727,0.0,0.862,0.055,False
"""chr1""",36678689,36678690,22,0.773,0.924,36572121,36737884,1.0,179,10.0,12,"""B""","""K""",0.7,0.833,0.812,0.937,False
"""chr1""",36679081,36679082,20,0.8,0.947,36572121,36737884,1.0,179,,12,"""B""","""K""",,0.917,,0.966,False
"""chr1""",36679125,36679126,20,0.9,0.952,36572121,36737884,1.0,179,,12,"""B""","""K""",,0.917,,0.956,False
"""chr1""",36679168,36679169,20,0.75,0.907,36572121,36737884,1.0,179,,12,"""B""","""K""",,0.833,,0.957,False
"""chr1""",36679367,36679368,19,0.789,0.901,36572121,36737884,1.0,179,,12,"""B""","""K""",,0.833,,0.915,False
"""chr1""",36679693,36679694,19,0.684,0.865,36572121,36737884,1.0,179,,12,"""B""","""K""",,0.75,,0.89,False
"""chr1""",36679896,36679897,19,0.737,0.934,36572121,36737884,1.0,179,,12,"""B""","""K""",,0.667,,0.943,False


## QC Statistics 

In [18]:
# Relies on `expand_to_all_cpgs` having been imported by an earlier cell; reload to pick up on-disk edits.
importlib.reload(expand_to_all_cpgs)
# Must follow the reload so that the name binds to the freshly reloaded function.
from expand_to_all_cpgs import compute_fraction_of_cpgs_that_are_close_to_mismatches

# QC: the cell output reports the percentage of CpG sites (on phasable chroms) that lie within 50bp
# of a heterozygous mismatch site.
compute_fraction_of_cpgs_that_are_close_to_mismatches(DF_METH_FOUNDER_PHASED_ALL_CPGS)

Percentage of CpG sites (in reference and sample genome, and on phasable chroms) that are within 50bp of a heterozygous mismatch site: 0.173%


In [19]:
# Relies on `expand_to_all_cpgs` having been imported by an earlier cell; reload to pick up on-disk edits.
importlib.reload(expand_to_all_cpgs)
# Must follow the reload so that the name binds to the freshly reloaded function.
from expand_to_all_cpgs import compute_fraction_of_cpgs_at_which_meth_is_phased_wrapper

# QC: the cell output reports, for count-based and model-based methylation, the percentage of CpG sites
# (on phasable chroms) phased to the pat / mat / either / both haplotypes, and with unphased methylation.
compute_fraction_of_cpgs_at_which_meth_is_phased_wrapper(DF_METH_FOUNDER_PHASED_ALL_CPGS)

Percentage of CpG sites (in reference and sample genomes, and on phasable chroms) at which count-based methylation is phased to pat haplotype: 80.58%
Percentage of CpG sites (in reference and sample genomes, and on phasable chroms) at which count-based methylation is phased to mat haplotype: 80.53%
Percentage of CpG sites (in reference and sample genomes, and on phasable chroms) at which count-based methylation is phased to at least one parental haplotype: 84.25%
Percentage of CpG sites (in reference and sample genomes, and on phasable chroms) at which count-based methylation is phased to both parental haplotypes: 76.86%
Percentage of CpG sites (in reference and sample genomes, and on phasable chroms) at which count-based unphased methylation is reported: 97.45%
Percentage of CpG sites (in reference and sample genomes, and on phasable chroms) at which model-based methylation is phased to pat haplotype: 80.58%
Percentage of CpG sites (in reference and sample genomes, and on phasable chr

## Overlap CpGs with joint-called SNVs 

In [20]:
# Motivation: 
# slides: https://docs.google.com/presentation/d/11Pfax0wXh0E68C287lMaPoPvhq-OrFGxFOKE1gWOkDI/edit?slide=id.g39893c07c75_0_0#slide=id.g39893c07c75_0_0 
# slack thread: https://quinlangroup.slack.com/archives/C0803TM7X0X/p1762565840460019?thread_ts=1759348751.929209&cid=C0803TM7X0X 

In [72]:
# Relies on `expand_to_all_cpgs` having been imported by an earlier cell; reload to pick up on-disk edits.
importlib.reload(expand_to_all_cpgs)
# Must follow the reload so that the name binds to the freshly reloaded function.
from expand_to_all_cpgs import get_joint_called_variants

# Global polars display setting: show up to 30 table rows. It persists across cells until changed
# (a later cell sets it back to 10).
pl.Config.set_tbl_rows(30)

# Extract this sample's genotypes from the joint-called VCF; the displayed result has one row per
# variant with columns chrom, start, end, allele_1, allele_2 ("." marks an allele with no call).
DF_JOINT_CALLED_VARIANTS = get_joint_called_variants(UID, VCF_JOINT_CALLED) # TESTING: 'CEPH-1463.joint.GRCh38.deepvariant.glnexus.phased.vcf'
DF_JOINT_CALLED_VARIANTS

chrom,start,end,allele_1,allele_2
str,i64,i64,str,str
"""chr1""",10290,10291,"""0""","""0"""
"""chr1""",10296,10297,""".""","""."""
"""chr1""",10302,10303,"""0""","""0"""
"""chr1""",10308,10309,"""0""","""0"""
"""chr1""",10314,10315,""".""","""."""
"""chr1""",10449,10450,"""0""","""0"""
"""chr1""",10491,10492,""".""","""."""
"""chr1""",10531,10532,""".""","""."""
"""chr1""",10591,10592,"""0""","""0"""
"""chr1""",10602,10603,"""0""","""0"""


In [None]:
# An example of CpG site creation 
# A site that is CpG in only one haplotype of the sample, and not CpG in the reference sequence

# IGV snapshot: 
# https://quinlangroup.slack.com/archives/C0803TM7X0X/p1759880434142149

In [73]:
# Global polars display setting: limit rendered tables to 10 rows again.
pl.Config.set_tbl_rows(10)

# variant at this CpG site (heterozygous, per the displayed genotype):
(
    DF_JOINT_CALLED_VARIANTS
    .filter(pl.col("chrom") == 'chr20')
    .filter(pl.col("start") == 101340)
)

chrom,start,end,allele_1,allele_2
str,i64,i64,str,str
"""chr20""",101340,101341,"""0""","""1"""


In [74]:
# Methylation at this CpG site is reported as 0.0 on one haplotype.
# It should be None instead, as there is no CpG on that haplotype.
(
    DF_METH_FOUNDER_PHASED_ALL_CPGS
    .filter(pl.col("chrom") == 'chr20')
    .filter(pl.col("start") == 101339)
)

chrom,start,end,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,is_within_50bp_of_mismatch_site
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool
"""chr20""",101339,101340,45,0.4,0.478,67743,236639,1.0,258,23,22,"""A""","""I""",0.783,0.0,0.89,0.06,False


In [75]:
# Relies on `expand_to_all_cpgs` having been imported by an earlier cell; reload to pick up on-disk edits.
importlib.reload(expand_to_all_cpgs)
# Must follow the reload so that the name binds to the freshly reloaded function.
from expand_to_all_cpgs import label_with_variants

# Label each CpG site with the joint-called variants that overlap it. In the displayed result the CpG
# interval columns are named start_cpg/end_cpg and span 2bp (end_cpg = start_cpg + 2), and the frame
# gains start_variant, end_variant, allele_1, allele_2 and num_SNVs_overlapping_CG.
DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL = label_with_variants(DF_METH_FOUNDER_PHASED_ALL_CPGS, DF_JOINT_CALLED_VARIANTS)
DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL

chrom,start_cpg,end_cpg,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,cpg_is_within_50bp_of_mismatch_site,start_variant,end_variant,allele_1,allele_2,num_SNVs_overlapping_CG
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool,i64,i64,str,str,u32
"""chr21""",5010008,5010010,,,,,,,,,,,,,,,,false,,,,,0
"""chr21""",5010053,5010055,,,,,,,,,,,,,,,,false,,,,,0
"""chr21""",5010215,5010217,,,,,,,,,,,,,,,,false,,,,,0
"""chr21""",5010331,5010333,,,,,,,,,,,,,,,,false,,,,,0
"""chr21""",5010335,5010337,,,,,,,,,,,,,,,,false,,,,,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chr6""",168379625,168379627,31,0.323,0.523,168157897,169033370,1.0,1676,14,17,"""B""","""G""",0.714,0.0,0.835,0.044,false,168379626,168379627,""".""",""".""",1
"""chr6""",59046365,59046367,551,0.751,0.887,,,,,,,,,,,,,false,59046365,59046366,"""1""","""1""",1
"""chr6""",32885726,32885728,43,0.512,0.498,32583991,33246527,1.0,4619,20,23,"""B""","""G""",0.4,0.609,0.384,0.674,false,32885726,32885727,"""1""","""1""",1
"""chr6""",131727479,131727481,35,0.257,0.464,131339161,132225418,1.0,974,12,23,"""A""","""E""",0.75,0.0,0.944,0.054,false,131727479,131727480,"""0""","""1""",1


## SNVs with unknown genotypes overlap some CpGs, and therefore DO need to be accounted for when identifying allele-specific CpGs

In [76]:
# CpG sites overlapping an SNV whose genotype is unknown in this sample:
# keep rows where at least one of the two alleles is "." (no call).
DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL.filter(
    (pl.col('allele_1') == '.') | 
    (pl.col('allele_2') == '.') 
)

chrom,start_cpg,end_cpg,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,cpg_is_within_50bp_of_mismatch_site,start_variant,end_variant,allele_1,allele_2,num_SNVs_overlapping_CG
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool,i64,i64,str,str,u32
"""chr21""",5011155,5011157,,,,,,,,,,,,,,,,false,5011156,5011157,""".""",""".""",1
"""chr21""",5044299,5044301,17,0.706,0.89,,,,,,,,,,,,,false,5044300,5044301,""".""",""".""",1
"""chr21""",5044860,5044862,17,0.647,0.703,,,,,,,,,,,,,false,5044861,5044862,""".""",""".""",1
"""chr21""",5044958,5044960,17,0.529,0.524,,,,,,,,,,,,,false,5044959,5044960,""".""",""".""",1
"""chr21""",5046750,5046752,16,0.75,0.896,,,,,,,,,,,,,false,5046750,5046751,""".""",""".""",1
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chr6""",168931567,168931569,39,0.231,0.502,168157897,169033370,1.0,1676,21,18,"""B""","""G""",0.429,0.0,0.676,0.046,false,168931568,168931569,""".""",""".""",2
"""chr6""",160622543,160622545,174,0.293,0.481,160260282,160944403,1.0,881,81,56,"""A""","""G""",0.42,0.0,0.498,0.05,false,160622543,160622544,""".""",""".""",1
"""chr6""",32553129,32553131,32,0.313,0.501,31645794,32555831,0.730584,1867,12,20,"""B""","""G""",0.833,0.0,0.92,0.043,false,32553129,32553130,""".""",""".""",1
"""chr6""",59087440,59087442,21,0.571,0.637,,,,,,,,,,,,,false,59087441,59087442,""".""",""".""",1


## CpG sites that each overlap a single SNV: When scanning for imprinting, exclude such CpG sites if they overlap heterozygous (but not homozygous) SNVs 

In [61]:
def add_locus_cpg(df):
    """Append a `locus_cpg` string column of the form "chrom:start_cpg-end_cpg".

    Parameters
    ----------
    df
        Frame with `chrom`, `start_cpg` and `end_cpg` columns.

    Returns
    -------
    The result of `df.with_columns(...)`: `df` plus the `locus_cpg` column.
    """
    locus_label = pl.format(
        "{}:{}-{}",
        pl.col("chrom"),
        pl.col("start_cpg"),
        pl.col("end_cpg"),
    ).alias("locus_cpg")
    return df.with_columns(locus_label)

def subset_cpgs_at_variants(df, allele_1, allele_2, num_SNVs_overlapping_CG):
    """Select rows with a given genotype and SNV count, then add the `locus_cpg` column.

    `allele_1` and `allele_2` are converted with `str()` before being compared with the
    `allele_1` / `allele_2` columns; `num_SNVs_overlapping_CG` is compared as given.
    Matching is exact and ordered: requesting (0, 1) does not select rows whose alleles
    are stored as ("1", "0").
    """
    first_allele_matches = pl.col('allele_1') == str(allele_1)
    second_allele_matches = pl.col('allele_2') == str(allele_2)
    snv_count_matches = pl.col('num_SNVs_overlapping_CG') == num_SNVs_overlapping_CG

    matching_rows = df.filter(
        first_allele_matches & second_allele_matches & snv_count_matches
    )
    return add_locus_cpg(matching_rows)

### There are many CpG sites that overlap a single SNV 

In [77]:
print(f"Number of CpG sites that overlap a single SNV: {len(DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL.filter(pl.col('num_SNVs_overlapping_CG') == 1))}")

Number of CpG sites that overlap a single SNV: 1911602


### Homozygous CpG sites should be included in imprinting scans

In [78]:
# HOM REF 
# Both haplotypes must be CpG, by construction 
# Therefore both haplotypes must exhibit methylation 
# These sites could, in principle, be imprinted, in this particular sample, and therefore should be included in scans for imprinting, in this particular sample

subset_cpgs_at_variants(
    DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL, 
    allele_1=0, 
    allele_2=0,
    num_SNVs_overlapping_CG=1
).sample(5, seed=42)

chrom,start_cpg,end_cpg,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,cpg_is_within_50bp_of_mismatch_site,start_variant,end_variant,allele_1,allele_2,num_SNVs_overlapping_CG,locus_cpg
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool,i64,i64,str,str,u32,str
"""chr4""",188882646,188882648,39,0.872,0.965,188305540.0,189417551.0,1.0,2088.0,17.0,22.0,"""A""","""I""",0.882,0.864,0.966,0.965,False,188882646,188882647,"""0""","""0""",1,"""chr4:188882646-188882648"""
"""chr11""",56751292,56751294,47,0.723,0.962,54393461.0,58007713.0,0.999864,7377.0,30.0,17.0,"""B""","""G""",0.667,0.824,0.954,0.962,False,56751292,56751293,"""0""","""0""",1,"""chr11:56751292-56751294"""
"""chr6""",123270278,123270280,38,0.711,0.875,123049804.0,123609770.0,1.0,362.0,21.0,17.0,"""B""","""E""",0.667,0.765,0.875,0.902,False,123270278,123270279,"""0""","""0""",1,"""chr6:123270278-123270280"""
"""chr15""",84381024,84381026,32,0.625,0.902,84358263.0,84997487.0,1.0,544.0,17.0,14.0,"""B""","""K""",0.529,0.714,0.84,0.896,False,84381024,84381025,"""0""","""0""",1,"""chr15:84381024-84381026"""
"""chr4""",117704697,117704699,35,0.743,0.949,,,,,,,,,,,,,False,117704697,117704698,"""0""","""0""",1,"""chr4:117704697-117704699"""


In [None]:
# Hom ALT sites fall into two classes: either the site is CpG (creation) on both haplotypes, or not CpG on both haplotypes (destruction)
# CpG creation sites have methylation and could be imprinted; These should be included in scans for imprinting 
# CpG destruction sites do not have methylation and therefore are ascribed "None" for their methylation values 
# Including these sites in imprinting scans doesn't hurt: In Polars, the result of any arithmetic operation where one or both operands are null is always null

In [79]:
subset_cpgs_at_variants(
    DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL, 
    allele_1=1, 
    allele_2=1,
    num_SNVs_overlapping_CG=1
).sample(10, seed=42)

chrom,start_cpg,end_cpg,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,cpg_is_within_50bp_of_mismatch_site,start_variant,end_variant,allele_1,allele_2,num_SNVs_overlapping_CG,locus_cpg
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool,i64,i64,str,str,u32,str
"""chr4""",81764057,81764059,32.0,0.906,0.969,81542850.0,81830333.0,1.0,169.0,18.0,14.0,"""B""","""I""",0.889,0.929,0.966,0.966,False,81764058,81764059,"""1""","""1""",1,"""chr4:81764057-81764059"""
"""chr11""",64003406,64003408,32.0,0.688,0.857,,,,,,,,,,,,,False,64003406,64003407,"""1""","""1""",1,"""chr11:64003406-64003408"""
"""chr6""",57394856,57394858,38.0,0.711,0.948,57226368.0,57492542.0,1.0,235.0,15.0,23.0,"""B""","""E""",0.733,0.696,0.922,0.938,False,57394856,57394857,"""1""","""1""",1,"""chr6:57394856-57394858"""
"""chr15""",54770800,54770802,41.0,0.927,0.968,,,,,,,,,,,,,False,54770800,54770801,"""1""","""1""",1,"""chr15:54770800-54770802"""
"""chr4""",189308105,189308107,,,,,,,,,,,,,,,,False,189308105,189308106,"""1""","""1""",1,"""chr4:189308105-189308107"""
"""chr13""",21605644,21605646,28.0,0.714,0.928,20444541.0,21665585.0,1.0,1450.0,14.0,14.0,"""B""","""E""",0.857,0.571,0.948,0.89,False,21605644,21605645,"""1""","""1""",1,"""chr13:21605644-21605646"""
"""chr14""",61320257,61320259,,,,,,,,,,,,,,,,False,61320257,61320258,"""1""","""1""",1,"""chr14:61320257-61320259"""
"""chr13""",46854111,46854113,35.0,0.743,0.939,46479914.0,46859362.0,1.0,304.0,20.0,15.0,"""B""","""E""",0.75,0.733,0.944,0.92,False,46854112,46854113,"""1""","""1""",1,"""chr13:46854111-46854113"""
"""chr10""",39516350,39516352,22.0,0.682,0.931,39501966.0,39584573.0,1.0,8.0,12.0,10.0,"""A""","""E""",0.667,0.7,0.8,0.94,False,39516350,39516351,"""1""","""1""",1,"""chr10:39516350-39516352"""
"""chr12""",119625018,119625020,36.0,0.778,0.893,118866000.0,119970047.0,1.0,944.0,17.0,19.0,"""A""","""E""",0.647,0.895,0.784,0.932,False,119625019,119625020,"""1""","""1""",1,"""chr12:119625018-119625020"""


In [80]:
subset_cpgs_at_variants(
    DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL, 
    allele_1=2, 
    allele_2=2,
    num_SNVs_overlapping_CG=1
).sample(5, seed=42)

chrom,start_cpg,end_cpg,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,cpg_is_within_50bp_of_mismatch_site,start_variant,end_variant,allele_1,allele_2,num_SNVs_overlapping_CG,locus_cpg
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool,i64,i64,str,str,u32,str
"""chr4""",138961732,138961734,32.0,0.75,0.914,138868819.0,140229046.0,1.0,1142.0,19.0,13.0,"""B""","""G""",0.737,0.769,0.931,0.864,False,138961733,138961734,"""2""","""2""",1,"""chr4:138961732-138961734"""
"""chr11""",25626615,25626617,25.0,0.8,0.897,24796437.0,25717239.0,1.0,1434.0,14.0,11.0,"""B""","""K""",0.786,0.818,0.882,0.92,False,25626616,25626617,"""2""","""2""",1,"""chr11:25626615-25626617"""
"""chr6""",29944714,29944716,,,,,,,,,,,,,,,,False,29944715,29944716,"""2""","""2""",1,"""chr6:29944714-29944716"""
"""chr8""",68227595,68227597,,,,,,,,,,,,,,,,False,68227595,68227596,"""2""","""2""",1,"""chr8:68227595-68227597"""
"""chr7""",61974050,61974052,,,,,,,,,,,,,,,,False,61974051,61974052,"""2""","""2""",1,"""chr7:61974050-61974052"""


### Heterozygous CpG sites should be excluded from imprinting scans

In [81]:
# [ALT=1] cpg sites harboring an ALT allele on one haplotype, create or destroy a cpg site on that haplotype
# This could potentially lead to false calls of imprinting (subject to depth constraints), and therefore such sites should be excluded in scans for imprinting 

subset_cpgs_at_variants(
    DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL, 
    allele_1=0, 
    allele_2=1,
    num_SNVs_overlapping_CG=1
).sample(5, seed=42)

chrom,start_cpg,end_cpg,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,cpg_is_within_50bp_of_mismatch_site,start_variant,end_variant,allele_1,allele_2,num_SNVs_overlapping_CG,locus_cpg
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool,i64,i64,str,str,u32,str
"""chr4""",79875517,79875519,40,0.5,0.548,79704587,80068168,1.0,351,17,23,"""B""","""I""",0.0,0.87,0.058,0.94,False,79875518,79875519,"""0""","""1""",1,"""chr4:79875517-79875519"""
"""chr11""",126432963,126432965,31,0.258,0.401,125243329,128655985,1.0,3449,12,19,"""A""","""K""",0.667,0.0,0.88,0.059,False,126432964,126432965,"""0""","""1""",1,"""chr11:126432963-126432965"""
"""chr6""",161815756,161815758,39,0.205,0.177,161128224,162120673,1.0,1401,19,20,"""A""","""G""",0.0,0.4,0.054,0.332,False,161815757,161815758,"""0""","""1""",1,"""chr6:161815756-161815758"""
"""chr15""",52926593,52926595,37,0.405,0.614,52606318,53156299,1.0,460,19,18,"""B""","""G""",0.789,0.0,0.931,0.065,False,52926593,52926594,"""0""","""1""",1,"""chr15:52926593-52926595"""
"""chr4""",35553349,35553351,31,0.387,0.504,34669849,36156354,1.0,2285,15,16,"""B""","""G""",0.0,0.75,0.059,0.897,False,35553349,35553350,"""0""","""1""",1,"""chr4:35553349-35553351"""


In [82]:
# [ALT=2] cpg sites harboring an ALT allele on one haplotype, create or destroy a cpg site on that haplotype
# This could potentially lead to false calls of imprinting (subject to depth constraints), and therefore such sites should be excluded in scans for imprinting 

subset_cpgs_at_variants(
    DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL, 
    allele_1=0, 
    allele_2=2,
    num_SNVs_overlapping_CG=1
).sample(5, seed=42)

chrom,start_cpg,end_cpg,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,cpg_is_within_50bp_of_mismatch_site,start_variant,end_variant,allele_1,allele_2,num_SNVs_overlapping_CG,locus_cpg
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool,i64,i64,str,str,u32,str
"""chr4""",157524206,157524208,,,,,,,,,,,,,,,,False,157524206,157524207,"""0""","""2""",1,"""chr4:157524206-157524208"""
"""chrX""",61797923,61797925,,,,,,,,,,,,,,,,False,61797923,61797924,"""0""","""2""",1,"""chrX:61797923-61797925"""
"""chr6""",32705697,32705699,36.0,0.444,0.603,32583991.0,33246527.0,1.0,4619.0,15.0,21.0,"""B""","""G""",0.0,0.762,0.086,0.865,False,32705697,32705698,"""0""","""2""",1,"""chr6:32705697-32705699"""
"""chr15""",101762500,101762502,33.0,0.515,0.575,101107421.0,101881930.0,1.0,1006.0,20.0,13.0,"""A""","""I""",0.5,0.538,0.531,0.645,False,101762500,101762501,"""0""","""2""",1,"""chr15:101762500-101762502"""
"""chr4""",119087126,119087128,29.0,0.379,0.529,119017009.0,119094285.0,1.0,198.0,14.0,15.0,"""B""","""G""",0.0,0.733,0.041,0.9,False,119087126,119087127,"""0""","""2""",1,"""chr4:119087126-119087128"""


In [83]:
# [ALT=1,2]
# Either site is CpG in reference or not 
# If YES, then it is CpG on neither haplotype, and is therefore ascribed None as methylation
# Such sites cannot be imprinted, and therefore may be excluded from imprinting scans
# If NO, then it must be CpG in one haplotype, and not in the other
# The corresponding methylation levels will be a FLOAT > 0 and FLOAT = 0.0, yielding a false imprinting call 
# Therefore such sites must be excluded from imprinting scans

subset_cpgs_at_variants(
    DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL, 
    allele_1=1, 
    allele_2=2,
    num_SNVs_overlapping_CG=1
).sample(10, seed=42)

chrom,start_cpg,end_cpg,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,cpg_is_within_50bp_of_mismatch_site,start_variant,end_variant,allele_1,allele_2,num_SNVs_overlapping_CG,locus_cpg
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool,i64,i64,str,str,u32,str
"""chr4""",188007969,188007971,23.0,0.217,0.274,187828524.0,188110779.0,1.0,363.0,13.0,10.0,"""A""","""I""",0.0,0.5,0.06,0.564,False,188007970,188007971,"""1""","""2""",1,"""chr4:188007969-188007971"""
"""chr11""",118808702,118808704,18.0,0.444,0.538,118482697.0,120138600.0,1.0,1446.0,11.0,,"""A""","""K""",0.727,,0.885,,False,118808702,118808703,"""1""","""2""",1,"""chr11:118808702-118808704"""
"""chr6""",67239974,67239976,33.0,0.212,0.108,66847728.0,67294176.0,1.0,883.0,16.0,17.0,"""B""","""E""",0.0,0.412,0.055,0.352,False,67239975,67239976,"""1""","""2""",1,"""chr6:67239974-67239976"""
"""chr2""",95943461,95943463,198.0,0.48,0.514,95714505.0,96238889.0,0.54386,228.0,26.0,172.0,"""B""","""K""",0.0,0.552,0.061,0.549,False,95943461,95943462,"""1""","""2""",1,"""chr2:95943461-95943463"""
"""chr4""",40056441,40056443,21.0,0.048,0.033,39424788.0,40743633.0,1.0,1378.0,,12.0,"""B""","""K""",,0.0,,0.037,False,40056442,40056443,"""1""","""2""",1,"""chr4:40056441-40056443"""
"""chr13""",65440211,65440213,,,,,,,,,,,,,,,,False,65440212,65440213,"""1""","""2""",1,"""chr13:65440211-65440213"""
"""chr16""",22596132,22596134,,,,,,,,,,,,,,,,False,22596133,22596134,"""1""","""2""",1,"""chr16:22596132-22596134"""
"""chr13""",113876467,113876469,32.0,0.531,0.627,113654599.0,114345091.0,1.0,727.0,21.0,11.0,"""A""","""E""",0.524,0.545,0.634,0.755,False,113876468,113876469,"""1""","""2""",1,"""chr13:113876467-113876469"""
"""chrX""",59550639,59550641,,,,,,,,,,,,,,,,False,59550639,59550640,"""1""","""2""",1,"""chrX:59550639-59550641"""
"""chr12""",8863260,8863262,26.0,0.5,0.507,8508604.0,8893292.0,0.865285,193.0,11.0,15.0,"""B""","""G""",0.0,0.867,0.058,0.96,False,8863261,8863262,"""1""","""2""",1,"""chr12:8863260-8863262"""


## CpG sites that each overlap 2 SNVs: When scanning for imprinting, exclude these sites if at least one of the SNVs is heterozygous 

### There are very few CpG sites that overlap 2 SNVs 

In [84]:
# CpG sites that overlap 2 SNVs are rare among the full set of CpG sites in the reference (and sample) genome:
print(f"number of CpGs that overlap 2 SNVs: {len(DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL.filter((pl.col('num_SNVs_overlapping_CG') == 2)))}")

number of CpGs that overlap 2 SNVs: 57164


In [85]:
# Since CpG sites are 2 bases long, it is impossible for a CpG site to overlap more than 2 SNVs: 
print(f"number of CpGs that overlap more than 2 SNVs: {len(DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL.filter((pl.col('num_SNVs_overlapping_CG') > 2)))}")

number of CpGs that overlap more than 2 SNVs: 0


### CpG sites in which at least one of the 2 overlapping SNVs is heterozygous should be excluded prior to scanning for imprinting

In [87]:
# LOGIC: 
# 1. If the two SNVs are each homozygous, then the haplotypes are the same (either CpG or not), and therefore it is impossible to generate a false example of imprinting. 
# 2. If one SNV is homozygous and the other heterozygous, then the haplotypes are different. 
#   i. If one of those haplotypes is CpG, false imprinting is possible. 
#   ii. If neither haplotype is CpG, then throwing it out doesn't matter. 
# 3. If both SNVs are heterozygous, then there are two ways that the ALT alleles could segregate among the haplotypes: 
#   i. One ALT is on hap1; the second ALT is on hap2. 
#   ii. Both ALTs are on the same haplotype. 
#    Either way, the haplotypes are different, and we are back to case 2. 

def sample_cpgs_at_double_variants(df, sample_size, seed):
    """Sample CpG sites that overlap exactly 2 SNVs and return every row of `df` at those sites.

    Only CpG sites whose count-based methylation is non-null on both the paternal and
    the maternal haplotype are candidates. Sampling is done over distinct
    (chrom, start_cpg, end_cpg) coordinates, and the sampled coordinates are then joined
    back to `df` so that all rows sharing a sampled coordinate are returned together.

    Parameters
    ----------
    df : polars DataFrame
        Must contain `chrom`, `start_cpg`, `end_cpg`, `num_SNVs_overlapping_CG`,
        `methylation_level_pat_count` and `methylation_level_mat_count`.
    sample_size : int
        Number of distinct CpG sites to draw. If fewer candidate sites exist, all of
        them are returned instead of raising.
    seed : int
        Seed passed to `DataFrame.sample` for reproducibility.

    Returns
    -------
    polars DataFrame
        Rows of `df` at the sampled sites, sorted by coordinate, with a `locus_cpg`
        column added.

    Notes
    -----
    Side effect: the global polars display limit is raised to `2 * sample_size` rows
    (`pl.Config.set_tbl_rows`) so that the returned table is shown in full. The setting
    persists for subsequent cells.
    """
    number_variants = 2
    coord_cols = ['chrom', 'start_cpg', 'end_cpg']

    # 1. Filter to find the valid rows
    # 2. Select ONLY the coord cols and get UNIQUE combinations
    #    (sorted, because `unique()` does not guarantee row order and the seeded
    #    sample below must see a deterministic ordering)
    candidate_coordinates = (
        df
        .filter(pl.col('num_SNVs_overlapping_CG') == number_variants)
        .filter(
            pl.col('methylation_level_pat_count').is_not_null() &
            pl.col('methylation_level_mat_count').is_not_null()
        )
        .select(coord_cols)
        .unique()
        .sort(coord_cols)
    )

    # 3. Sample from those unique records.
    #    Sampling without replacement raises if more rows are requested than exist,
    #    so cap the request at the number of candidate sites.
    num_sites_to_sample = min(sample_size, candidate_coordinates.height)
    sample_coordinates = candidate_coordinates.sample(num_sites_to_sample, seed=seed)

    # 4. Join the distinct coordinates back to the full dataframe
    result = df.join(
        sample_coordinates,
        on=coord_cols,
        how='inner'
    ).sort(coord_cols)

    # Make sure the whole result is displayed when it is the last expression of a cell.
    pl.Config.set_tbl_rows(number_variants*sample_size)

    return add_locus_cpg(result)

# Visual inspection of many loci in IGV confirmed the LOGIC presented above
sample_cpgs_at_double_variants(DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL, sample_size=10, seed=39)

chrom,start_cpg,end_cpg,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,cpg_is_within_50bp_of_mismatch_site,start_variant,end_variant,allele_1,allele_2,num_SNVs_overlapping_CG,locus_cpg
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool,i64,i64,str,str,u32,str
"""chr1""",88688170,88688172,30,0.533,0.543,88523427,89081456,1.0,386,12,18,"""B""","""I""",0.0,0.889,0.057,0.968,False,88688170,88688171,"""0""","""0""",2,"""chr1:88688170-88688172"""
"""chr1""",88688170,88688172,30,0.533,0.543,88523427,89081456,1.0,386,12,18,"""B""","""I""",0.0,0.889,0.057,0.968,False,88688171,88688172,"""1""","""0""",2,"""chr1:88688170-88688172"""
"""chr1""",121751074,121751076,118,0.492,0.568,121168799,121775150,0.824675,308,24,14,"""B""","""I""",0.667,0.143,0.918,0.137,False,121751074,121751075,"""0""","""0""",2,"""chr1:121751074-121751076"""
"""chr1""",121751074,121751076,118,0.492,0.568,121168799,121775150,0.824675,308,24,14,"""B""","""I""",0.667,0.143,0.918,0.137,False,121751075,121751076,"""0""","""0""",2,"""chr1:121751074-121751076"""
"""chr10""",128545619,128545621,56,0.304,0.472,128352302,128583679,0.704846,227,23,33,"""B""","""E""",0.739,0.0,0.949,0.049,True,128545619,128545620,"""1""","""1""",2,"""chr10:128545619-128545621"""
"""chr10""",128545619,128545621,56,0.304,0.472,128352302,128583679,0.704846,227,23,33,"""B""","""E""",0.739,0.0,0.949,0.049,True,128545620,128545621,"""0""","""1""",2,"""chr10:128545619-128545621"""
"""chr11""",24829345,24829347,34,0.206,0.177,24796437,25717239,1.0,1434,15,19,"""B""","""K""",0.333,0.105,0.362,0.057,False,24829345,24829346,"""1""","""1""",2,"""chr11:24829345-24829347"""
"""chr11""",24829345,24829347,34,0.206,0.177,24796437,25717239,1.0,1434,15,19,"""B""","""K""",0.333,0.105,0.362,0.057,False,24829346,24829347,"""0""","""0""",2,"""chr11:24829345-24829347"""
"""chr11""",54582760,54582762,34,0.382,0.48,54393461,58007713,0.999864,7377,19,15,"""B""","""G""",0.684,0.0,0.857,0.051,False,54582760,54582761,"""1""","""0""",2,"""chr11:54582760-54582762"""
"""chr11""",54582760,54582762,34,0.382,0.48,54393461,58007713,0.999864,7377,19,15,"""B""","""G""",0.684,0.0,0.857,0.051,False,54582761,54582762,"""0""","""0""",2,"""chr11:54582760-54582762"""


In [88]:
# the identity of the ALT allele doesn't matter to the logic above: 
subset_cpgs_at_variants(
    DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL, 
    allele_1=0, 
    allele_2=2,
    num_SNVs_overlapping_CG=2
)
# Note that those CpGs that have non-zero count-based methylation on both haplotypes should probably have zero methylation on one of them (based on IGV inspection)

chrom,start_cpg,end_cpg,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,cpg_is_within_50bp_of_mismatch_site,start_variant,end_variant,allele_1,allele_2,num_SNVs_overlapping_CG,locus_cpg
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool,i64,i64,str,str,u32,str
"""chr1""",34187773,34187775,48,0.25,0.392,33202440,34366901,1.0,1244,23,25,"""B""","""K""",0.522,0.0,0.59,0.048,false,34187774,34187775,"""0""","""2""",2,"""chr1:34187773-34187775"""
"""chrX""",61897409,61897411,32,0.406,0.511,,,,,,,,,,,,,false,61897409,61897410,"""0""","""2""",2,"""chrX:61897409-61897411"""
"""chr17""",22749682,22749684,49,0.531,0.545,22042501,22763578,0.593607,657,17,26,"""A""","""I""",0.059,0.885,0.082,0.943,false,22749683,22749684,"""0""","""2""",2,"""chr17:22749682-22749684"""
"""chr3""",195713096,195713098,52,0.538,0.752,195598494,196879669,0.968792,1474,28,24,"""B""","""I""",0.429,0.667,0.666,0.833,false,195713097,195713098,"""0""","""2""",2,"""chr3:195713096-195713098"""
"""chr3""",195714190,195714192,52,0.327,0.534,195598494,196879669,0.968792,1474,28,24,"""B""","""I""",0.214,0.458,0.557,0.511,false,195714190,195714191,"""0""","""2""",2,"""chr3:195714190-195714192"""
"""chr18""",17840413,17840415,,,,,,,,,,,,,,,,false,17840414,17840415,"""0""","""2""",2,"""chr18:17840413-17840415"""
"""chr18""",20510374,20510376,,,,,,,,,,,,,,,,false,20510374,20510375,"""0""","""2""",2,"""chr18:20510374-20510376"""
"""chr18""",20805542,20805544,15,0.333,0.67,,,,,,,,,,,,,false,20805542,20805543,"""0""","""2""",2,"""chr18:20805542-20805544"""
"""chr5""",149655218,149655220,57,0.228,0.522,149626241,149690396,1.0,51,29,18,"""B""","""G""",0.0,0.722,0.142,0.849,false,149655219,149655220,"""0""","""2""",2,"""chr5:149655218-149655220"""
"""chr13""",113025954,113025956,,,,,,,,,,,,,,,,false,113025954,113025955,"""0""","""2""",2,"""chr13:113025954-113025956"""


## Label each unique CpG record with a flag indicating whether it is allele-specific

In [90]:
# Reload the helper module so that edits made to it on disk are picked up without a kernel restart
importlib.reload(expand_to_all_cpgs)
from expand_to_all_cpgs import label_cpgs_as_allele_specific

DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_ALLELE_SPECIFIC_FLAG = label_cpgs_as_allele_specific(DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_VARIANT_LABEL) 
# `cpg_overlaps_at_least_one_snv` is a boolean column, so it is used directly as the filter predicate
DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_ALLELE_SPECIFIC_FLAG.filter(
    pl.col('cpg_overlaps_at_least_one_snv')
)

chrom,start_cpg,end_cpg,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,cpg_is_within_50bp_of_mismatch_site,cpg_overlaps_at_least_one_snv,snv_genotypes,cpg_is_allele_specific
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool,bool,str,bool
"""chr1""",10622,10624,15,0.8,0.919,,,,,,,,,,,,,false,true,"""hom""",false
"""chr1""",10748,10750,15,0.6,0.534,,,,,,,,,,,,,false,true,"""hom""",false
"""chr1""",10925,10927,14,0.786,0.949,,,,,,,,,,,,,false,true,"""hom""",false
"""chr1""",10930,10932,,,,,,,,,,,,,,,,false,true,"""hom""",false
"""chr1""",10933,10935,,,,,,,,,,,,,,,,false,true,"""hom""",false
"""chr1""",11001,11003,15,0.667,0.721,,,,,,,,,,,,,false,true,"""hom""",false
"""chr1""",11112,11114,15,0.733,0.905,,,,,,,,,,,,,false,true,"""hom""",false
"""chr1""",11153,11155,15,0.8,0.934,,,,,,,,,,,,,false,true,"""hom""",false
"""chr1""",11166,11168,,,,,,,,,,,,,,,,false,true,"""hom""",false
"""chr1""",11407,11409,15,0.933,0.925,,,,,,,,,,,,,false,true,"""hom""",false


### Sanity checking 

In [91]:
# CGs that overlap 1 SNV that is het indeed have zero methylation on one haplotype, and can be flagged for exclusion in imprinting scans: 
DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_ALLELE_SPECIFIC_FLAG.filter(pl.col("snv_genotypes") == "het").sample(10, seed=42)

chrom,start_cpg,end_cpg,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,cpg_is_within_50bp_of_mismatch_site,cpg_overlaps_at_least_one_snv,snv_genotypes,cpg_is_allele_specific
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool,bool,str,bool
"""chr6""",169617009,169617011,31,0.452,0.536,169325654.0,169668622.0,1.0,476.0,16.0,15.0,"""B""","""G""",0.875,0.0,0.949,0.059,False,True,"""het""",True
"""chr16""",1801232,1801234,30,0.367,0.497,1654066.0,2247481.0,1.0,467.0,16.0,14.0,"""B""","""G""",0.0,0.786,0.055,0.963,False,True,"""het""",True
"""chrX""",61881002,61881004,16,0.25,0.102,,,,,,,,,,,,,False,True,"""het""",True
"""chr5""",477574,477576,19,0.211,0.386,28950.0,919382.0,0.707395,933.0,,13.0,"""A""","""E""",,0.0,,0.061,False,True,"""het""",True
"""chr6""",109558960,109558962,33,0.364,0.465,109450752.0,109792736.0,1.0,466.0,18.0,15.0,"""B""","""E""",0.0,0.8,0.056,0.949,False,True,"""het""",True
"""chr3""",34835819,34835821,39,0.308,0.473,34361702.0,34844223.0,1.0,376.0,14.0,25.0,"""A""","""E""",0.857,0.0,0.93,0.053,False,True,"""het""",True
"""chr10""",127754767,127754769,33,0.212,0.395,127673489.0,127991671.0,1.0,435.0,12.0,21.0,"""B""","""E""",0.583,0.0,0.837,0.057,False,True,"""het""",True
"""chr3""",93595095,93595097,18,0.667,0.629,,,,,,,,,,,,,False,True,"""het""",True
"""chr12""",100885932,100885934,35,0.314,0.419,100635594.0,101104611.0,1.0,359.0,18.0,17.0,"""B""","""E""",0.0,0.647,0.067,0.902,False,True,"""het""",True
"""chr9""",32802823,32802825,37,0.297,0.374,32757969.0,32947120.0,1.0,403.0,20.0,17.0,"""A""","""K""",0.0,0.647,0.05,0.773,False,True,"""het""",True


In [92]:
# CGs that overlap 2 SNVs, the first of which is het, indeed have zero methylation on one haplotype, and can be flagged for exclusion in imprinting scans: 
DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_ALLELE_SPECIFIC_FLAG.filter(pl.col("snv_genotypes").str.contains("het,")).sample(10, seed=42)

chrom,start_cpg,end_cpg,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,cpg_is_within_50bp_of_mismatch_site,cpg_overlaps_at_least_one_snv,snv_genotypes,cpg_is_allele_specific
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool,bool,str,bool
"""chr7""",32465769,32465771,36.0,0.556,0.626,31976778.0,34723516.0,1.0,2754.0,26.0,10.0,"""A""","""G""",0.769,0.0,0.926,0.062,False,True,"""het,hom""",True
"""chr16""",8651112,8651114,,,,,,,,,,,,,,,,False,True,"""het,hom""",True
"""chrX""",61823147,61823149,23.0,0.043,0.065,,,,,,,,,,,,,False,True,"""het,hom""",True
"""chr5""",32641130,32641132,33.0,0.364,0.503,32595784.0,32994143.0,1.0,375.0,13.0,20.0,"""A""","""G""",0.923,0.0,0.964,0.05,False,True,"""het,hom""",True
"""chr6""",161491531,161491533,34.0,0.206,0.443,161128224.0,162120673.0,1.0,1401.0,17.0,17.0,"""A""","""G""",0.0,0.412,0.058,0.86,False,True,"""het,hom""",True
"""chr3""",61065119,61065121,45.0,0.444,0.647,61014799.0,62147918.0,1.0,1129.0,26.0,19.0,"""A""","""G""",0.769,0.0,0.93,0.085,False,True,"""het,het""",True
"""chr11""",23447168,23447170,38.0,0.368,0.49,23015746.0,23520155.0,1.0,974.0,20.0,18.0,"""B""","""K""",0.0,0.778,0.055,0.911,False,True,"""het,hom""",True
"""chr3""",130224669,130224671,28.0,0.464,0.502,130072561.0,130416231.0,1.0,447.0,19.0,,"""A""","""G""",0.684,,0.903,,False,True,"""het,hom""",True
"""chr12""",131129363,131129365,35.0,0.429,0.489,130655180.0,131486004.0,1.0,1384.0,17.0,18.0,"""A""","""G""",0.0,0.833,0.035,0.905,False,True,"""het,hom""",True
"""chr9""",67740445,67740447,49.0,0.367,0.724,,,,,,,,,,,,,False,True,"""het,het""",True


In [93]:
# CGs that overlap 2 SNVs, the second of which is het, indeed have zero methylation on one haplotype, and can be flagged for exclusion in imprinting scans: 
DF_METH_FOUNDER_PHASED_ALL_CPGS_WITH_ALLELE_SPECIFIC_FLAG.filter(pl.col("snv_genotypes").str.contains(",het")).sample(10, seed=42)

chrom,start_cpg,end_cpg,total_read_count,methylation_level_count,methylation_level_model,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model,cpg_is_within_50bp_of_mismatch_site,cpg_overlaps_at_least_one_snv,snv_genotypes,cpg_is_allele_specific
str,i64,i64,i64,f64,f64,i64,i64,f64,i64,i64,i64,str,str,f64,f64,f64,f64,bool,bool,str,bool
"""chr7""",43153695,43153697,41.0,0.317,0.477,42870524.0,44426720.0,1.0,1816.0,22.0,19.0,"""A""","""G""",0.0,0.684,0.042,0.918,False,True,"""hom,het""",True
"""chr16""",26814110,26814112,24.0,0.292,0.513,26745515.0,27313943.0,1.0,608.0,,15.0,"""B""","""E""",,0.0,,0.048,False,True,"""het,het""",True
"""chrX""",62112714,62112716,22.0,0.364,0.526,,,,,,,,,,,,,False,True,"""hom,het""",True
"""chr5""",22452016,22452018,34.0,0.353,0.473,21791932.0,22895414.0,0.986607,896.0,18.0,16.0,"""A""","""G""",0.667,0.0,0.835,0.075,False,True,"""hom,het""",True
"""chr6""",166784388,166784390,36.0,0.5,0.599,166441808.0,166805274.0,1.0,546.0,17.0,19.0,"""B""","""G""",0.0,0.947,0.044,0.96,False,True,"""het,het""",True
"""chr3""",50084909,50084911,23.0,0.609,0.597,49575658.0,50489569.0,1.0,505.0,15.0,,"""A""","""G""",0.933,,0.958,,False,True,"""hom,het""",True
"""chr11""",39140731,39140733,,,,,,,,,,,,,,,,False,True,"""het,het""",True
"""chr3""",114615629,114615631,44.0,0.432,0.508,114152262.0,115231159.0,1.0,653.0,17.0,27.0,"""A""","""G""",0.0,0.704,0.059,0.93,False,True,"""hom,het""",True
"""chr12""",132475216,132475218,11.0,0.364,0.076,,,,,,,,,,,,,False,True,""".,het""",True
"""chr9""",84092872,84092874,30.0,0.533,0.563,83118464.0,84472897.0,1.0,1203.0,13.0,17.0,"""B""","""I""",0.0,0.941,0.056,0.956,False,True,"""hom,het""",True
