### Setup

In [1]:
# Standard library
import importlib
import sys
from pathlib import Path

# Third-party
import polars as pl

# Sample ID used to build every per-sample input path in this notebook.
SAMPLE = '200081'

# Maximum number of rows polars draws when a DataFrame is displayed.
pl.Config.set_tbl_rows(25)

# Shared input directories.
READ_PHASED_DIR = Path('/scratch/ucgd/lustre-labs/quinlan/data-shared/read-backed-phasing')
IHT_PHASED_DIR = Path('/scratch/ucgd/lustre-labs/quinlan/data-shared/haplotype-maps/CEPH1463.GRCh38')
METH_COUNT_READ_PHASED_DIR = Path('/scratch/ucgd/lustre-labs/quinlan/data-shared/dna-methylation/CEPH1463.GRCh38.hifi.count.read-backed-phased')
METH_MODEL_READ_PHASED_DIR = Path('/scratch/ucgd/lustre-labs/quinlan/data-shared/dna-methylation/CEPH1463.GRCh38.hifi.model.read-backed-phased')

# Make modules under src/util importable for the cells below.
# Tip: hover over, e.g., "from shell import shell", etc., below, and choose "quick fix" to make pylance aware of this path
REPO_DIR = Path('/scratch/ucgd/lustre-labs/quinlan/u6018199/tapestry')
_util_dir = str(REPO_DIR / 'src/util')
if _util_dir not in sys.path:  # re-running this cell must not add duplicate entries
    sys.path.append(_util_dir)

In [2]:
# --- Read-backed phasing (hiphase) ---
# single-sample vcf from hiphase
VCF_READ_PHASED = f"{READ_PHASED_DIR}/{SAMPLE}.GRCh38.deepvariant.glnexus.phased.vcf.gz"
# single-sample tsv from hiphase
TSV_READ_PHASE_BLOCKS = f"{READ_PHASED_DIR}/{SAMPLE}.GRCh38.hiphase.blocks.tsv"

# --- Inheritance-based phasing (gtg-ped-map/gtg-concordance) ---
# joint-called multi-sample vcf
VCF_IHT_PHASED = f"{IHT_PHASED_DIR}/CEPH1463.GRCh38.pass.sorted.vcf.gz"
# multi-sample iht blocks file
TXT_IHT_BLOCKS = f"{IHT_PHASED_DIR}/CEPH1463.GRCh38.iht.sorted.txt"

# --- Per-haplotype methylation bed files from aligned_bam_to_cpg_scores ---
BED_METH_COUNT_HAP1 = f"{METH_COUNT_READ_PHASED_DIR}/{SAMPLE}.GRCh38.haplotagged.hap1.bed.gz"
BED_METH_COUNT_HAP2 = f"{METH_COUNT_READ_PHASED_DIR}/{SAMPLE}.GRCh38.haplotagged.hap2.bed.gz"
BED_METH_MODEL_HAP1 = f"{METH_MODEL_READ_PHASED_DIR}/{SAMPLE}.GRCh38.haplotagged.hap1.bed.gz"
BED_METH_MODEL_HAP2 = f"{METH_MODEL_READ_PHASED_DIR}/{SAMPLE}.GRCh38.haplotagged.hap2.bed.gz"

### Phase alleles at SNPs to read-backed haplotypes and founder haplotypes 

In [3]:
# Import the module under an alias: a later cell does
# `from get_all_phasing import get_all_phasing`, so binding the module to the bare
# name here would replace that function with the module whenever this cell is re-run.
import get_all_phasing as get_all_phasing_module
importlib.reload(get_all_phasing_module)  # pick up module edits without restarting the kernel
from get_all_phasing import get_read_phasing

# Load phasing from the single-sample hiphase VCF and display it.
DF_READ_PHASING = get_read_phasing(VCF_READ_PHASED)
DF_READ_PHASING

chrom,start,end,REF,ALT,phase_block_id,allele_hap1,allele_hap2
str,i64,i64,str,str,str,str,str
"""chr1""",11862,11863,"""C""","""A""","""11863""","""0""","""1"""
"""chr1""",11921,11922,"""T""","""A""","""11863""","""0""","""1"""
"""chr1""",15117,15118,"""A""","""G""","""11863""","""0""","""1"""
"""chr1""",15819,15820,"""G""","""T""","""11863""","""1""","""0"""
"""chr1""",16013,16014,"""C""","""T""","""11863""","""0""","""1"""
"""chr1""",16948,16949,"""A""","""C""","""11863""","""0""","""1"""
"""chr1""",17019,17020,"""G""","""A""","""11863""","""0""","""1"""
"""chr1""",17384,17385,"""G""","""A""","""11863""","""0""","""1"""
"""chr1""",18088,18089,"""G""","""T""","""11863""","""0""","""1"""
"""chr1""",19171,19172,"""A""","""G""","""11863""","""0""","""1"""


In [4]:
# Import the module under an alias: a later cell does
# `from get_all_phasing import get_all_phasing`, so binding the module to the bare
# name here would replace that function with the module whenever this cell is re-run.
import get_all_phasing as get_all_phasing_module
importlib.reload(get_all_phasing_module)  # pick up module edits without restarting the kernel
from get_all_phasing import get_read_phase_blocks

# Load the phase blocks from the single-sample hiphase TSV and display them.
DF_READ_PHASE_BLOCKS = get_read_phase_blocks(TSV_READ_PHASE_BLOCKS)
DF_READ_PHASE_BLOCKS

source_block_index,sample_name,phase_block_id,chrom,start,end,num_variants
i64,i64,str,str,i64,i64,i64
0,200081,"""11863""","""chr1""",11863,204487,479
1,200081,"""257716""","""chr1""",257716,292262,110
2,200081,"""350805""","""chr1""",350805,396627,205
4,200081,"""416412""","""chr1""",416412,433777,76
6,200081,"""492339""","""chr1""",492339,532812,56
8,200081,"""593123""","""chr1""",593123,1314109,1169
9,200081,"""1351126""","""chr1""",1351126,1382658,5
10,200081,"""1421668""","""chr1""",1421668,1427528,2
11,200081,"""1432961""","""chr1""",1432961,2931691,2168
12,200081,"""2961801""","""chr1""",2961801,4207029,2441


In [5]:
# Import the module under an alias: a later cell does
# `from get_all_phasing import get_all_phasing`, so binding the module to the bare
# name here would replace that function with the module whenever this cell is re-run.
import get_all_phasing as get_all_phasing_module
importlib.reload(get_all_phasing_module)  # pick up module edits without restarting the kernel
from get_all_phasing import get_iht_phasing

# Load this sample's phasing from the joint-called multi-sample VCF and display it.
DF_IHT_PHASING = get_iht_phasing(SAMPLE, VCF_IHT_PHASED)
DF_IHT_PHASING

chrom,start,end,REF,ALT,allele_pat,allele_mat
str,i64,i64,str,str,str,str
"""chr1""",497702,497703,"""T""","""C""","""1""","""0"""
"""chr1""",500803,500804,"""T""","""C""","""1""","""0"""
"""chr1""",502903,502904,"""T""","""C""","""1""","""0"""
"""chr1""",504316,504317,"""T""","""C""","""0""","""1"""
"""chr1""",505781,505782,"""C""","""T""","""0""","""1"""
"""chr1""",506629,506630,"""A""","""G""","""1""","""0"""
"""chr1""",512114,512115,"""A""","""G""","""0""","""1"""
"""chr1""",514052,514053,"""A""","""G""","""1""","""0"""
"""chr1""",515920,515921,"""C""","""T""","""1""","""0"""
"""chr1""",516229,516230,"""T""","""C""","""0""","""1"""


In [6]:
# Import the module under an alias: a later cell does
# `from get_all_phasing import get_all_phasing`, so binding the module to the bare
# name here would replace that function with the module whenever this cell is re-run.
import get_all_phasing as get_all_phasing_module
importlib.reload(get_all_phasing_module)  # pick up module edits without restarting the kernel
from get_all_phasing import get_iht_blocks

# Load this sample's blocks from the multi-sample iht blocks file and display them.
DF_IHT_BLOCKS = get_iht_blocks(SAMPLE, TXT_IHT_BLOCKS)
DF_IHT_BLOCKS

chrom,start,end,founder_label_pat,founder_label_mat
str,i64,i64,str,str
"""chr1""",13301,1610923,"""B""","""I"""
"""chr1""",1613420,3184712,"""B""","""I"""
"""chr1""",3184788,3200879,"""B""","""I"""
"""chr1""",3203589,3398794,"""B""","""I"""
"""chr1""",3399126,5038259,"""B""","""I"""
"""chr1""",5038659,5097605,"""B""","""I"""
"""chr1""",5097819,6082247,"""B""","""I"""
"""chr1""",6082326,9098705,"""B""","""I"""
"""chr1""",9098937,9424618,"""B""","""I"""
"""chr1""",9424760,10778498,"""B""","""I"""


In [7]:
# The module and its main function share the name `get_all_phasing`. Alias the module
# so the bare name only ever refers to the function and never flips to the module.
import get_all_phasing as get_all_phasing_module
importlib.reload(get_all_phasing_module)  # pick up module edits without restarting the kernel
from get_all_phasing import get_all_phasing

# Combine the four frames built in the cells above into one table.
# Note: there are no bit vectors prior to 500kb: 
# https://quinlangroup.slack.com/archives/C09027S4C5Q/p1750792033498849
DF_ALL_PHASING = get_all_phasing(
    DF_READ_PHASING,
    DF_READ_PHASE_BLOCKS,
    DF_IHT_PHASING,
    DF_IHT_BLOCKS,
)
DF_ALL_PHASING

chrom,start,end,REF,ALT,allele_hap1,allele_hap2,start_phase_block,end_phase_block,allele_pat,allele_mat,start_iht_block,end_iht_block,founder_label_pat_iht_block,founder_label_mat_iht_block
str,i64,i64,str,str,str,str,i64,i64,str,str,i64,i64,str,str
"""chr1""",497702,497703,"""T""","""C""","""0""","""1""",492339,532812,"""1""","""0""",13301,1610923,"""B""","""I"""
"""chr1""",500803,500804,"""T""","""C""","""0""","""1""",492339,532812,"""1""","""0""",13301,1610923,"""B""","""I"""
"""chr1""",502903,502904,"""T""","""C""","""0""","""1""",492339,532812,"""1""","""0""",13301,1610923,"""B""","""I"""
"""chr1""",504316,504317,"""T""","""C""","""1""","""0""",492339,532812,"""0""","""1""",13301,1610923,"""B""","""I"""
"""chr1""",505781,505782,"""C""","""T""","""1""","""0""",492339,532812,"""0""","""1""",13301,1610923,"""B""","""I"""
"""chr1""",506629,506630,"""A""","""G""","""0""","""1""",492339,532812,"""1""","""0""",13301,1610923,"""B""","""I"""
"""chr1""",512114,512115,"""A""","""G""","""1""","""0""",492339,532812,"""0""","""1""",13301,1610923,"""B""","""I"""
"""chr1""",514052,514053,"""A""","""G""","""0""","""1""",492339,532812,"""1""","""0""",13301,1610923,"""B""","""I"""
"""chr1""",515920,515921,"""C""","""T""","""0""","""1""",492339,532812,"""1""","""0""",13301,1610923,"""B""","""I"""
"""chr1""",516229,516230,"""T""","""C""","""1""","""0""",492339,532812,"""0""","""1""",13301,1610923,"""B""","""I"""


In [8]:
# Disabled exploration: select the rows of DF_ALL_PHASING in chr1:500,000-1,500,000 with bioframe.
# import bioframe as bf

# pl.DataFrame(bf.select(
#     df=DF_ALL_PHASING.to_pandas(), 
#     region="chr1:500,000-1,500,000",
#     cols=["chrom", "start", "end"]
# ))

### Construct a Hap Map, consisting of intervals in which read-backed haplotypes are mapped to founder haplotypes

In [9]:
# The module and its function share the name `get_hap_map`. Alias the module so the
# bare name only ever refers to the function and never flips to the module.
import get_hap_map as get_hap_map_module
importlib.reload(get_hap_map_module)  # pick up module edits without restarting the kernel
from get_hap_map import get_hap_map

# get_hap_map returns three frames; only the hap map itself is displayed here.
DF_HAP_MAP, DF_SITES, DF_SITES_MISMATCH = get_hap_map(DF_ALL_PHASING)
DF_HAP_MAP

chrom,start,end,paternal_haplotype,maternal_haplotype,haplotype_concordance,num_het_SNVs
str,i64,i64,str,str,f64,i64
"""chr1""",492339,532812,"""B_hap2""","""I_hap1""",1.0,16
"""chr1""",593123,1314109,"""B_hap2""","""I_hap1""",0.993119,436
"""chr1""",1351126,1382658,"""B_hap2""","""I_hap1""",1.0,1
"""chr1""",1421668,1427528,"""B_hap1""","""I_hap2""",1.0,1
"""chr1""",1432961,1610923,"""B_hap1""","""I_hap2""",1.0,28
"""chr1""",1613420,2931691,"""B_hap1""","""I_hap2""",1.0,1201
"""chr1""",2961801,3184712,"""B_hap2""","""I_hap1""",1.0,392
"""chr1""",3184788,3200879,"""B_hap2""","""I_hap1""",1.0,1
"""chr1""",3203589,3398794,"""B_hap2""","""I_hap1""",1.0,243
"""chr1""",3399126,4207029,"""B_hap2""","""I_hap1""",1.0,1114


In [10]:
# Hap-map intervals whose haplotype concordance is below 1.0.
DF_HAP_MAP.filter(pl.col("haplotype_concordance").lt(1.0))

chrom,start,end,paternal_haplotype,maternal_haplotype,haplotype_concordance,num_het_SNVs
str,i64,i64,str,str,f64,i64
"""chr1""",593123,1314109,"""B_hap2""","""I_hap1""",0.993119,436
"""chr1""",8329093,9098705,"""B_hap1""","""I_hap2""",0.718696,583
"""chr1""",13009397,13267330,"""B_hap1""","""K_hap2""",0.714286,7
"""chr1""",14551881,14943007,"""B_hap2""","""K_hap1""",0.514658,307
"""chr1""",18434710,19541226,"""B_hap1""","""K_hap2""",0.952234,1298
"""chr1""",25776419,26154619,"""B_hap2""","""K_hap1""",0.960265,151
"""chr1""",27783490,28585938,"""B_hap2""","""K_hap1""",0.996099,769
"""chr1""",31384925,31840697,"""B_hap2""","""K_hap1""",0.98806,335
"""chr1""",35192783,35589188,"""B_hap2""","""K_hap1""",0.705882,17
"""chr1""",35938634,36039515,"""B_hap1""","""K_hap2""",0.571429,7


In [11]:
# Display the third frame returned by get_hap_map(DF_ALL_PHASING).
# NOTE(review): presumably the sites where read-backed and inheritance-based phasing disagree — confirm in get_hap_map.
DF_SITES_MISMATCH

chrom,start,end,REF,ALT
str,i64,i64,str,str
"""chr5""",28950,28951,"""T""","""G"""
"""chr5""",29524,29525,"""C""","""T"""
"""chr5""",31108,31109,"""G""","""A"""
"""chr5""",32887,32888,"""G""","""A"""
"""chr5""",33800,33801,"""A""","""T"""
"""chr5""",34868,34869,"""A""","""G"""
"""chr5""",34959,34960,"""C""","""T"""
"""chr5""",34960,34961,"""G""","""A"""
"""chr5""",35001,35002,"""C""","""T"""
"""chr5""",36345,36346,"""T""","""C"""


### Get HiFi DNA methylation levels (both count-based and model-based) at CpG sites phased to hap1/hap2

In [15]:
# The module and its function share the name `get_meth_hap1_hap2`. Alias the module so
# the bare name only ever refers to the function and never flips to the module.
import get_meth_hap1_hap2 as get_meth_hap1_hap2_module
importlib.reload(get_meth_hap1_hap2_module)  # pick up module edits without restarting the kernel
from get_meth_hap1_hap2 import get_meth_hap1_hap2

# Same loader, run once per pb-CpG-tools mode, each with its own hap1/hap2 bed pair.
DF_METH_COUNT_HAP1_HAP2 = get_meth_hap1_hap2(
    pb_cpg_tool_mode='count',
    bed_hap1=BED_METH_COUNT_HAP1,
    bed_hap2=BED_METH_COUNT_HAP2,
)
DF_METH_MODEL_HAP1_HAP2 = get_meth_hap1_hap2(
    pb_cpg_tool_mode='model',
    bed_hap1=BED_METH_MODEL_HAP1,
    bed_hap2=BED_METH_MODEL_HAP2,
)

In [16]:
# Display the hap1/hap2 methylation table built with pb_cpg_tool_mode='count'.
DF_METH_COUNT_HAP1_HAP2

chrom,start,end,total_read_count_hap1,methylation_level_hap1,total_read_count_hap2,methylation_level_hap2
str,i64,i64,i64,f64,i64,f64
"""chr1""",14061,14062,,,10,0.4
"""chr1""",14178,14179,,,10,0.6
"""chr1""",14348,14349,,,11,0.727
"""chr1""",14353,14354,,,11,1.0
"""chr1""",14434,14435,,,11,0.727
"""chr1""",14468,14469,,,11,0.636
"""chr1""",14485,14486,,,11,0.909
"""chr1""",14520,14521,10,0.9,11,1.0
"""chr1""",14552,14553,10,0.7,11,0.909
"""chr1""",14588,14589,10,0.9,11,0.727


In [18]:
# Display the hap1/hap2 methylation table built with pb_cpg_tool_mode='model'.
DF_METH_MODEL_HAP1_HAP2

chrom,start,end,total_read_count_hap1,methylation_level_hap1,total_read_count_hap2,methylation_level_hap2
str,i64,i64,i64,f64,i64,f64
"""chr1""",14061,14062,,,10,0.211
"""chr1""",14178,14179,,,10,0.82
"""chr1""",14348,14349,,,11,0.95
"""chr1""",14353,14354,,,11,0.958
"""chr1""",14434,14435,,,11,0.953
"""chr1""",14468,14469,,,11,0.956
"""chr1""",14485,14486,,,11,0.963
"""chr1""",14520,14521,10,0.969,11,0.966
"""chr1""",14552,14553,10,0.93,11,0.952
"""chr1""",14588,14589,10,0.94,11,0.931


### Phase DNA methylation levels to founder haplotypes

In [19]:
# The module and its function share the name `phase_meth_to_founder_haps`. Alias the
# module so the bare name only ever refers to the function and never flips to the module.
import phase_meth_to_founder_haps as phase_meth_to_founder_haps_module
importlib.reload(phase_meth_to_founder_haps_module)  # pick up module edits without restarting the kernel
from phase_meth_to_founder_haps import phase_meth_to_founder_haps

# Apply the same hap map to the count-based and the model-based methylation tables.
DF_METH_COUNT_FOUNDER_PHASED = phase_meth_to_founder_haps(DF_METH_COUNT_HAP1_HAP2, DF_HAP_MAP)
DF_METH_MODEL_FOUNDER_PHASED = phase_meth_to_founder_haps(DF_METH_MODEL_HAP1_HAP2, DF_HAP_MAP)

In [20]:
# Total row count, then the first rows that fall inside a hap-map block.
print(DF_METH_COUNT_FOUNDER_PHASED.height)
(
    DF_METH_COUNT_FOUNDER_PHASED
    .filter(pl.col("start_hap_map_block").is_not_null())
    .head()
)

26729958


chrom,start,end,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,methylation_level_pat,methylation_level_mat,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat
str,i64,i64,i64,i64,f64,i64,f64,f64,f64,f64,str,str
"""chr1""",496845,496846,492339,532812,1.0,16,,0.7,,10.0,"""B""","""I"""
"""chr1""",496861,496862,492339,532812,1.0,16,,1.0,,10.0,"""B""","""I"""
"""chr1""",497014,497015,492339,532812,1.0,16,,0.818,,11.0,"""B""","""I"""
"""chr1""",497036,497037,492339,532812,1.0,16,,0.818,,11.0,"""B""","""I"""
"""chr1""",497039,497040,492339,532812,1.0,16,,0.909,,11.0,"""B""","""I"""


In [21]:
# Total row count, then the first rows that fall inside a hap-map block.
print(DF_METH_MODEL_FOUNDER_PHASED.height)
(
    DF_METH_MODEL_FOUNDER_PHASED
    .filter(pl.col("start_hap_map_block").is_not_null())
    .head()
)

26729958


chrom,start,end,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,methylation_level_pat,methylation_level_mat,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat
str,i64,i64,i64,i64,f64,i64,f64,f64,f64,f64,str,str
"""chr1""",496845,496846,492339,532812,1.0,16,,0.911,,10.0,"""B""","""I"""
"""chr1""",496861,496862,492339,532812,1.0,16,,0.962,,10.0,"""B""","""I"""
"""chr1""",497014,497015,492339,532812,1.0,16,,0.963,,11.0,"""B""","""I"""
"""chr1""",497036,497037,492339,532812,1.0,16,,0.961,,11.0,"""B""","""I"""
"""chr1""",497039,497040,492339,532812,1.0,16,,0.958,,11.0,"""B""","""I"""


### Combine count-based and model-based methylation levels 

In [22]:
# Import the module under an alias: an earlier cell binds the bare name
# `phase_meth_to_founder_haps` to the function, and a plain `import` here would
# overwrite that function with the module.
import phase_meth_to_founder_haps as phase_meth_to_founder_haps_module
importlib.reload(phase_meth_to_founder_haps_module)  # pick up module edits without restarting the kernel
from phase_meth_to_founder_haps import combine_count_and_model_based_methylation_levels

# Merge the count-based and model-based founder-phased tables into one frame.
DF_METH_FOUNDER_PHASED = combine_count_and_model_based_methylation_levels(
    DF_METH_COUNT_FOUNDER_PHASED,
    DF_METH_MODEL_FOUNDER_PHASED,
)
DF_METH_FOUNDER_PHASED

chrom,start,end,start_hap_map_block,end_hap_map_block,haplotype_concordance_in_hap_map_block,num_het_SNVs_in_hap_map_block,total_read_count_pat,total_read_count_mat,founder_haplotype_pat,founder_haplotype_mat,methylation_level_pat_count,methylation_level_mat_count,methylation_level_pat_model,methylation_level_mat_model
str,i64,i64,i64,i64,f64,i64,f64,f64,str,str,f64,f64,f64,f64
"""chr1""",14061,14062,,,,,,,,,,,,
"""chr1""",14178,14179,,,,,,,,,,,,
"""chr1""",14348,14349,,,,,,,,,,,,
"""chr1""",14353,14354,,,,,,,,,,,,
"""chr1""",14434,14435,,,,,,,,,,,,
"""chr1""",14468,14469,,,,,,,,,,,,
"""chr1""",14485,14486,,,,,,,,,,,,
"""chr1""",14520,14521,,,,,,,,,,,,
"""chr1""",14552,14553,,,,,,,,,,,,
"""chr1""",14588,14589,,,,,,,,,,,,


### Notes 


#### Why some SNPs can be read-phased, but not pedigree-phased

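See this Slack thread, which is also cited in the `get_all_phasing` cell above alongside the note that there are no bit vectors prior to 500kb: https://quinlangroup.slack.com/archives/C09027S4C5Q/p1750792033498849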