## Setup

In [1]:
import sys
from pathlib import Path

REFERENCE_GENOME = "hg38"

REPO_DIR = Path("/scratch/ucgd/lustre-labs/quinlan/u6018199/tapestry")
sys.path.append(str(REPO_DIR / "src" / "util"))
sys.path

['/usr/lib64/python311.zip',
 '/usr/lib64/python3.11',
 '/usr/lib64/python3.11/lib-dynload',
 '',
 '/scratch/ucgd/lustre-labs/quinlan/u6018199/tapestry/.venv/lib64/python3.11/site-packages',
 '/scratch/ucgd/lustre-labs/quinlan/u6018199/tapestry/.venv/lib64/python3.11/site-packages/cyvcf2-0.31.1-py3.11-linux-x86_64.egg',
 '/scratch/ucgd/lustre-labs/quinlan/u6018199/tapestry/.venv/lib/python3.11/site-packages',
 '/scratch/ucgd/lustre-labs/quinlan/u6018199/tapestry/.venv/lib/python3.11/site-packages/cyvcf2-0.31.1-py3.11-linux-x86_64.egg',
 '/scratch/ucgd/lustre-labs/quinlan/u6018199/tapestry/src/util']

## Palladium Trio

In [2]:
FATHER_UID = "NA12877" 
MOTHER_UID = "NA12878"
CHILD_UID = "NA12881" # 2211

## TR DNMs in child

In [3]:
import polars as pl

pl.Config.set_tbl_rows(10)

# Tom Sasani provided TR DNMs for the child 2211: 
# https://quinlangroup.slack.com/archives/D9LFRMXV3/p1753988258836809
CHILD_TR_DNMS = pl.read_csv("/scratch/ucgd/lustre-labs/quinlan/u1006375/CEPH-K1463-TandemRepeats/tr_validation/seq/2211.GRCh38.allele_sequences.tsv", separator="\t")

(
    CHILD_TR_DNMS
    .select(['#chrom', 'start', 'end', 'phase_consensus', 'haplotype_in_parent_consensus', 'denovo_allele_sequence', 'precursor_allele_length_in_parent', 'precursor_sequence_in_parent', 'struc'])
    .with_columns([
        (pl.col('denovo_allele_sequence').str.len_chars() - pl.col('precursor_sequence_in_parent').str.len_chars()).alias('allele_size_difference')
    ])
    .rename({
        '#chrom': 'chrom',
        'phase_consensus': 'parent-of-origin', 
        'haplotype_in_parent_consensus': 'haplotype-of-origin'
    })
)

chrom,start,end,parent-of-origin,haplotype-of-origin,denovo_allele_sequence,precursor_allele_length_in_parent,precursor_sequence_in_parent,struc,allele_size_difference
str,i64,i64,str,str,str,f64,str,str,u32
"""chr10""",101793189,101793216,"""mom:64""","""UNK""","""AAAAACAAAAACAAAAACAAAAACAAAAAC…",-1.0,"""UNK""","""(AAAAAC)n""",36
"""chr10""",30432978,30433009,"""dad:279""","""B""","""AGAGAGAGAGAGAGAGAGAGAGAGAGAGAG…",247.0,"""AGAAGAGAGAGAGAGAGAGAGAGGAGAGGA…","""(GA)n""",4
"""chr10""",30433093,30433105,"""unknown:0""","""UNK""","""GAGAGAGAGAGAGAGAGAGAGAGAGAGAGA…",-1.0,"""UNK""","""(AGGGGG)n""",233
"""chr10""",44662653,44663103,"""dad:445""","""B""","""AGGGGGCTGTGTCGTGGAGGGTGAGGATGG…",6050.0,"""AGGGGGCTGTGTCGTGGAGGGTGAGGATGG…","""(GGGGTCCTAGGCGGCTGTGTCATGGAGGG…",150
"""chr10""",66515265,66515370,"""mom:287""","""B""","""TATCTATCTATCTATCTATCTATCTATCTA…",129.0,"""TATCTATCTATCTATCTATCTATCTATCTA…","""(TA)n(TC)n(TA)n""",4294967284
…,…,…,…,…,…,…,…,…,…
"""chr9""",70469020,70469154,"""dad:144""","""B""","""TTTTCTTTTCTTCTCTTCTCTAATGACTTC…",134.0,"""TTTTCTTTTCTTCTCTTCTCTAATGACTTC…","""(TTTC)n(TC)n(TTTC)n""",109
"""chr9""",96272373,96272473,"""dad:289""","""B""","""TCTCTCTCTCTCTCTCTCTCTCTCTCTCTC…",120.0,"""TCTCTCTCTCTCTCTCTCTCTCTCTCTCTC…","""(TC)n(TA)n""",4294967290
"""chrX""",130516288,130516340,"""dad:123""","""UNK""","""AAATAAATAAATAAATAAATAAATAAATAA…",-1.0,"""UNK""","""(AAAT)n(GT)n""",53
"""chrX""",50967367,50967415,"""dad:52""","""UNK""","""ATATATATATATATATATATATATATATAT…",-1.0,"""UNK""","""(AT)n""",53


## Founder-phased DNA methylation at CpG sites common to Father, Mother, and Child in the Palladium trio

In [4]:
PB_CPG_TOOL_MODE = 'model' # mode of aligned_bam_to_cpg_scores
# output dir of src/phase_meth_to_founder_haps.py
OUTPUT_DIR = Path(f"/scratch/ucgd/lustre-labs/quinlan/data-shared/dna-methylation/CEPH1463.GRCh38.hifi.{PB_CPG_TOOL_MODE}.founder-phased")
OUTPUT_DIR

PosixPath('/scratch/ucgd/lustre-labs/quinlan/data-shared/dna-methylation/CEPH1463.GRCh38.hifi.model.founder-phased')

In [6]:
from read_data import read_data

def get_methylation_sample(uid): 
    return (
        read_data(OUTPUT_DIR, f"{uid}.dna-methylation.founder-phased")
        .select(['chrom', 'start', 'end', 'methylation_level_pat', 'methylation_level_mat', 'founder_haplotype_pat', 'founder_haplotype_mat'])
    )

get_methylation_sample(FATHER_UID).filter(pl.col("founder_haplotype_pat").is_not_null())

chrom,start,end,methylation_level_pat,methylation_level_mat,founder_haplotype_pat,founder_haplotype_mat
str,i64,i64,f64,f64,str,str
"""chr1""",689237,689238,0.228,0.927,"""E""","""G"""
"""chr1""",689434,689435,0.075,0.577,"""E""","""G"""
"""chr1""",690032,690033,0.889,0.905,"""E""","""G"""
"""chr1""",690091,690092,0.87,0.76,"""E""","""G"""
"""chr1""",690430,690431,0.761,0.045,"""E""","""G"""
…,…,…,…,…,…,…
"""chr9""",138216387,138216388,0.641,0.656,"""E""","""G"""
"""chr9""",138216400,138216401,0.89,0.927,"""E""","""G"""
"""chr9""",138216450,138216451,0.775,0.67,"""E""","""G"""
"""chr9""",138216484,138216485,0.658,0.547,"""E""","""G"""


In [23]:
def prefix_columns(df, prefix, join_keys):
    # add prefixes to all columns except the join keys
    return df.rename({col: f"{prefix}_{col}" for col in df.columns if col not in join_keys})

def get_methylation_trio(father_uid, mother_uid, child_uid):
    father_meth = get_methylation_sample(father_uid)
    mother_meth = get_methylation_sample(mother_uid)
    child_meth = get_methylation_sample(child_uid)
    
    father_prefixed = prefix_columns(father_meth, prefix="father", join_keys=['chrom', 'start', 'end'])
    mother_prefixed = prefix_columns(mother_meth, prefix="mother", join_keys=['chrom', 'start', 'end'])
    child_prefixed = prefix_columns(child_meth, prefix="child", join_keys=['chrom', 'start', 'end'])

    # Join on chrom, start, end
    return (
        father_prefixed
        .join(mother_prefixed, on=['chrom', 'start', 'end'], how='inner')
        .join(child_prefixed, on=['chrom', 'start', 'end'], how='inner')
    )

TRIO_METH = get_methylation_trio(FATHER_UID, MOTHER_UID, CHILD_UID)

TRIO_METH.filter(pl.col("father_founder_haplotype_pat").is_not_null())

chrom,start,end,father_methylation_level_pat,father_methylation_level_mat,father_founder_haplotype_pat,father_founder_haplotype_mat,mother_methylation_level_pat,mother_methylation_level_mat,mother_founder_haplotype_pat,mother_founder_haplotype_mat,child_methylation_level_pat,child_methylation_level_mat,child_founder_haplotype_pat,child_founder_haplotype_mat
str,i64,i64,f64,f64,str,str,f64,f64,str,str,f64,f64,str,str
"""chr1""",689434,689435,0.075,0.577,"""E""","""G""",0.764,0.695,"""I""","""K""",0.586,0.097,"""G""","""K"""
"""chr1""",690032,690033,0.889,0.905,"""E""","""G""",0.883,0.804,"""I""","""K""",0.858,0.957,"""G""","""K"""
"""chr1""",690091,690092,0.87,0.76,"""E""","""G""",0.796,0.866,"""I""","""K""",0.902,0.924,"""G""","""K"""
"""chr1""",690621,690622,0.785,0.544,"""E""","""G""",0.515,0.494,"""I""","""K""",0.588,0.857,"""G""","""K"""
"""chr1""",690673,690674,0.94,0.583,"""E""","""G""",0.75,0.904,"""I""","""K""",0.634,0.928,"""G""","""K"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chr9""",138216387,138216388,0.641,0.656,"""E""","""G""",0.646,0.918,"""I""","""K""",0.889,0.659,"""E""","""K"""
"""chr9""",138216400,138216401,0.89,0.927,"""E""","""G""",0.805,0.928,"""I""","""K""",0.748,0.848,"""E""","""K"""
"""chr9""",138216450,138216451,0.775,0.67,"""E""","""G""",0.845,0.546,"""I""","""K""",0.903,0.399,"""E""","""K"""
"""chr9""",138216484,138216485,0.658,0.547,"""E""","""G""",0.652,0.774,"""I""","""K""",0.91,0.429,"""E""","""K"""


## Computing a set of intervals in each of which all CpG sites share the same founder labels on each of the 6 haplotypes 

In [14]:
import bioframe as bf # https://bioframe.readthedocs.io/en/latest/index.html

def get_tiles(reference_genome, size):
    chromsizes = bf.fetch_chromsizes(db=reference_genome)
    windows = []
    for chrom, length in chromsizes.items():
        for start in range(0, length, size): # type: ignore
            end = start + size
            if end > length:
                continue
            windows.append({"chrom": chrom, "start": start, "end": end})

    return pl.DataFrame(windows)

get_tiles(REFERENCE_GENOME, size=1000)

chrom,start,end
str,i64,i64
"""chr1""",0,1000
"""chr1""",1000,2000
"""chr1""",2000,3000
"""chr1""",3000,4000
"""chr1""",4000,5000
…,…,…
"""chrM""",11000,12000
"""chrM""",12000,13000
"""chrM""",13000,14000
"""chrM""",14000,15000


In [10]:
def get_hap_map_blocks(uid):
    return (
        read_data(OUTPUT_DIR, f"{uid}.hap-map-blocks")
        .select(['chrom', 'start', 'end'])
    )

get_hap_map_blocks(FATHER_UID)

chrom,start,end
str,i64,i64
"""chr1""",668466,1052726
"""chr1""",1074443,1079281
"""chr1""",1108637,1265603
"""chr1""",1288373,1610923
"""chr1""",1613420,2370170
…,…,…
"""chr9""",137873339,137943538
"""chr9""",137976541,137977195
"""chr9""",138002413,138087950
"""chr9""",138111774,138174171


In [24]:
def get_intervals(reference_genome, size, father_uid, mother_uid, child_uid):
    tiles = get_tiles(reference_genome, size).to_pandas()
    father_blocks = get_hap_map_blocks(father_uid).to_pandas()
    mother_blocks = get_hap_map_blocks(mother_uid).to_pandas()
    child_blocks = get_hap_map_blocks(child_uid).to_pandas()

    # Intersect tiles and father blocks using bioframe.overlap
    tiles_father = (
        bf.overlap(
            tiles,
            father_blocks,
            how='inner',
            return_overlap=True
        )
        [["chrom", "overlap_start", "overlap_end"]]
        .rename(columns={"overlap_start": "start", "overlap_end": "end"})
    )

    # Intersect tiles_father and mother blocks using bioframe.overlap
    tiles_father_mother = (
        bf.overlap(
            tiles_father,
            mother_blocks,
            how='inner',
            return_overlap=True
        )
        [["chrom", "overlap_start", "overlap_end"]]
        .rename(columns={"overlap_start": "start", "overlap_end": "end"})
    )

    # Intersect tiles_father_mother and child blocks using bioframe.overlap
    tiles_father_mother_child = (
        bf.overlap(
            tiles_father_mother,
            child_blocks,
            how='inner',
            return_overlap=True
        )
        [["chrom", "overlap_start", "overlap_end"]]
        .rename(columns={"overlap_start": "start", "overlap_end": "end"})
    )

    return (
        pl
        .from_pandas(tiles_father_mother_child)
        .filter(pl.col("end") - pl.col("start") == size)
    ) 

INTERVALS = get_intervals(REFERENCE_GENOME, size=1000, father_uid=FATHER_UID, mother_uid=MOTHER_UID, child_uid=CHILD_UID)
INTERVALS

chrom,start,end
str,i64,i64
"""chr17""",119000,120000
"""chr17""",120000,121000
"""chr17""",121000,122000
"""chr17""",122000,123000
"""chr17""",123000,124000
…,…,…
"""chr10""",133635000,133636000
"""chr10""",133636000,133637000
"""chr10""",133637000,133638000
"""chr10""",133638000,133639000


## XXX

In [None]:
def f(trio_meth, intervals): 
    return bf.overlap(
        trio_meth.to_pandas(),
        intervals.to_pandas(),
        how='inner',
        return_overlap=False
    )

f(TRIO_METH, INTERVALS)


Unnamed: 0,chrom,start,end,father_methylation_level_pat,father_methylation_level_mat,father_founder_haplotype_pat,father_founder_haplotype_mat,mother_methylation_level_pat,mother_methylation_level_mat,mother_founder_haplotype_pat,mother_founder_haplotype_mat,child_methylation_level_pat,child_methylation_level_mat,child_founder_haplotype_pat,child_founder_haplotype_mat,chrom_,start_,end_,overlap_start,overlap_end
0,chr17,130000,130001,0.938,0.954,E,G,0.823,0.946,I,K,0.873,0.885,G,K,chr17,130000,131000,130000,130001
1,chr17,139000,139001,0.954,0.939,E,G,0.949,0.941,I,K,0.928,0.893,G,K,chr17,139000,140000,139000,139001
2,chr17,142000,142001,0.917,0.969,E,G,0.932,0.952,I,K,0.915,0.910,G,K,chr17,142000,143000,142000,142001
3,chr17,797000,797001,0.948,0.927,E,G,0.907,0.926,I,K,0.957,0.955,G,K,chr17,797000,798000,797000,797001
4,chr17,978000,978001,0.261,0.584,E,G,0.431,0.580,I,K,0.365,0.757,G,K,chr17,978000,979000,978000,978001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13932432,chr10,133639311,133639312,0.878,0.942,E,G,0.937,0.945,I,K,0.934,0.889,G,K,chr10,133639000,133640000,133639311,133639312
13932433,chr10,133639660,133639661,0.937,0.931,E,G,0.938,0.934,I,K,0.935,0.946,G,K,chr10,133639000,133640000,133639660,133639661
13932434,chr10,133639705,133639706,0.953,0.946,E,G,0.950,0.938,I,K,0.957,0.959,G,K,chr10,133639000,133640000,133639705,133639706
13932435,chr10,133639968,133639969,0.941,0.951,E,G,0.958,0.890,I,K,0.942,0.535,G,K,chr10,133639000,133640000,133639968,133639969


In [None]:
# TODO 
# 2. group by those intervals AND founder haplotypes (which are now, by construction, all the same at all CpG sites in each interval -- check this), and aggregate the methylation levels 
# 3. for each record, use founder haplotypes to pair each of the child methylation levels to a parental methylation level, and compute the diff, as follows:  
#     i. determine which of father_founder_haplotype_pat, father_founder_haplotype_mat, mother_founder_haplotype_pat or mother_founder_haplotype_mat matches child_founder_haplotype_pat
#     ii. e.g., if it is father_founder_haplotype_pat, then use father_methylation_level_pat and child_founder_haplotype_pat to compute the diff of methylation levels, etc.  
#     iii. repeats steps i and ii for child_founder_haplotype_mat
# 4. order blocks by diff of methylation levels for child_founder_haplotype_pat and child_founder_haplotype_mat, 
#    and inspect the top-ranked blocks in IGV 
#    (another way of saying look for outliers in diff of methylation levels, i.e., windows whose diffs lie in the tail of the distribution)

## Searching for epimutations