In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numba
import numba.typed

from pathlib import Path
import tqdm
import sys
import seaborn as sns
import scipy.stats
import os
import pysam
import pprint
import pickle
import joblib
import subprocess
import polars as pl
import io
import itertools
import warnings
import igv_notebook

from brokenaxes import brokenaxes


from adjustText import adjust_text


# This is needed before pybedtools to make sure bedtools is imported on sanger JupyterHub 
os.environ["PATH"] += ":" + os.path.join(sys.prefix, "bin")
import pybedtools

pd.set_option('display.max_rows', 1000)


In [3]:
sys.path.append(str(Path(os.getcwd()).parent))
from src import liftover, annotate, diagnostics, inference, dashboard

In [4]:
# Autosomes chr1..chr22, then the full list with the two sex chromosomes appended.
aut_chrom_names = [f"chr{number}" for number in range(1, 23)]
chrom_names = [*aut_chrom_names, "chrX", "chrY"]

In [5]:
# Samples to analyze. Excluded sample IDs are kept in place as comments,
# each followed by the reason it was left out.
sample_ids = [
    "PD50477f",
    # "PD50508bf", -- ignore; merged two sampling dates just for phasing, but should be analyzed separately
    "PD50519d",
    # "PD47269d", -- don't use, not there
    "PD50508f",
    # "PD50511e", -- don't use, likely mixture
    "PD50523b",
    # "PD48473b", -- don't use, not there
    "PD50521b",
    "PD50508b",
    # "PD50521be", -- ignore; merged two sampling dates just for phasing, but should be analyzed separately
    "PD46180c",
    # "PD50502f", -- don't use, likely mixture
    "PD50521e",
    # "PD50511e_SS",  --- don't use
    "PD50489e",
]



# Prepare PRDM9 alleles fasta

## Read diploids file from Alleva et al. 2021

In [6]:
# Load the PRDM9 genotype table as tab-separated text; lines starting with "#"
# are treated as comments and skipped.
# NOTE(review): ignore_errors=True asks polars to keep reading when some values
# fail to parse instead of raising -- presumably deliberate for this file; confirm
# that no needed values are silently lost.
diploids_df = pl.read_csv(
    "/lustre/scratch126/casm/team154pc/sl17/03.sperm/01.data/07.references/07.PRDM9/GSE166483_File_S1_PRDM9_genotypes.txt",
    comment_prefix="#",
    separator="\t",
    ignore_errors=True,
)

In [7]:
# Preview the loaded genotype table.
diploids_df

id,pop,diploid,homhet,allele_1,allele_2,allele_1_code,allele_2_code,size_1,size_2,seq_1,seq_2,nseqs,nzfseqs,is_child
str,str,str,str,str,str,str,str,i64,i64,str,str,i64,i64,bool
"""AA-428""","""OTH""","""A/A""","""hom""","""A""","""A""",""":A:B:C:D:D:E:C…",""":A:B:C:D:D:E:C…",13,13,"""TGTGGACAAGGTTT…","""TGTGGACAAGGTTT…",43209,1741,false
"""AN-472""","""OTH""","""A/Av:s:0053:M1…","""het""","""A""","""Av:s:0053:M1S:…",""":A:B:C:D:D:E:C…",""":A:B:C:D:D:E:C…",13,12,"""TGTGGACAAGGTTT…","""TGTGGACAAGGTTT…",2048,689,false
"""HG00171""","""FIN""","""A/A""","""hom""","""A""","""A""",""":A:B:C:D:D:E:C…",""":A:B:C:D:D:E:C…",13,13,"""TGTGGACAAGGTTT…","""TGTGGACAAGGTTT…",69673,1843,false
"""HG00173""","""FIN""","""A/A""","""hom""","""A""","""A""",""":A:B:C:D:D:E:C…",""":A:B:C:D:D:E:C…",13,13,"""TGTGGACAAGGTTT…","""TGTGGACAAGGTTT…",7307,1543,false
"""HG00174""","""FIN""","""A/A""","""hom""","""A""","""A""",""":A:B:C:D:D:E:C…",""":A:B:C:D:D:E:C…",13,13,"""TGTGGACAAGGTTT…","""TGTGGACAAGGTTT…",16394,1791,false
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""NA20826""","""TSI""","""A/A""","""hom""","""A""","""A""",""":A:B:C:D:D:E:C…",""":A:B:C:D:D:E:C…",13,13,"""TGTGGACAAGGTTT…","""TGTGGACAAGGTTT…",10016,1790,false
"""NA20827""","""TSI""","""A/A""","""hom""","""A""","""A""",""":A:B:C:D:D:E:C…",""":A:B:C:D:D:E:C…",13,13,"""TGTGGACAAGGTTT…","""TGTGGACAAGGTTT…",40719,1764,false
"""NA20828""","""TSI""","""A/A""","""hom""","""A""","""A""",""":A:B:C:D:D:E:C…",""":A:B:C:D:D:E:C…",13,13,"""TGTGGACAAGGTTT…","""TGTGGACAAGGTTT…",15287,1709,false
"""NA20831""","""TSI""","""A/A""","""hom""","""A""","""A""",""":A:B:C:D:D:E:C…",""":A:B:C:D:D:E:C…",13,13,"""TGTGGACAAGGTTT…","""TGTGGACAAGGTTT…",21953,1811,false


In [8]:
# Map each allele name to its sequence, taking the (allele_1, seq_1) pairs first
# and the (allele_2, seq_2) pairs second; a later pair with the same allele name
# overwrites the earlier value.
allele_to_seq = {
    allele_name: allele_seq
    for allele_column, seq_column in (("allele_1", "seq_1"), ("allele_2", "seq_2"))
    for allele_name, allele_seq in diploids_df.select(allele_column, seq_column).rows()
}

In [9]:
# Number of distinct allele names collected.
len(allele_to_seq)

74

## Write a fasta file

In [10]:
# Write every allele as a two-line FASTA record: a ">name" header line, then the sequence.
with open("/lustre/scratch126/casm/team154pc/sl17/03.sperm/01.data/07.references/07.PRDM9/GSE166483_PRDM9_genotypes.fasta", "w") as fasta_out:
    for allele_name, allele_seq in allele_to_seq.items():
        fasta_out.write(f">{allele_name}\n{allele_seq}\n")
    
    
    

# Analyze mapping results

In [11]:
def analyze_bam(focal_sample_id, haplotype):
    """Tabulate every alignment record in one sample/haplotype PRDM9 BAM file.

    Parameters
    ----------
    focal_sample_id : str
        Sample ID; used to build the BAM path and stored in the ``sample_id`` column.
    haplotype : int
        Haplotype number; used to build the BAM path and stored in the ``haplotype`` column.

    Returns
    -------
    pl.DataFrame
        One row per alignment record with columns ``sample_id``, ``haplotype``,
        ``PRDM9_allele`` (query name), ``PRDM9_allele_length`` (query length),
        ``reference_name``, ``reference_length``, ``reference_start`` and
        ``n_matches_bp`` (bases covered by "=" CIGAR operations).
        NOTE(review): ``n_matches_bp`` is only informative if the aligner wrote
        "="/"X" CIGAR operations rather than "M" -- confirm the minimap2 settings.
    """
    filename = f"/lustre/scratch126/casm/team154pc/sl17/03.sperm/02.results/07.prdm9/{focal_sample_id}/{focal_sample_id}.hap{haplotype}.minimap2.sorted.bam"

    # Index of the "=" operation within the CIGAR operation order; computed once
    # because it does not depend on the read.
    matching_field_index = "MIDNSHP=X".index("=")

    rows = []
    # The context manager closes the BAM handle once all records are read
    # (previously the file was opened inline and never closed).
    with pysam.AlignmentFile(filename) as bam:
        for aln in bam:
            # First element of get_cigar_stats() holds per-operation base counts.
            cigar_stats = aln.get_cigar_stats()[0]
            rows.append([
                focal_sample_id,
                haplotype,
                aln.query_name,
                aln.query_length,
                aln.reference_name,
                aln.reference_length,
                aln.reference_start,
                cigar_stats[matching_field_index],
            ])

    df = pl.DataFrame(
        rows,
        schema=["sample_id", "haplotype", "PRDM9_allele", "PRDM9_allele_length", "reference_name", "reference_length", "reference_start", "n_matches_bp"],
        # Each inner list is one record, so state the orientation explicitly
        # instead of relying on polars' inference.
        orient="row",
    )

    return df

# One frame per (sample, haplotype) pair, stacked vertically; rows containing
# any null value are then dropped.
map_df = pl.concat(
    [
        analyze_bam(sample_id, haplotype)
        for sample_id, haplotype in itertools.product(sample_ids, [1, 2])
    ]
).drop_nulls()

# Check if flanking sequence matches

### From Alleva et al. 2021

In [12]:
# Expected sequences on either side of the allele. The line breaks inside the
# literals are layout only and are removed when the lines are joined.
flanking_5prime = "".join("""CACAGCCGTAATGACAAAACCAAAGGTCAAGAGATCA
AAGAAAGGTCCAAACTCTTGAATAAAAGGACATGGCAGA
GGGAGATTTCAAGGGCCTTTTCTAGCCCACCCAAAGGAC
AAATGGGGAGCTGTAGAGTGGGAAAAAGAATAATGGAA
GAAGAGTCCAGAACAGGCCAGAAAGTGAATCCAGGGAA
CACAGGCAAATTATTTGTGGGGGTAGGAATCTCAAGAAT
TGCAAAAGTCAAGTATGGAGAG""".splitlines())

flanking_3prime = "".join("""GATGAGTAAGTCATTAGTAATAAAACCTCATCTCAATA
GCCACAAAAAGACAAATGTGGTCACCACACACTTGCACA
CCCCAGCTGTGAGGTGGCTTCAGCGGAAGTCTGCTGAC
CCCTTATATTCCCCGAGAGTATAAAGAGATCGGAAATAAC
TGATTAAACAAATCCGCCACTTTCATGACTAGAGATGAG
GAAGAACAAGGGATAGTTCTGTAAGTGTTCGGGGGACAT
CAGCATGTGTGGTTCTTTC""".splitlines())

In [13]:
def confirm_flanking(focal_sample_id, haplotype, reference_name, reference_length, reference_start):
    """Count differences between the assembly and the expected flanking sequences.

    Fetches the bases directly before and directly after the interval
    ``[reference_start, reference_start + reference_length)`` from the
    haplotype-specific scaffold FASTA and compares them, base by base, with the
    module-level ``flanking_5prime`` and ``flanking_3prime`` strings.

    Parameters
    ----------
    focal_sample_id : str
        Sample ID; selects the scaffold directory.
    haplotype : int
        Haplotype number; selects the ``haplotype_{haplotype}`` sub-directory.
    reference_name : str
        Contig that contains the interval.
    reference_length : int
        Length of the interval on the contig.
    reference_start : int
        Start of the interval, passed straight to ``FastaFile.fetch`` as ``start``.

    Returns
    -------
    tuple[int, int]
        ``(n_diffs_5prime, n_diffs_3prime)``. Expected flank bases that would lie
        beyond either end of the contig are counted as differences.
    """
    ref_filename = Path("/lustre/scratch126/casm/team154pc/sl17/03.sperm/01.data/05.ragtag/03.haplotype_specific_scaffolds") \
        / f"{focal_sample_id}" / f"haplotype_{haplotype}" / "ragtag.scaffold.expanded.fasta"

    with pysam.FastaFile(ref_filename) as F:
        contig_length = F.get_reference_length(reference_name)

        # Clamp both windows to the contig so an interval near a contig end
        # yields a shorter flank instead of an invalid fetch request.
        upstream_start = max(0, reference_start - len(flanking_5prime))
        downstream_start = reference_start + reference_length
        downstream_end = min(contig_length, downstream_start + len(flanking_3prime))

        obs_flanking_5prime = ""
        if upstream_start < reference_start:
            obs_flanking_5prime = F.fetch(
                reference=reference_name,
                start=upstream_start,
                end=reference_start,
            )

        obs_flanking_3prime = ""
        if downstream_start < downstream_end:
            obs_flanking_3prime = F.fetch(
                reference=reference_name,
                start=downstream_start,
                end=downstream_end,
            )

    # The 5' flank touches the interval at its right end, so compare it from that
    # end (reversed); the 3' flank touches the interval at its left end.
    n_diffs_5prime = _count_flank_diffs(obs_flanking_5prime[::-1], flanking_5prime[::-1])
    n_diffs_3prime = _count_flank_diffs(obs_flanking_3prime, flanking_3prime)

    return (n_diffs_5prime, n_diffs_3prime)


def _count_flank_diffs(observed, expected):
    """Count mismatches between two strings aligned at index 0; expected bases with no observed partner count as mismatches."""
    mismatches = sum(obs_base != exp_base for obs_base, exp_base in zip(observed, expected))
    return mismatches + max(0, len(expected) - len(observed))

In [14]:
# Append the flank-difference counts to map_df.
# Any n_diffs_* columns left over from a previous run of this cell are dropped
# first, so re-running the cell does not fail on duplicate column names in the
# horizontal concat.
map_df = map_df.select(
    [name for name in map_df.columns if name not in ("n_diffs_5prime", "n_diffs_3prime")]
)
map_df = pl.concat(
    [
        map_df,
        (map_df
            # Select confirm_flanking's arguments by name, in its parameter order,
            # rather than by position in the full row.
            .select("sample_id", "haplotype", "reference_name", "reference_length", "reference_start")
            .map_rows(lambda row: confirm_flanking(*row))
            .rename({
                "column_0": "n_diffs_5prime",
                "column_1": "n_diffs_3prime",
            })
        ),
    ],
    how="horizontal",
)


In [15]:
# Keep only rows where PRDM9_allele_length equals both reference_length and n_matches_bp.
perfect_maps_df = map_df.filter(
    (pl.col("PRDM9_allele_length") == pl.col("reference_length"))
    & (pl.col("PRDM9_allele_length") == pl.col("n_matches_bp"))
)

In [16]:
# Show the perfect matches with the table row limit raised to 20 for this display only.
with pl.Config(tbl_rows=20) as cfg:
    display(perfect_maps_df)

sample_id,haplotype,PRDM9_allele,PRDM9_allele_length,reference_name,reference_length,reference_start,n_matches_bp,n_diffs_5prime,n_diffs_3prime
str,i64,str,i64,str,i64,i64,i64,i64,i64
"""PD50477f""",1,"""A""",1092,"""chr5_RagTag""",1092,23776409,1092,0,1
"""PD50477f""",2,"""A""",1092,"""chr5_RagTag""",1092,23659462,1092,0,2
"""PD50519d""",1,"""A""",1092,"""chr5_RagTag""",1092,24562580,1092,0,1
"""PD50519d""",2,"""A""",1092,"""chr5_RagTag""",1092,24762280,1092,0,1
"""PD50508f""",1,"""A""",1092,"""chr5_RagTag""",1092,23633360,1092,0,1
"""PD50508f""",2,"""A""",1092,"""chr5_RagTag""",1092,22353912,1092,0,2
"""PD50523b""",1,"""A""",1092,"""chr5_RagTag""",1092,23778983,1092,0,1
"""PD50523b""",2,"""D""",1176,"""chr5_RagTag""",1176,23826019,1176,0,2
"""PD50521b""",2,"""A""",1092,"""chr5_RagTag""",1092,23661621,1092,0,2
"""PD50508b""",1,"""A""",1092,"""chr5_RagTag""",1092,23633360,1092,0,1


### What is going on with the ones without a perfect match?

In [17]:
# Ten rows with the highest n_matches_bp for sample PD50521b, haplotype 1.
with pl.Config(tbl_rows=80, fmt_str_lengths=20) as cfg:
    display(
        map_df
        .filter((pl.col("sample_id") == "PD50521b") & (pl.col("haplotype") == 1))
        .sort("n_matches_bp", descending=True)
        .head(10)
    )

sample_id,haplotype,PRDM9_allele,PRDM9_allele_length,reference_name,reference_length,reference_start,n_matches_bp,n_diffs_5prime,n_diffs_3prime
str,i64,str,i64,str,i64,i64,i64,i64,i64
"""PD50521b""",1,"""Av:s:0024:M1S:A-A""",1176,"""chr5_RagTag""",1176,23747726,1174,0,1
"""PD50521b""",1,"""M14""",1428,"""chr5_RagTag""",1176,23747726,1174,0,1
"""PD50521b""",1,"""Cv:s:0196:M7S:C-C""",1260,"""chr5_RagTag""",1176,23747726,1165,0,1
"""PD50521b""",1,"""M16""",1260,"""chr5_RagTag""",1176,23747726,1165,0,1
"""PD50521b""",1,"""L6""",1260,"""chr5_RagTag""",1176,23747726,1164,0,1
"""PD50521b""",1,"""Cv:s:0391:M13S:C-A""",1260,"""chr5_RagTag""",1176,23747726,1164,0,1
"""PD50521b""",1,"""M20""",1260,"""chr5_RagTag""",1176,23747726,1163,0,1
"""PD50521b""",1,"""D""",1176,"""chr5_RagTag""",1176,23747726,1159,0,1
"""PD50521b""",1,"""L17""",1260,"""chr5_RagTag""",1176,23747726,1158,0,1
"""PD50521b""",1,"""L22""",1176,"""chr5_RagTag""",1176,23747726,1156,0,1


In [18]:
# Look up the stored sequence for allele "Av:s:0024:M1S:A-A".
allele_to_seq["Av:s:0024:M1S:A-A"]

'TGTGGACAAGGTTTCAGTGTTAAATCAGATGTTATTACACACCAAAGGACACATACAGGGGAGAAGCTCTACGTCTGCAGGGAGTGTGGGCGGGGCTTTAGCTGGAAGTCACACCTCCTCATTCACCAGAGGATACACACAGGGGAGAAGCCCTATGTCTGCAGGGAGTGTGGGCGGGGCTTTAGCTGGCAGTCAGTCCTCCTCACTCACCAGAGGACACACACAGGGGAGAAGCCCTATGTCTGCAGGGAGTGTGGGCGGGGCTTTAGCCGGCAGTCAGTCCTCCTCACTCACCAGAGGAGACACACAGGGGAGAAGCCCTATGTCTGCAGGGAGTGTGGGCGGGGCTTTAGCCGGCAGTCAGTCCTCCTCACTCACCAGAGGAGACACACAGGGGAGAAGCCCTATGTCTGCAGGGAGTGTGGGCGGGGCTTTAGCCGGCAGTCAGTCCTCCTCACTCACCAGAGGAGACACACAGGGGAGAAGCCCTATGTCTGCAGGGAGTGTGGGCGGGGCTTTAGCTGGCAGTCAGTCCTCCTCAGTCACCAGAGGACACACACAGGGGAGAAGCCCTATGTCTGCAGGGAGTGTGGGCGGGGCTTTAGCTGGCAGTCAGTCCTCCTCACTCACCAGAGGACACACACAGGGGAGAAGCCCTATGTCTGCAGGGAGTGTGGGCGGGGCTTTAGCAATAAGTCACACCTCCTCAGACACCAGAGGACACACACAGGGGAGAAGCCCTATGTCTGCAGGGAGTGTGGGCGGGGCTTTCGCGATAAGTCACACCTCCTCAGACACCAGAGGACACACACAGGGGAGAAGCCCTATGTCTGCAGGGAGTGTGGGCGGGGCTTTAGAGATAAGTCAAACCTCCTCAGTCACCAGAGGACACACACAGGGGAGAAGCCCTATGTCTGCAGGGAGTGTGGGCGGGGCTTTAGCAATAAGTCACACCTCCTCAGACACCAGAGGACACACACAGGGGAGAAGCCCTATGTC

# Look at assemblies to double check

In [125]:
def examine_assembly(
    focal_sample_id,
    haplotype,
    reference_name,
    reference_start,
    reference_length,
):
    """Open an IGV browser on one interval of a haplotype-specific assembly.

    The haplotype's scaffold FASTA is used as the reference and the matching
    primary-alignments BAM is loaded as a single alignment track, colored by the
    ``YC`` tag and sorted by the ``YD`` tag in descending order.

    Parameters
    ----------
    focal_sample_id : str
        Sample ID; selects the FASTA and BAM paths.
    haplotype : int
        Haplotype number; selects the FASTA and BAM paths.
    reference_name : str
        Contig to show.
    reference_start : int
        0-based start of the interval (the notebook passes the pysam-derived
        ``reference_start`` value).
    reference_length : int
        Length of the interval.

    Returns
    -------
    None
        The browser object is displayed as a side effect.
    """
    reference_fasta_filename = Path("/lustre/scratch126/casm/team154pc/sl17/03.sperm/01.data/05.ragtag/03.haplotype_specific_scaffolds") \
        / f"{focal_sample_id}" / f"haplotype_{haplotype}" / "ragtag.scaffold.expanded.fasta"

    bam_filename = Path("/lustre/scratch126/casm/team154pc/sl17/03.sperm/02.results/01.read_alignment/01.ccs/04.hifiasm/02.hifiasm_0.19.5-r592/02.chromosome_length_scaffolds/") \
            / f"{focal_sample_id}" / f"{focal_sample_id}.hap{haplotype}.minimap2.sorted.primary_alignments.bam"

    chrom = reference_name

    # IGV locus strings are 1-based and inclusive, while reference_start is
    # 0-based: the interval [start, start + length) is start+1 .. start+length.
    locus = f"{chrom}:{reference_start + 1}-{reference_start + reference_length}"

    igv_browser = igv_notebook.Browser(
        {
            "reference": {
                "id": "xxx",
                "name": "xxx",
                "fastaURL": str(reference_fasta_filename),
                "indexPath": str(reference_fasta_filename) + ".fai",
            },
            "locus": locus,
            "showCenterGuide": True,
            "tracks": [
                {
                    "name": "Local BAM",
                    "url": str(bam_filename),
                    "indexURL": str(bam_filename) + ".bai",
                    "format": "bam",
                    "type": "alignment",
                    "colorBy": "tag",
                    "colorByTag": "YC",
                    "maxHeight": 700,
                    "autoHeight": True,
                    "sort": {
                        "chr": chrom,
                        # NOTE(review): passed through unchanged; confirm which
                        # coordinate convention igv.js expects for the sort position.
                        "position": reference_start,
                        "option": "TAG",
                        "tag": "YD",
                        "direction": "DESC",
                    }
                }
            ]
        }
    )

    display(igv_browser)

In [133]:
# Show the perfect matches again, with the table row limit raised to 20 for this display only.
with pl.Config(tbl_rows=20) as cfg:
    display(perfect_maps_df)

sample_id,haplotype,PRDM9_allele,PRDM9_allele_length,reference_name,reference_length,reference_start,n_matches_bp,n_diffs_5prime,n_diffs_3prime
str,i64,str,i64,str,i64,i64,i64,i64,i64
"""PD50477f""",1,"""A""",1092,"""chr5_RagTag""",1092,23776409,1092,0,1
"""PD50477f""",2,"""A""",1092,"""chr5_RagTag""",1092,23659462,1092,0,2
"""PD50519d""",1,"""A""",1092,"""chr5_RagTag""",1092,24562580,1092,0,1
"""PD50519d""",2,"""A""",1092,"""chr5_RagTag""",1092,24762280,1092,0,1
"""PD50508f""",1,"""A""",1092,"""chr5_RagTag""",1092,23633360,1092,0,1
"""PD50508f""",2,"""A""",1092,"""chr5_RagTag""",1092,22353912,1092,0,2
"""PD50523b""",1,"""A""",1092,"""chr5_RagTag""",1092,23778983,1092,0,1
"""PD50523b""",2,"""D""",1176,"""chr5_RagTag""",1176,23826019,1176,0,2
"""PD50521b""",2,"""A""",1092,"""chr5_RagTag""",1092,23661621,1092,0,2
"""PD50508b""",1,"""A""",1092,"""chr5_RagTag""",1092,23633360,1092,0,1


In [137]:
# Open the IGV view for sample PD50521b, haplotype 1, on chr5_RagTag
# (start 23747726, length 1176).
examine_assembly("PD50521b", 1, "chr5_RagTag", 23747726, 1176)

<IPython.core.display.Javascript object>

<igv_notebook.browser.Browser at 0x147b00cd3d90>

In [141]:
# Extract the assembled sequence of one interval from the haplotype-specific scaffold FASTA.
focal_sample_id = "PD50521b"
haplotype = 1
reference_name = "chr5_RagTag"
reference_start = 23747726
reference_length = 1176

ref_filename = Path(
    "/lustre/scratch126/casm/team154pc/sl17/03.sperm/01.data/05.ragtag/03.haplotype_specific_scaffolds"
).joinpath(
    f"{focal_sample_id}",
    f"haplotype_{haplotype}",
    "ragtag.scaffold.expanded.fasta",
)

with pysam.FastaFile(ref_filename) as ref_fasta:
    new_allele = ref_fasta.fetch(
        reference=reference_name,
        start=reference_start,
        end=reference_start + reference_length,
    )

In [146]:
# Print the stored allele and the assembled sequence in aligned 50-base chunks
# (stored allele first), with a blank line after each pair.
for chunk_start in range(0, len(new_allele), 50):
    chunk = slice(chunk_start, chunk_start + 50)
    print(allele_to_seq["Av:s:0024:M1S:A-A"][chunk], new_allele[chunk], "", sep="\n")

TGTGGACAAGGTTTCAGTGTTAAATCAGATGTTATTACACACCAAAGGAC
TGTGGACAAGGTTTCAGTGTTAAATCAGATGTTATTACACACCAAAGGAC

ACATACAGGGGAGAAGCTCTACGTCTGCAGGGAGTGTGGGCGGGGCTTTA
ACATACAGGGGAGAAGCTCTACGTCTGCAGGGAGTGTGGGCGGGGCTTTA

GCTGGAAGTCACACCTCCTCATTCACCAGAGGATACACACAGGGGAGAAG
GCTGGAAGTCACACCTCCTCATTCACCAGAGGATACACACAGGGGAGAAG

CCCTATGTCTGCAGGGAGTGTGGGCGGGGCTTTAGCTGGCAGTCAGTCCT
CCCTATGTCTGCAGGGAGTGTGGGCGGGGCTTTAGCTGGCAGTCAGTCCT

CCTCACTCACCAGAGGACACACACAGGGGAGAAGCCCTATGTCTGCAGGG
CCTCACTCACCAGAGGACACACACAGGGGAGAAGCCCTATGTCTGCAGGG

AGTGTGGGCGGGGCTTTAGCCGGCAGTCAGTCCTCCTCACTCACCAGAGG
AGTGTGGGCGGGGCTTTAGCTGGCAGTCAGTCCTCCTCACTCACCAGAGG

AGACACACAGGGGAGAAGCCCTATGTCTGCAGGGAGTGTGGGCGGGGCTT
ACACACACAGGGGAGAAGCCCTATGTCTGCAGGGAGTGTGGGCGGGGCTT

TAGCCGGCAGTCAGTCCTCCTCACTCACCAGAGGAGACACACAGGGGAGA
TAGCCGGCAGTCAGTCCTCCTCACTCACCAGAGGAGACACACAGGGGAGA

AGCCCTATGTCTGCAGGGAGTGTGGGCGGGGCTTTAGCCGGCAGTCAGTC
AGCCCTATGTCTGCAGGGAGTGTGGGCGGGGCTTTAGCCGGCAGTCAGTC

CTCCTCACTCACCAGAGGAGACACACAGGGGAGAAGCCCTATGTCTGCAG
CTCCTCACTCACCAGAGGAGAC

In [147]:
# Print the full assembled sequence fetched from the scaffold FASTA.
print(new_allele)

TGTGGACAAGGTTTCAGTGTTAAATCAGATGTTATTACACACCAAAGGACACATACAGGGGAGAAGCTCTACGTCTGCAGGGAGTGTGGGCGGGGCTTTAGCTGGAAGTCACACCTCCTCATTCACCAGAGGATACACACAGGGGAGAAGCCCTATGTCTGCAGGGAGTGTGGGCGGGGCTTTAGCTGGCAGTCAGTCCTCCTCACTCACCAGAGGACACACACAGGGGAGAAGCCCTATGTCTGCAGGGAGTGTGGGCGGGGCTTTAGCTGGCAGTCAGTCCTCCTCACTCACCAGAGGACACACACAGGGGAGAAGCCCTATGTCTGCAGGGAGTGTGGGCGGGGCTTTAGCCGGCAGTCAGTCCTCCTCACTCACCAGAGGAGACACACAGGGGAGAAGCCCTATGTCTGCAGGGAGTGTGGGCGGGGCTTTAGCCGGCAGTCAGTCCTCCTCACTCACCAGAGGAGACACACAGGGGAGAAGCCCTATGTCTGCAGGGAGTGTGGGCGGGGCTTTAGCTGGCAGTCAGTCCTCCTCAGTCACCAGAGGACACACACAGGGGAGAAGCCCTATGTCTGCAGGGAGTGTGGGCGGGGCTTTAGCTGGCAGTCAGTCCTCCTCACTCACCAGAGGACACACACAGGGGAGAAGCCCTATGTCTGCAGGGAGTGTGGGCGGGGCTTTAGCAATAAGTCACACCTCCTCAGACACCAGAGGACACACACAGGGGAGAAGCCCTATGTCTGCAGGGAGTGTGGGCGGGGCTTTCGCGATAAGTCACACCTCCTCAGACACCAGAGGACACACACAGGGGAGAAGCCCTATGTCTGCAGGGAGTGTGGGCGGGGCTTTAGAGATAAGTCAAACCTCCTCAGTCACCAGAGGACACACACAGGGGAGAAGCCCTATGTCTGCAGGGAGTGTGGGCGGGGCTTTAGCAATAAGTCACACCTCCTCAGACACCAGAGGACACACACAGGGGAGAAGCCCTATGTCT