In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numba
import numba.typed

from pathlib import Path
import tqdm
import sys
import seaborn as sns
import scipy.stats
import os
import pysam
import pprint
import pickle
import joblib
import subprocess
import polars as pl
import io

# This is needed before pybedtools to make sure bedtools is imported on sanger JupyterHub 
os.environ["PATH"] += ":" + os.path.join(sys.prefix, "bin")
import pybedtools

pd.set_option('display.max_rows', 1000)


In [3]:
sys.path.append(str(Path(os.getcwd()).parent))
from src import liftover, annotate, diagnostics, inference

In [4]:
# Autosomes chr1..chr22 first, then the full list with the two sex chromosomes appended.
aut_chrom_names = ["chr" + str(number) for number in range(1, 23)]
chrom_names = [*aut_chrom_names, "chrX", "chrY"]

# Sample IDs

In [5]:
# Sample IDs included in the per-sample analyses of this notebook.
# Commented-out entries are deliberately excluded; the reason is noted next to each one.
sample_ids = [
    "PD50477f",
    # "PD50508bf", -- ignore; merged two sampling dates just for phasing, but should be analyzed separately
    "PD50519d",
    # "PD47269d", -- don't use, not there
    "PD50508f",
    # "PD50511e", -- don't use, likely mixture
    "PD50523b",
    # "PD48473b", -- don't use, not there
    "PD50521b",
    "PD50508b",
    # "PD50521be", -- ignore; merged two sampling dates just for phasing, but should be analyzed separately
    "PD46180c",
    # "PD50502f", -- don't use, likely mixture
    "PD50521e",
    # "PD50511e_SS",  --- don't use
    "PD50489e",
]



# Basic stats

## Read coverage

In [25]:
def load_depth(sample_id, chrom, n_every = 1):
    """Load the per-position read depth of one sample and chromosome.

    The gzipped, tab-separated depth file is decompressed with `zcat` and
    thinned with `awk` so that only every `n_every`-th line is kept.

    Parameters
    ----------
    sample_id : str
        Sample identifier; used to build the path of the depth file.
    chrom : str
        Chromosome name; used to build the path of the depth file.
    n_every : int, default 1
        Keep the lines whose 1-based line number is a multiple of this value
        (1 keeps every line).

    Returns
    -------
    polars.DataFrame
        Columns `chrom`, `pos_1based` and `depth`. The frame is empty (with
        those three columns) when no line was selected.

    Raises
    ------
    ValueError
        If `n_every` is not a positive integer.
    RuntimeError
        If the `zcat | awk` pipeline exits with a non-zero status.
    """
    if int(n_every) != n_every or n_every < 1:
        raise ValueError(f"n_every must be a positive integer, got {n_every!r}")
    n_every = int(n_every)

    depth_path = f"/lustre/scratch126/casm/team154pc/sl17/03.sperm/02.results/01.read_alignment/01.ccs/03.T2T-CHM13/{sample_id}/chm13.{sample_id}.{chrom}.depth.txt.gz"

    # The path and the stride are passed as positional arguments ($1, $2) instead of
    # being pasted into the command string, so unusual characters cannot change the
    # command. `pipefail` makes a failing `zcat` visible in the exit status; without
    # it only awk's status would be reported and a broken file would look like data.
    pipeline = 'zcat "$1" | awk -v n="$2" \'NR % n == 0\''
    res = subprocess.run(
        ["bash", "-o", "pipefail", "-c", pipeline, "load_depth", depth_path, str(n_every)],
        capture_output=True,
    )
    if res.returncode != 0:
        raise RuntimeError(
            f"Reading {depth_path} failed with exit status {res.returncode}: "
            f"{res.stderr.decode(errors='replace').strip()}"
        )

    column_names = ["chrom", "pos_1based", "depth"]

    # Nothing selected (e.g. fewer than n_every lines): return an empty, typed frame
    # instead of letting the CSV parser fail on empty input.
    if not res.stdout.strip():
        return pl.DataFrame(schema={"chrom": pl.Utf8, "pos_1based": pl.Int64, "depth": pl.Int64})

    # has_header=False: for n_every > 1 the first emitted line is always a data line,
    # and with the default has_header=True it was silently consumed as a header.
    # NOTE(review): this assumes the depth files themselves carry no header line
    # -- TODO confirm for the n_every == 1 case.
    df = pl.read_csv(
        io.BytesIO(res.stdout),
        separator="\t",
        has_header=False,
        new_columns=column_names,
    )

    return df

In [None]:
%%time
stats = []

def runme(sample_id, chrom, n_every):
    """Summarise the subsampled depth of one sample/chromosome pair.

    Returns `[sample_id, chrom, n_rows_loaded, mean_depth_of_those_rows]`.
    """
    depth_df = load_depth(sample_id, chrom, n_every)
    n_rows = len(depth_df)
    mean_depth = depth_df["depth"].mean()
    return [sample_id, chrom, n_rows, mean_depth]

# One task per (sample, autosome) pair, loading every 1000th line of each depth file.
stats = joblib.Parallel(n_jobs=-1, verbose=10)(
    joblib.delayed(runme)(sample_id, chrom, 1000)
    for sample_id in sample_ids
    for chrom in aut_chrom_names
)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed: 10.6min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 11.6min


In [95]:
# Collapse the per-chromosome statistics into one mean depth per sample.
# Each chromosome's mean is weighted by its number of loaded sites, so the result
# equals the mean over all loaded sites of that sample.
#
# Every element of `stats` is one record, so the orientation is stated explicitly
# rather than left to polars' inference from the shape and types of the data.
stats_df = (pl.DataFrame(stats, schema=["sample_id", "chrom", "n_sites", "mean_depth"], orient="row")
    .group_by("sample_id")
    .agg(
        n_total = pl.col("n_sites").sum(),
        weighted_sum = (pl.col("n_sites") * pl.col("mean_depth")).sum(),
    )
    .select(
        "sample_id",
        (pl.col("weighted_sum") / pl.col("n_total")).alias("mean_depth"),
    )
    .sort("mean_depth")
)

In [96]:
# Mean depth per sample, sorted in ascending order of depth.
stats_df

sample_id,mean_depth
str,f64
"""PD50508f""",15.652876
"""PD50508b""",20.623035
"""PD50489e""",22.238416
"""PD50519d""",23.319102
"""PD50523b""",24.503857
"""PD50521e""",25.114194
"""PD50477f""",25.343984
"""PD46180c""",31.714647
"""PD50521b""",33.258736


In [92]:
# Unweighted mean of the per-sample mean depths (one row per sample in `stats_df`);
# the string `sample_id` column has no mean and comes back as null.
stats_df.mean()

sample_id,mean_depth
str,f64
,24.640983


## Get read counts

In [31]:
# Per-sample read statistics, keyed by sample ID.
read_length_means = {}
read_length_stds = {}
read_numbers = {}

ccs_read_path = Path("/lustre/scratch126/casm/team154pc/sl17/03.sperm/01.data/02.ccs/")

for sample_id in sample_ids:
    count_path = ccs_read_path / sample_id / (sample_id + ".ccs.filtered.fastqc")

    # read_text() opens and closes the file itself; the previous bare
    # open(...).read() left the handle to be closed by the garbage collector.
    # split() without arguments already ignores leading/trailing whitespace.
    fields = count_path.read_text().split()
    if len(fields) < 5:
        raise ValueError(
            f"Expected at least 5 whitespace-separated fields in {count_path}, found {len(fields)}"
        )

    # Field positions used here: 1 -> number of reads, 3 -> mean read length,
    # 4 -> standard deviation of read length (fields 0 and 2 are not used).
    read_numbers[sample_id] = int(fields[1])
    read_length_means[sample_id] = float(fields[3])
    read_length_stds[sample_id] = float(fields[4])

In [34]:
# Number of reads per sample, as parsed from the per-sample `.ccs.filtered.fastqc` files.
display(read_numbers)

{'PD50477f': 7185241,
 'PD50519d': 5353906,
 'PD50508f': 3040994,
 'PD50523b': 4741878,
 'PD50521b': 6113622,
 'PD50508b': 3947659,
 'PD46180c': 5936129,
 'PD50521e': 6168195,
 'PD50489e': 6065965}

In [79]:
# Sum of the per-sample read counts collected in `read_numbers`.
print(
    "Total number of reads (before any processing):",
    np.array(list(read_numbers.values())).sum(),
)

Total number of reads (before any processing): 48553589


## Get read counts after filtering

In [29]:
# NOTE(review): a function named `runme` is also defined in the read-coverage cell;
# this definition replaces it once this cell has run.
def runme(focal_sample_id, chrom):
    """Count the distinct reads that pass the mapping filters for one sample/chromosome.

    Scans `<chrom>_RagTag.high_confidence_snps.parquet` in the sample's `reads`
    directory, keeps the rows where `mapq1` and `mapq2` are both at least 60 and
    `is_forward1` equals `is_forward2`, and counts the distinct `read_name` values.

    Parameters
    ----------
    focal_sample_id : str
        Sample identifier; used as a directory name.
    chrom : str
        Chromosome name; used as the file-name prefix.

    Returns
    -------
    list
        `[focal_sample_id, chrom, n_reads]`.
    """
    # NOTE(review): the file name is spelled `high_confidence_snps` here, whereas
    # `get_read_lengths` reads `high_confindence_snps` -- confirm which one exists on disk.
    all_reads_df = pl.scan_parquet(
        Path("/lustre/scratch126/casm/team154pc/sl17/03.sperm/02.results/01.read_alignment/01.ccs/04.hifiasm/02.hifiasm_0.19.5-r592/02.chromosome_length_scaffolds/")
            / f"{focal_sample_id}" / "reads" / f"{chrom}_RagTag.high_confidence_snps.parquet"
    )

    n_reads = (all_reads_df
        .filter(
            (pl.col("mapq1") >= 60) &
            (pl.col("mapq2") >= 60) &
            (pl.col("is_forward1") == pl.col("is_forward2"))
        )
        # One row per read name, so the row count below is the number of distinct reads.
        .unique("read_name")
    ).select(pl.len()).collect(streaming=True).item()

    return [focal_sample_id, chrom, n_reads]

# Run one counting task per (sample, autosome) pair on all available cores.
rows = joblib.Parallel(n_jobs=-1, verbose=10)(
    joblib.delayed(runme)(focal_sample_id, chrom) for focal_sample_id in sample_ids for chrom in aut_chrom_names
)

# Each element of `rows` is one record, so the orientation is stated explicitly
# rather than left to polars' inference from the shape and types of the data.
filtered_read_counts_df = pl.DataFrame(rows, schema=["sample_id", "chrom", "n_reads"], orient="row")

  0%|          | 0/9 [00:00<?, ?it/s]
  0%|          | 0/22 [00:00<?, ?it/s][A
  5%|▍         | 1/22 [00:02<00:45,  2.19s/it][A
  9%|▉         | 2/22 [00:08<01:26,  4.33s/it][A
 14%|█▎        | 3/22 [00:11<01:13,  3.88s/it][A
 18%|█▊        | 4/22 [00:15<01:11,  3.97s/it][A
 23%|██▎       | 5/22 [00:19<01:05,  3.85s/it][A
 27%|██▋       | 6/22 [00:21<00:55,  3.46s/it][A
 32%|███▏      | 7/22 [00:24<00:45,  3.06s/it][A
 36%|███▋      | 8/22 [00:27<00:43,  3.10s/it][A
 41%|████      | 9/22 [00:30<00:41,  3.18s/it][A
 45%|████▌     | 10/22 [00:34<00:39,  3.30s/it][A
 50%|█████     | 11/22 [00:37<00:36,  3.31s/it][A
 55%|█████▍    | 12/22 [00:40<00:32,  3.22s/it][A
 59%|█████▉    | 13/22 [00:43<00:27,  3.10s/it][A
 64%|██████▎   | 14/22 [00:46<00:25,  3.17s/it][A
 68%|██████▊   | 15/22 [00:50<00:22,  3.25s/it][A
 73%|███████▎  | 16/22 [00:52<00:17,  2.87s/it][A
 77%|███████▋  | 17/22 [00:55<00:15,  3.17s/it][A
 82%|████████▏ | 18/22 [00:59<00:12,  3.24s/it][A
 86%|██████

KeyboardInterrupt: 

## Get read lengths

Note: this is an approximation, because the lengths are computed only from mapped reads that carry SNPs rather than from all reads; this should not change the result much.

In [71]:
def get_read_lengths(focal_sample_id, chrom):
    """Summarise the read-length distribution of one sample/chromosome pair.

    Scans the `<chrom>_RagTag` SNP parquet file in the sample's `reads` directory,
    keeps one row per `read_name`, and aggregates the `read_length1` column.

    Returns
    -------
    polars.DataFrame
        A single row with the columns `n_reads`, `mean`, `q_0.01`, `q_0.05`,
        `q_0.95`, `q_0.99`, `median`, `sample_id` and `chrom`.
    """
    scaffold_name = chrom + "_RagTag"
    reads_dir = (
        Path("/lustre/scratch126/casm/team154pc/sl17/03.sperm/02.results/01.read_alignment/01.ccs/04.hifiasm/02.hifiasm_0.19.5-r592/02.chromosome_length_scaffolds/")
        / f"{focal_sample_id}"
        / "reads"
    )

    # NOTE(review): "confindence" is spelled this way in the file name that is read
    # here -- presumably it matches the files on disk; verify before "fixing" it.
    snps_lf = pl.scan_parquet(reads_dir / f"{scaffold_name}.high_confindence_snps.parquet")

    # One row per read, reduced to the length column only.
    per_read_lengths = snps_lf.unique(["read_name"]).select("read_length1")

    length = pl.col("read_length1")
    quantile_exprs = [
        length.quantile(q).alias(f"q_{q}") for q in (0.01, 0.05, 0.95, 0.99)
    ]

    summary = (
        per_read_lengths
        # A constant column of ones whose sum is the number of reads.
        .with_columns(one=pl.lit(1))
        .select(
            pl.col("one").sum().alias("n_reads"),
            length.mean().alias("mean"),
            *quantile_exprs,
            length.quantile(0.5).alias("median"),
        )
        .collect(streaming=True)
    )

    # Tag the one-row summary with the sample and chromosome it belongs to.
    return summary.with_columns(
        sample_id = pl.lit(focal_sample_id),
        chrom = pl.lit(chrom),
    )


In [74]:
%%time
# Stack the one-row read-length summaries of every (sample, autosome) pair.
# `tqdm` is imported as a module (`import tqdm`), so the progress bar is `tqdm.tqdm`;
# calling the bare module raises TypeError on a freshly started kernel.
read_stast_df = pl.concat([
    get_read_lengths(sample_id, chrom) for sample_id in tqdm.tqdm(sample_ids) for chrom in aut_chrom_names
])

100%|██████████| 9/9 [05:19<00:00, 35.52s/it]

CPU times: user 2min 26s, sys: 26.1 s, total: 2min 52s
Wall time: 5min 19s





In [80]:
# Read-length summary, one row per (sample, chromosome) pair.
read_stast_df

n_reads,mean,q_0.01,q_0.05,q_0.95,q_0.99,median,sample_id,chrom
i32,f64,f64,f64,f64,f64,f64,str,str
314848,13404.161186,9489.0,10253.0,18303.0,20581.0,12894.0,"""PD50477f""","""chr1"""
348598,13381.077533,9487.0,10252.0,18251.0,20569.0,12869.0,"""PD50477f""","""chr2"""
287600,13387.987789,9494.0,10251.0,18275.0,20567.0,12880.0,"""PD50477f""","""chr3"""
274577,13351.067966,9459.0,10242.0,18226.0,20540.0,12838.0,"""PD50477f""","""chr4"""
259776,13374.356411,9489.0,10247.0,18249.0,20522.0,12867.0,"""PD50477f""","""chr5"""
242835,13371.710853,9488.0,10242.0,18245.0,20507.0,12863.0,"""PD50477f""","""chr6"""
216974,13385.01638,9498.0,10263.0,18250.0,20555.0,12880.0,"""PD50477f""","""chr7"""
202229,13393.805775,9493.0,10251.0,18279.0,20524.0,12890.0,"""PD50477f""","""chr8"""
163839,13394.646763,9479.0,10249.0,18263.0,20520.0,12894.0,"""PD50477f""","""chr9"""
195565,13384.416174,9503.0,10251.0,18260.0,20533.0,12875.0,"""PD50477f""","""chr10"""


In [73]:
# Mean read length per sample, as parsed from the per-sample `.ccs.filtered.fastqc` files.
display(read_length_means)

{'PD50477f': 13465.900949042627,
 'PD50519d': 14053.125118184742,
 'PD50508f': 17011.82899703189,
 'PD50523b': 15500.8475274986,
 'PD50521b': 15827.801594374007,
 'PD50508b': 15688.517728354956,
 'PD46180c': 16380.781127903387,
 'PD50521e': 12234.025510380265,
 'PD50489e': 11014.38739689398}

In [38]:
# Mean read length over all samples, weighting each sample's mean by its number of reads.
print(
    "Average read length:",
    np.sum([
        read_numbers[sample_id] * read_length_means[sample_id]
        for sample_id in sample_ids
    ])
    / np.sum(list(read_numbers.values())),
)

Average read length: 14323.187104500143


# Candidate read classes

In [6]:
# Read the `*_RagTag.certainty_0.95.classified_reads.parquet` table of every
# (sample, autosome) pair and stack them into a single frame.
cls_df = pl.concat([
    pl.read_parquet(
        str(
            Path("/lustre/scratch126/casm/team154pc/sl17/03.sperm/02.results/01.read_alignment/01.ccs/04.hifiasm/02.hifiasm_0.19.5-r592/02.chromosome_length_scaffolds")
            / focal_sample_id
            / "reads"
            / (chrom + "_RagTag.certainty_0.95.classified_reads.parquet")
        )
    )
    for focal_sample_id in sample_ids
    for chrom in aut_chrom_names
])

# Keep only the classified reads that satisfy every strict criterion at once:
# no common transition, coverage of at least 3 between transitions on both
# haplotypes, both mapping qualities at least 60, matching orientation flags,
# at most 100 mismatches and at most 10 clipped bases.
very_trusty_cls_df = cls_df.filter(
    ~pl.col("has_common_transition")
    & (pl.col("min_coverage_between_transitions_hap1") >= 3)
    & (pl.col("min_coverage_between_transitions_hap2") >= 3)
    & (pl.col("mapq1") >= 60)
    & (pl.col("mapq2") >= 60)
    & (pl.col("is_forward1") == pl.col("is_forward2"))
    & (pl.col("total_mismatches") <= 100)
    & (pl.col("total_clipping") <= 10)
)

In [7]:
# Count the strictly filtered reads per sample and per read class: the pivot puts
# one row per `sample_id` and one column per `class` value, and
# aggregate_function='len' fills each cell with the number of matching rows.
call_stats_df = (very_trusty_cls_df    
    .pivot(
        values=["class"],
        index=["sample_id"],
        columns=["class"],
        aggregate_function='len'
    )
    .sort(by="sample_id")
    # Fixes the column order; this select requires all four class columns to exist.
    .select("sample_id", "CO", "GC", "ambiguous", "CNCO")
)

display(call_stats_df)

sample_id,CO,GC,ambiguous,CNCO
str,u32,u32,u32,u32
"""PD46180c""",261,74,131,45
"""PD50477f""",202,75,130,31
"""PD50489e""",180,53,81,24
"""PD50508b""",161,46,93,3
"""PD50508f""",130,31,78,8
"""PD50519d""",177,85,111,54
"""PD50521b""",233,91,160,62
"""PD50521e""",199,74,119,32
"""PD50523b""",208,85,119,19


In [8]:
# Column-wise totals over all samples; the string `sample_id` column comes back as null.
call_stats_df.sum()

sample_id,CO,GC,ambiguous,CNCO
str,u32,u32,u32,u32
,1751,614,1022,278


In [33]:
# NOTE(review): `read_names` is not assigned in any cell shown above this one --
# confirm it is defined before this cell when the notebook is run top to bottom.
read_names

['m64230e_220906_003948/60032580/ccs',
 'm64178e_220829_002441/25232978/ccs',
 'm64094e_220917_231317/155059512/ccs',
 'm64178e_220829_002441/62392396/ccs',
 'm64094e_220917_231317/96076942/ccs',
 'm64094e_220917_231317/74517145/ccs',
 'm64230e_220906_003948/52166986/ccs',
 'm64230e_220906_003948/11666271/ccs',
 'm64222e_220804_013319/56297546/ccs',
 'm64222e_220804_013319/41749151/ccs',
 'm64222e_220804_013319/45547932/ccs',
 'm64221e_220814_084225/164430634/ccs',
 'm64222e_220804_013319/43976808/ccs',
 'm64222e_220804_013319/47317891/ccs',
 'm64221e_220814_084225/86903563/ccs',
 'm64222e_220804_013319/22283946/ccs',
 'm64222e_220804_013319/116720068/ccs',
 'm64221e_220811_133409/30083306/ccs',
 'm64221e_220811_133409/86312411/ccs',
 'm64221e_220811_133409/57606928/ccs',
 'm64221e_220814_084225/42402180/ccs',
 'm64221e_220811_133409/156237854/ccs',
 'm64221e_220814_084225/135531048/ccs',
 'm64221e_220814_084225/61933374/ccs',
 'm64222e_220804_013319/52035891/ccs',
 'm64222e_220804_013