In [1]:
# Load IPython's autoreload extension and enable mode 2, so imported modules
# are reloaded before each cell runs (edits to local modules are picked up live).
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.patches as patches

from pathlib import Path
import tqdm
import sys
import seaborn as sns
import scipy.stats
import os
import joblib

import polars as pl
# Polars display options (per the polars Config docs): a negative row count
# lifts the row limit when printing tables, and string cells are shown up to
# 50 characters. The trailing `;` suppresses the cell's output.
pl.Config.set_tbl_rows(-1);
pl.Config.set_fmt_str_lengths(50);


In [3]:
# This is needed before importing pybedtools so that the bedtools executable is
# found on the Sanger JupyterHub: the environment's bin/ directory is added to PATH.
_env_bin_dir = os.path.join(sys.prefix, "bin")
_current_path = os.environ.get("PATH", "")
if _env_bin_dir not in _current_path.split(os.pathsep):
    # Append only once so that re-running this cell does not keep growing PATH;
    # `filter(None, ...)` avoids a leading separator when PATH was empty/unset.
    os.environ["PATH"] = os.pathsep.join(filter(None, [_current_path, _env_bin_dir]))
import pybedtools


In [4]:
# Put the parent of the current working directory on the import path so that
# the `src` package imported below can be found.
sys.path.append(os.fspath(Path.cwd().parent))
from src import liftover, annotate, diagnostics, inference
from src.IDs import *

In [75]:
%%time
# Build one lazy parquet scan per (sample, chromosome) pair, then stack them
# into a single lazy frame. The progress bar tracks the outer loop over samples.
per_chrom_scans = [
    pl.scan_parquet(
        f"/lustre/scratch126/casm/team154pc/sl17/03.sperm/02.results/01.read_alignment/01.ccs/04.hifiasm/02.hifiasm_0.19.5-r592/02.chromosome_length_scaffolds/{focal_sample_id}/reads/{chrom}_RagTag.certainty_0.95.all_reads_structure_annotated.parquet"
    )
    for focal_sample_id in tqdm.tqdm(sample_ids)
    for chrom in aut_chrom_names
]
reads_df = pl.concat(per_chrom_scans)

100%|██████████| 9/9 [00:03<00:00,  2.95it/s]

CPU times: user 148 ms, sys: 150 ms, total: 298 ms
Wall time: 3.07 s





## Subset of reads with at least 4 SNPs (only four because we focus on crossovers, COs)

In [80]:
%%time
# Columns carried into the small in-memory working subset.
columns_of_interest = [
    'read_name',
    'chrom',
    'sample_id',
    'grch37_reference_start',
    'grch37_reference_end',
    "grch37_chromosome_size_in_bp",
    "mid_quality_snp_positions",
    "between_mid_quality_snps_cM",
    "before_read_cM",
    "after_read_cM",
    "idx_transitions",
]

# Keep only reads that have both GRCh37 start and end coordinates.
has_grch37_coords = (
    pl.col("grch37_reference_start").is_not_null()
    & pl.col("grch37_reference_end").is_not_null()
)

# Materialise just the first 500 matching reads as a quick test set.
test_df = (
    reads_df
    .select(columns_of_interest)
    .filter(has_grch37_coords)
    .head(500)
    .collect()
)

CPU times: user 776 ms, sys: 172 ms, total: 948 ms
Wall time: 425 ms


In [42]:
# Pull the chr1 entry out of `annotate.rate_maps` (presumably a per-chromosome
# recombination rate map — confirm in src/annotate).
rate_map = annotate.rate_maps["chr1"]

In [81]:
# Preview the first ten rows of the collected subset.
test_df.head(10)

read_name,chrom,sample_id,grch37_reference_start,grch37_reference_end,grch37_chromosome_size_in_bp,mid_quality_snp_positions,between_mid_quality_snps_cM,before_read_cM,after_read_cM,idx_transitions
str,str,str,i64,i64,i32,list[i64],list[f64],f64,f64,list[i64]
"""m64178e_220829_002441/31917701/ccs""","""chr1""","""PD50477f""",57080451,57096052,249250621,[],,0.000202,0.000112,
"""m64230e_220906_003948/8128517/ccs""","""chr1""","""PD50477f""",55655495,55673769,249250621,[],,0.000625,0.000751,
"""m64230e_220906_003948/107806804/ccs""","""chr1""","""PD50477f""",42122479,42134593,249250621,[],,0.000329,0.000161,
"""m64230e_220906_003948/27853954/ccs""","""chr1""","""PD50477f""",85897457,85911642,249250621,"[703, 1004, … 13494]","[0.000009, 0.000004, … 0.00001]",0.000656,0.000105,
"""m64094e_220917_231317/171508784/ccs""","""chr1""","""PD50477f""",227673258,227687338,249250621,[10689],"[0.003215, 0.00101]",0.000815,0.011108,
"""m64230e_220906_003948/53085521/ccs""","""chr1""","""PD50477f""",21270608,21282758,249250621,"[4791, 11214]","[0.000082, 0.000066, 0.000008]",0.000793,4.1e-05,
"""m64178e_220829_002441/59771296/ccs""","""chr1""","""PD50477f""",57237238,57250383,249250621,"[1285, 3981, … 10668]","[0.000428, 0.00188, … 0.000697]",0.000893,0.044708,
"""m64178e_220829_002441/51513292/ccs""","""chr1""","""PD50477f""",208895949,208911281,249250621,[],,0.00012,0.003563,
"""m64094e_220917_231317/95814353/ccs""","""chr1""","""PD50477f""",50422570,50433494,249250621,[10352],"[0.000415, 0.000031]",8.9e-05,0.00027,
"""m64178e_220829_002441/78972673/ccs""","""chr1""","""PD50477f""",230380596,230395375,249250621,"[2663, 2924, … 11950]","[0.000021, 0.000002, … 0.00004]",4e-05,0.000105,


In [21]:
# Flatten the per-read SNP position arrays into one contiguous 1-D array.
snp_position_arrays = test_df["mid_quality_snp_positions"].to_numpy()
values = np.concatenate(snp_position_arrays)

In [23]:
# Total number of SNP positions across all reads after flattening.
len(values)

4340

In [11]:
# Number of SNP positions per read, as a flat 1-D array (one entry per row).
lens = (
    test_df
    .select(pl.col("mid_quality_snp_positions").list.len())
    .to_numpy()
    .reshape(-1)
)

In [40]:
# Inspect the per-read position arrays for the first ten reads
# (an object array holding one 1-D array per read).
test_df["mid_quality_snp_positions"].to_numpy()[:10]

array([array([], dtype=int64), array([], dtype=int64),
       array([], dtype=int64),
       array([  703,  1004,  1671,  1758,  1971,  3269,  3851,  3909,  4140,
               4587,  7526,  8926,  9356, 10169, 11192, 11327, 11753, 13320,
              13436, 13494])                                                ,
       array([10689]), array([ 4791, 11214]),
       array([ 1285,  3981,  5372,  5683,  5852,  6280,  6680,  8657,  8840,
               8947,  9050,  9355, 10668])                                  ,
       array([], dtype=int64), array([10352]),
       array([ 2663,  2924,  4561,  5753,  6696,  7631,  8193,  8284,  8404,
               9445, 10652, 11373, 11950])                                  ],
      dtype=object)

In [41]:
# Sanity check: flattening the per-read SNP position lists into `values` and
# re-splitting by the per-read lengths `lens` must reproduce each read's
# original array. Any read where the round trip fails is printed; no output
# means the check passed.
#
# Split points are the cumulative lengths *without* the final total, so the
# split yields one chunk per read rather than an extra trailing empty chunk.
split_points = np.cumsum(lens)[:-1]
original_arrays = test_df["mid_quality_snp_positions"].to_numpy()
roundtrip_arrays = np.array_split(values, split_points)

for i, (x, y) in enumerate(zip(roundtrip_arrays, original_arrays)):
    # Compare contents, not just lengths, so shifted or corrupted values are caught.
    if not np.array_equal(x, y):
        print(i, x, y)

In [7]:
# Load the Pratto et al. 2014 hotspot table: whitespace-delimited text with
# `#`-prefixed comment lines. `sep=r"\s+"` is the documented replacement for the
# deprecated `delim_whitespace=True` and parses the file the same way.
pratto = pd.read_csv(
    "/lustre/scratch126/casm/team154pc/sl17/03.sperm/01.data/07.references/06.hotspots/pratto_2014_hotspots.tsv",
    sep = r"\s+",
    comment = "#",
)

In [8]:
# Preview the first rows of the hotspot table.
pratto.head()

Unnamed: 0,chrom,start,end,AA1_strength,AA2_strength,AB1_strength,AB2_strength,AC_strength,AA1_hotspots,AA2_hotspots,...,A_hotspots_in_AC,C_hotspots,AA1_down,AA1_up,AA2_down,AA2_up,AB1_down,AB1_up,at_LCR,Stronger_than_Berg_et_al_Hotspots
0,chr1,12769,13928,9,20,6,23,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,chr1,15497,18631,145,130,144,83,98,1,1,...,1,0,0,0,1,0,0,0,0,0
2,chr1,36224,37432,33,17,18,16,12,1,0,...,1,0,0,0,0,0,0,0,0,0
3,chr1,38426,40527,158,203,183,56,192,1,1,...,1,0,0,0,0,0,0,0,0,0
4,chr1,90761,91987,8,16,32,0,22,0,0,...,0,0,0,0,0,0,0,0,1,0


In [11]:
# Hotspot intervals with AA1_strength as a fourth column, sorted by chromosome
# and start position.
hotspots_bed = pybedtools.BedTool.from_dataframe(
    pratto[["chrom", "start", "end", "AA1_strength"]].sort_values(["chrom", "start"])
)

# Read intervals (GRCh37 coordinates), sorted, converted to pandas for pybedtools.
read_spans_pd = (
    test_df
    .select("chrom", "grch37_reference_start", "grch37_reference_end")
    .sort("chrom", "grch37_reference_start", "grch37_reference_end")
    .to_pandas()
)
read_spans_bed = pybedtools.BedTool.from_dataframe(read_spans_pd)

# Intersect hotspots with the reads and print the first lines of the result.
hotspots_bed.intersect(read_spans_bed).head()

chr1	1517726	1520037	412
 chr1	1892101	1892581	166
 chr1	1892888	1894125	14
 chr1	1900372	1902441	256
 chr1	2068699	2070097	1
 chr1	2584705	2586012	0
 chr1	5858381	5859402	10
 chr1	6059391	6060994	77
 chr1	6064204	6066013	211
 chr1	6204691	6205582	10
 