In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import tqdm
import glob

from pathlib import Path


# Parse raw data to generate config tsv-s for snakemake

In [6]:
import gzip

def max_quality_in_first_10_reads(fastq_gz_file):
    max_quality = 0
    with gzip.open(fastq_gz_file, 'rt') as f:
        line_count = 0
        for line in f:
            line_count += 1
            # Every 4th line starting from the 4th is a quality score line
            if line_count % 4 == 0:
                # Convert ASCII characters to Phred quality scores
                qualities = [ord(char) - 33 for char in line.strip()]
                max_quality = max(max_quality, max(qualities))
                # Stop after processing 10 reads (40 lines total)
                if line_count >= 40:
                    break
    return max_quality

import gzip

def num_qualities_in_first_10_reads(fastq_gz_file):
    all_qualities = set()
    with gzip.open(fastq_gz_file, 'rt') as f:
        line_count = 0
        for line in f:
            line_count += 1
            # Every 4th line starting from the 4th is a quality score line
            if line_count % 4 == 0:
                # Convert ASCII characters to Phred quality scores
                qualities = [ord(char) - 33 for char in line.strip()]
                all_qualities |= set(qualities)
                # Stop after processing 10 reads (40 lines total)
                if line_count >= 40:
                    break
    return all_qualities



## CEPH data

### Files table

In [4]:
flow_cells_paths = list(Path("/lustre/scratch122/tol/projects/sperm/data/CEPH").glob("*/m*.filtered.fastq.gz"))

In [8]:
rows = []
for flow_cell_path in tqdm.tqdm(flow_cells_paths):
    # Get names
    sample_id = flow_cell_path.parts[-2]
    flow_cell_id = flow_cell_path.stem.split('.')[0]
    
    # Figure out type - in this case, if max BQ is 40, it's revio
    #print(sample_id, flow_cell_id, max_quality_in_first_10_reads(flow_cell_path), sorted(list(num_qualities_in_first_10_reads(flow_cell_path))))
    if (M := max_quality_in_first_10_reads(flow_cell_path)) == 40:
        flow_cell_type = "revio_40"
    elif M == 50:
        flow_cell_type = "revio_50"
    else:
        flow_cell_type = "sequel_ii"
    
    # Each sample set has a single sample
    row = [
        sample_id,
        sample_id,
        flow_cell_id,
        flow_cell_type,
        str(flow_cell_path),
    ]
    
    rows.append(row)

100%|██████████| 107/107 [00:02<00:00, 49.06it/s]


In [9]:
pd.DataFrame(
    rows,
    columns = ["sample_set", "sample_id", "flow_cell", "flow_cell_type", "path"],
).to_csv(
    "/nfs/users/nfs_r/rs42/rs42/git/sperm/configs/CEPH.tsv",
    index=False,
    header=True,
    sep="\t",
)

### Assemblies table

In [36]:
rows = []

for path in glob.glob("/lustre/scratch126/casm/team154pc/sl17/03.sperm/01.data/02.ccs/CEPH/assemblies/*/hifiasm/*/*.hifiasm.dip.hap1.p_ctg.gfa.fasta"):
    rows.append([Path(path).parents[2].name, path.replace("hap1", r"hap{haplotype}")])

for path in glob.glob("/lustre/scratch126/casm/team154pc/sl17/03.sperm/01.data/02.ccs/CEPH/assemblies/*/hifiasm/*//K1463_*_1.fasta.gz"):
    rows.append([Path(path).parents[2].name, path.replace("_1.fasta", r"_{haplotype}.fasta")])    
    
for path in glob.glob("/lustre/scratch126/casm/team154pc/sl17/03.sperm/01.data/02.ccs/CEPH/assemblies/*/verkko/*/assembly.haplotype1.fasta"):
    rows.append([Path(path).parents[2].name, path.replace("haplotype1", r"haplotype{haplotype}")])    

    
pd.DataFrame(
    rows,
    columns = ["sample_set", "fasta_wildcard_path"],
).to_csv(
    "/nfs/users/nfs_r/rs42/rs42/git/sperm/configs/CEPH_assemblies.tsv",
    index=False,
    header=True,
    sep="\t",
)

## Rahbari samples

In [7]:
flow_cells_paths = list(Path("/lustre/scratch122/tol/projects/sperm/data/Rahbari").glob("*/*/m*.filtered.fastq.gz"))

In [8]:
rahbari_sample_ids = [
    #
    # Sperm samples
    #
    "PD50477f",
    # "PD50508bf", -- ignore; merged two sampling dates just for phasing, but should be analyzed separately
    "PD50519d",
    # "PD47269d", -- don't use, not there
    "PD50508f",
    # "PD50511e", -- don't use, likely mixture
    "PD50523b",
    # "PD48473b", -- don't use, not there
    "PD50521b",
    "PD50508b",
    # "PD50521be", -- ignore; merged two sampling dates just for phasing, but should be analyzed separately
    "PD46180c",
    # "PD50502f", -- don't use, likely mixture
    "PD50521e",
    # "PD50511e_SS",  --- don't use
    "PD50489e",
    # 
    # Blood samples:
    #
    "PD47269d",
    "PD48473b",
]

sample_id_to_sample_set = {
    "PD50477f": "PD50477f",
#    "PD50519d": "PD46180c_PD50519d",
    "PD50519d": "PD50519d",
    "PD50508f": "PD50508b_PD50508f",
    "PD50523b": "PD50523b",
    "PD50521b": "PD50521b_PD50521e",
    "PD50508b": "PD50508b_PD50508f",
#    "PD46180c": "PD46180c_PD50519d",
    "PD46180c": "PD46180c",
    "PD50521e": "PD50521b_PD50521e",
    "PD50489e": "PD50489e",
    "PD47269d": "PD47269d",
    "PD48473b": "PD48473b",
}

In [9]:
rows = []
for flow_cell_path in tqdm.tqdm(flow_cells_paths):
    # Get names
    sample_id = flow_cell_path.parts[-3]
    if sample_id not in rahbari_sample_ids:
        continue
    sample_set = sample_id_to_sample_set[sample_id]
    flow_cell_id = flow_cell_path.stem.split('.')[0]
    
    # Figure out type - in this case, if max BQ is 40, it's revio
    #print(sample_id, flow_cell_id, max_quality_in_first_10_reads(flow_cell_path), sorted(list(num_qualities_in_first_10_reads(flow_cell_path))))
    if (M := max_quality_in_first_10_reads(flow_cell_path)) == 40:
        flow_cell_type = "revio_40"
    elif "02.revio" in str(flow_cell_path):
        flow_cell_type = "revio_unbinned"
    else:
        flow_cell_type = "sequel_ii"
    
    # Each sample set has a single sample
    row = [
        sample_set,
        sample_id,
        flow_cell_id,
        flow_cell_type,
        str(flow_cell_path),
        #sorted(list(num_qualities_in_first_10_reads(flow_cell_path)))
    ]
    
    rows.append(row)

100%|██████████| 69/69 [00:05<00:00, 12.28it/s]


In [37]:
with pd.option_context('display.max_rows', 10, 'display.max_columns', 10):
    display(pd.DataFrame(rows))

Unnamed: 0,0,1,2,3,4
0,PD50521b_PD50521e,PD50521b,m84093_241012_145529_s3,revio_40,/lustre/scratch122/tol/projects/sperm/data/Rah...
1,PD50521b_PD50521e,PD50521b,m84047_240924_113521_s4,revio_unbinned,/lustre/scratch122/tol/projects/sperm/data/Rah...
2,PD50521b_PD50521e,PD50521b,m64178e_240721_224443,sequel_ii,/lustre/scratch122/tol/projects/sperm/data/Rah...
3,PD50521b_PD50521e,PD50521b,m64229e_220912_125215,sequel_ii,/lustre/scratch122/tol/projects/sperm/data/Rah...
4,PD50521b_PD50521e,PD50521b,m64016e_220901_135237,sequel_ii,/lustre/scratch122/tol/projects/sperm/data/Rah...
...,...,...,...,...,...
55,PD46180c,PD46180c,m64178e_220824_153720,sequel_ii,/lustre/scratch122/tol/projects/sperm/data/Rah...
56,PD46180c,PD46180c,m64016e_220904_102424,sequel_ii,/lustre/scratch122/tol/projects/sperm/data/Rah...
57,PD46180c,PD46180c,m64094e_220916_121850,sequel_ii,/lustre/scratch122/tol/projects/sperm/data/Rah...
58,PD50508b_PD50508f,PD50508f,m64178e_220827_132801,sequel_ii,/lustre/scratch122/tol/projects/sperm/data/Rah...


In [38]:
pd.DataFrame(
    rows,
    columns = ["sample_set", "sample_id", "flow_cell", "flow_cell_type", "path"],
).to_csv(
    "/nfs/users/nfs_r/rs42/rs42/git/sperm/configs/Rahbari.tsv",
    index=False,
    header=True,
    sep="\t",
)

### Use previous Sequel-only assemblies in assemblies table

In [39]:
rows = []

for path in glob.glob("/lustre/scratch122/tol/projects/sperm/data/Rahbari/hifiasm_0.19.5-r592_assemblies/*/*.hap1.p_ctg.fasta"):
    old_joint = Path(path).parents[0].name
    sample_set = old_joint
    if old_joint == "PD50508bf":
        sample_set = "PD50508b_PD50508f"
    elif old_joint == "PD50521be":
        sample_set = "PD50521b_PD50521e"
    rows.append([sample_set, path.replace("hap1", r"hap{haplotype}")])

    
pd.DataFrame(
    rows,
    columns = ["sample_set", "fasta_wildcard_path"],
).to_csv(
    "/nfs/users/nfs_r/rs42/rs42/git/sperm/configs/Rahbari_assemblies.tsv",
    index=False,
    header=True,
    sep="\t",
)

### Sequel-only subset

In [7]:
(
    pd.DataFrame(
        rows,
        columns = ["sample_set", "sample_id", "flow_cell", "flow_cell_type", "path"],
    )
    .query("flow_cell_type == 'sequel_ii'")
    .to_csv(
        "/nfs/users/nfs_r/rs42/rs42/git/sperm/configs/Rahbari_sequel_only.tsv",
        index=False,
        header=True,
        sep="\t",
    )
)

### Revio-only subset

#### ALL

In [8]:
(
    pd.DataFrame(
        rows,
        columns = ["sample_set", "sample_id", "flow_cell", "flow_cell_type", "path"],
    )
    .query("flow_cell_type != 'sequel_ii'")
    .to_csv(
        "/nfs/users/nfs_r/rs42/rs42/git/sperm/configs/Rahbari_revio_only.tsv",
        index=False,
        header=True,
        sep="\t",
    )
)

#### Two of each, to make assembly possible

In [12]:
(
    pd.DataFrame(
        rows,
        columns = ["sample_set", "sample_id", "flow_cell", "flow_cell_type", "path"],
    )
    .query("flow_cell_type == 'revio_unbinned'")
    .groupby(["sample_id"]).head(2)
    .to_csv(
        "/nfs/users/nfs_r/rs42/rs42/git/sperm/configs/Rahbari_revio_only_subset.tsv",
        index=False,
        header=True,
        sep="\t",
    )
)

### Revio-only Assemblies 

In [11]:
rows = []

for path in glob.glob("/lustre/scratch122/tol/projects/sperm/results/Rahbari_revio_only_subset_20250114/assemblies/*/haplotype_1.fasta"):
    sample_set = Path(path).parents[0].name
    rows.append([sample_set, path.replace("haplotype_1", r"haplotype_{haplotype}")])

    
pd.DataFrame(
    rows,
    columns = ["sample_set", "fasta_wildcard_path"],
).to_csv(
    "/nfs/users/nfs_r/rs42/rs42/git/sperm/configs/Rahbari_revio_only_assemblies.tsv",
    index=False,
    header=True,
    sep="\t",
)

## Sudmant

In [43]:
flow_cells_paths = list(Path("/lustre/scratch122/tol/projects/sperm/data/PeterSudmant").glob("*/m*.filtered.fastq.gz"))

In [45]:
sudmant_sample_ids = [    
    "1894",
    "3898",
    "7899",
    "95619",
    "115980",
    "2895",
    # "105621", -- weird assembly, takes a long time, errors, and CO reads are x5 what they should be
    # "6901",  --- weird assembly, maybe contamination?
    # "8900",  --- surprisingly low coverage
]

In [48]:
rows = []
for flow_cell_path in tqdm.tqdm(flow_cells_paths):
    # Get names
    sample_id = flow_cell_path.parts[-2]
    if sample_id not in sudmant_sample_ids:
        continue
    flow_cell_id = flow_cell_path.stem.split('.')[0]
        
        
    # Each sample set has a single sample
    row = [
        sample_id,
        sample_id,
        flow_cell_id,
        "revio_40",   # They are all revio with max 40
        str(flow_cell_path),
#        sorted(list(num_qualities_in_first_10_reads(flow_cell_path)))
    ]
    
    rows.append(row)

100%|██████████| 20/20 [00:00<00:00, 106589.68it/s]


In [49]:
pd.DataFrame(
    rows,
    columns = ["sample_set", "sample_id", "flow_cell", "flow_cell_type", "path"],
).to_csv(
    "/nfs/users/nfs_r/rs42/rs42/git/sperm/configs/Sudmant.tsv",
    index=False,
    header=True,
    sep="\t",
)