In [1]:
%load_ext autoreload
%autoreload 2

In [13]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import tqdm
import glob

from pathlib import Path


# Parse raw data to generate config TSV files for Snakemake

In [28]:
import gzip

def max_quality_in_first_10_reads(fastq_gz_file):
    """Return the highest base quality found in the first 10 reads of a gzipped FASTQ.

    The file is opened in text mode and every 4th line is treated as a quality
    string, which assumes strict 4-line FASTQ records (no wrapped sequence or
    quality lines). Each character is decoded as ``ord(char) - 33``.

    Parameters
    ----------
    fastq_gz_file : str or path-like
        Gzip-compressed FASTQ file to inspect.

    Returns
    -------
    int
        The largest decoded quality over the inspected quality lines. The
        result is 0 when the file contains no complete record or when every
        inspected read has zero length.
    """
    lines_per_read = 4
    reads_to_check = 10
    last_line = lines_per_read * reads_to_check

    max_quality = 0
    with gzip.open(fastq_gz_file, 'rt') as f:
        for line_number, line in enumerate(f, start=1):
            # Only the 4th line of each record carries the quality string.
            if line_number % lines_per_read != 0:
                continue
            # default=0 keeps a zero-length read (empty quality line) from
            # raising ValueError on an empty sequence.
            line_max = max((ord(char) - 33 for char in line.strip()), default=0)
            max_quality = max(max_quality, line_max)
            # Stop as soon as the last wanted read has been processed.
            if line_number >= last_line:
                break
    return max_quality


## CEPH data

### Files table

In [29]:
# Collect the filtered FASTQ files found one directory level below the CEPH
# data directory. Glob does not guarantee any particular order, so the paths
# are sorted to keep the table built from them stable between runs.
flow_cells_paths = sorted(
    Path("/lustre/scratch122/tol/projects/sperm/data/CEPH").glob("*/m*.filtered.fastq.gz")
)

In [30]:
# Build one table row per FASTQ path in flow_cells_paths.
rows = []
for flow_cell_path in tqdm.tqdm(flow_cells_paths):
    # The sample is the directory holding the file; the flow cell id is the
    # part of the file name before its first dot.
    sample_id = flow_cell_path.parts[-2]
    flow_cell_id = flow_cell_path.stem.split('.')[0]

    # Instrument heuristic: a maximum base quality of exactly 40 in the first
    # reads is labelled revio, anything else sequel_ii.
    top_quality = max_quality_in_first_10_reads(flow_cell_path)
    flow_cell_type = "revio" if top_quality == 40 else "sequel_ii"

    # sample_id is listed twice because each sample set has a single sample.
    rows.append([sample_id, sample_id, flow_cell_id, flow_cell_type, str(flow_cell_path)])

100%|██████████| 40/40 [00:02<00:00, 14.28it/s]


In [31]:
# Write the collected rows as a tab-separated table with a header line and
# no index column.
ceph_files_columns = ["sample_set", "sample", "flow_cell", "flow_cell_type", "path"]
ceph_files_table = pd.DataFrame(rows, columns=ceph_files_columns)
ceph_files_table.to_csv(
    "/nfs/users/nfs_r/rs42/rs42/git/sperm/configs/CEPH.tsv",
    sep="\t",
    header=True,
    index=False,
)

### Assemblies table

In [34]:
# Build the assemblies table: one row per haplotype-1 FASTA found by the glob
# patterns below, with the haplotype number in the file name replaced by a
# "{haplotype}" placeholder.

# Each entry: (glob pattern matching the haplotype-1 FASTA,
#              haplotype-1 token in the file name,
#              token with the "{haplotype}" placeholder).
assembly_sources = [
    (
        "/lustre/scratch126/casm/team154pc/sl17/03.sperm/01.data/02.ccs/CEPH/assemblies/*/hifiasm/*/*.hifiasm.dip.hap1.p_ctg.gfa.fasta",
        "hap1",
        "hap{haplotype}",
    ),
    (
        "/lustre/scratch126/casm/team154pc/sl17/03.sperm/01.data/02.ccs/CEPH/assemblies/*/verkko/*/assembly.haplotype1.fasta",
        "haplotype1",
        "haplotype{haplotype}",
    ),
]

rows = []
for pattern, hap1_token, wildcard_token in assembly_sources:
    # glob returns matches in arbitrary order; sort so the table is stable.
    for path in sorted(glob.glob(pattern)):
        fasta = Path(path)
        # Substitute only the LAST occurrence of the token, and only inside the
        # file name: a plain str.replace on the whole path would also rewrite
        # any directory (or file-name prefix) that happens to contain it.
        prefix, _, suffix = fasta.name.rpartition(hap1_token)
        wildcard_name = prefix + wildcard_token + suffix
        # The sample set is the directory three levels above the FASTA file.
        rows.append([fasta.parents[2].name, str(fasta.with_name(wildcard_name))])

pd.DataFrame(
    rows,
    columns=["sample_set", "fasta_wildcard_path"],
).to_csv(
    "/nfs/users/nfs_r/rs42/rs42/git/sperm/configs/CEPH_assemblies.tsv",
    index=False,
    header=True,
    sep="\t",
)