In [1]:
import pathlib
import random
import shutil
import string
import subprocess
import tempfile
from concurrent.futures import ProcessPoolExecutor, as_completed

import pandas as pd

In [2]:
def sort_and_split_large_fragment_files(
    path, prefix, sort=True, cpu=1, mem_gb_per_core=3, barcode_col=4, max_rows=100000000, temp_dir="/tmp"
):
    """
    Split a large gzipped fragment file into smaller gzipped files, optionally sorted by barcode.

    The input is decompressed into a private scratch directory created inside
    ``temp_dir``, optionally sorted on the barcode column with ``sort``, cut
    into chunks of at most ``max_rows`` lines with ``split`` and compressed
    again. Each chunk is then moved next to the input file as
    ``{prefix}.<split suffix>.fragments.tsv.gz``. The scratch directory is
    removed on exit, whether the function succeeds or fails.

    Parameters
    ----------
    path : str
        Path to the fragment file.
    prefix : str
        Prefix for the split files.
    sort : bool, optional
        Sort the fragments by barcode. Default is True.
    cpu : int, optional
        Number of CPUs to use for sorting and compression. Default is 1.
    mem_gb_per_core : int, optional
        Memory in GB per core for sorting. Default is 3.
    barcode_col : int, optional
        Column index for the barcode in the fragments file, start from 1. Default is 4.
    max_rows : int, optional
        Maximum number of rows for each split file. Default is 100,000,000.
    temp_dir : str, optional
        Existing directory in which the scratch directory is created. Default is "/tmp".

    Returns
    -------
    None

    Raises
    ------
    subprocess.CalledProcessError
        If ``gunzip``, ``sort``, ``split`` or the compressor exits with a
        non-zero status.
    """
    path = pathlib.Path(path)

    # A private scratch directory replaces the random file-name prefix: names
    # cannot collide between concurrent calls and everything is cleaned up,
    # even when one of the external commands fails.
    with tempfile.TemporaryDirectory(dir=temp_dir) as work_dir:
        work_dir = pathlib.Path(work_dir)
        raw_tsv = work_dir / "unzipped.tsv"
        chunk_prefix = f"{work_dir}/{prefix}."

        # Commands are passed as argument lists (no shell), so paths that
        # contain spaces or shell metacharacters are handled correctly.
        with open(raw_tsv, "wb") as raw_handle:
            subprocess.check_call(["gunzip", "-c", str(path)], stdout=raw_handle)

        split_cmd = ["split", "-l", str(max_rows)]
        if sort:
            sort_cmd = [
                "sort",
                "-S",
                f"{int(cpu * mem_gb_per_core)}G",
                "-T",
                str(work_dir),
                "--compress-program",
                "gzip",
                "--parallel",
                str(cpu),
                f"-k{barcode_col},{barcode_col}",
                str(raw_tsv),
            ]
            pipe_split_cmd = split_cmd + ["-", chunk_prefix]
            # Run "sort | split" as two connected processes and check BOTH
            # exit codes; a shell pipeline only reports the status of the
            # last command, which would hide a failing sort.
            with subprocess.Popen(sort_cmd, stdout=subprocess.PIPE) as sorter:
                with subprocess.Popen(pipe_split_cmd, stdin=sorter.stdout) as splitter:
                    # Drop the parent's copy of the pipe so that sort sees a
                    # broken pipe if split exits early.
                    sorter.stdout.close()
                    split_status = splitter.wait()
                sort_status = sorter.wait()
            if sort_status != 0:
                raise subprocess.CalledProcessError(sort_status, sort_cmd)
            if split_status != 0:
                raise subprocess.CalledProcessError(split_status, pipe_split_cmd)
        else:
            subprocess.check_call(split_cmd + [str(raw_tsv), chunk_prefix])

        # Free the disk space taken by the uncompressed copy before compressing.
        raw_tsv.unlink()

        chunks = sorted(p for p in work_dir.iterdir() if p.name.startswith(f"{prefix}."))
        if chunks:
            # Prefer pigz (parallel); fall back to plain gzip when it is not installed.
            pigz = shutil.which("pigz")
            compress_cmd = [pigz, "-p", str(cpu)] if pigz else ["gzip"]
            subprocess.check_call(compress_cmd + [str(chunk) for chunk in chunks])

        # Stage every chunk next to the input under a "_temp" name first, so a
        # final file name only ever refers to a completely written file.
        staged_paths = []
        for chunk in chunks:
            staged = path.parent / f"{chunk.name}.fragments.tsv.gz_temp"
            shutil.move(f"{chunk}.gz", str(staged))
            staged_paths.append(staged)

    for staged in staged_paths:
        # Strip the trailing "_temp" marker.
        staged.rename(staged.with_name(staged.name[: -len("_temp")]))
    return

In [3]:
# Map each sample name (the file name without its trailing
# ".fragments.txt.gz") to the fragment file's path, then draw 16 of them
# at random.
frag_paths = pd.Series(
    dict(
        (fragment_file.name.rsplit(".", 3)[0], fragment_file)
        for fragment_file in pathlib.Path("/tempdata/Calderon2022Science/").glob("*.fragments.txt.gz")
    )
).sample(16)
frag_paths

exp2_hrs02-04_b2    /tempdata/Calderon2022Science/exp2_hrs02-04_b2...
exp2_hrs12-16_b1    /tempdata/Calderon2022Science/exp2_hrs12-16_b1...
exp2_hrs01-03_b2    /tempdata/Calderon2022Science/exp2_hrs01-03_b2...
exp1_hrs08-12_b1    /tempdata/Calderon2022Science/exp1_hrs08-12_b1...
exp2_hrs03-07_b2    /tempdata/Calderon2022Science/exp2_hrs03-07_b2...
exp1_hrs03-07_b1    /tempdata/Calderon2022Science/exp1_hrs03-07_b1...
exp2_hrs14-18_b1    /tempdata/Calderon2022Science/exp2_hrs14-18_b1...
exp1_hrs06-10_b1    /tempdata/Calderon2022Science/exp1_hrs06-10_b1...
exp1_hrs16-20_b1    /tempdata/Calderon2022Science/exp1_hrs16-20_b1...
exp2_hrs10-14_b1    /tempdata/Calderon2022Science/exp2_hrs10-14_b1...
exp1_hrs04-08_b1    /tempdata/Calderon2022Science/exp1_hrs04-08_b1...
exp2_hrs00-02_b2    /tempdata/Calderon2022Science/exp2_hrs00-02_b2...
exp1_hrs10-14_b1    /tempdata/Calderon2022Science/exp1_hrs10-14_b1...
exp2_hrs16-20_b1    /tempdata/Calderon2022Science/exp2_hrs16-20_b1...
exp1_hrs14-18_b1    

In [4]:
# Settings shared by every job; only the input path and the output prefix
# differ between samples.
job_options = dict(
    sort=True,
    cpu=16,
    mem_gb_per_core=3,
    barcode_col=4,
    max_rows=100000000,
    temp_dir="/tmp",
)

# Process the selected fragment files with up to 6 worker processes.
with ProcessPoolExecutor(max_workers=6) as exe:
    fs = []
    for sample, path in frag_paths.items():
        fs.append(exe.submit(sort_and_split_large_fragment_files, path=path, prefix=sample, **job_options))
    # Calling result() re-raises any exception that occurred in a worker.
    for f in as_completed(fs):
        f.result()