In [None]:
"""Transform TCGA CNV (copy number variant) segment info into a binary BED spanning the whole genome.
0 represents no CNV, i.e. nMajor == nMinor == 1 (also used for regions not covered by any segment)
1 represents a CNV, i.e. any other (nMajor, nMinor) combination
"""

# pylint: disable=redefined-outer-name, import-error, duplicate-code

In [81]:
from __future__ import annotations

import multiprocessing
import subprocess
from pathlib import Path

import pandas as pd

In [43]:
# Working folder, taken from the first item of the `_dh` global.
# NOTE(review): `_dh` is presumably IPython's directory history (first entry =
# directory the notebook was started in), so this only works inside
# IPython/Jupyter — confirm.
current_folder = Path(globals()["_dh"][0])

In [10]:
# Segment files are the `*.txt` files located directly under ./segments
# (the glob is not recursive).
segment_dir = current_folder / "segments"
segment_files = list(segment_dir.glob("*.txt"))

In [53]:
# Pick the first globbed file as a sample (raises IndexError when no file was found).
test_file = segment_files[0]

In [54]:
# Load the sample segment file as a tab-separated table.
# NOTE(review): presumably kept for interactive inspection only — the functions
# visible in this file read their own DataFrame.
df = pd.read_csv(test_file, sep="\t")

In [74]:
def to_bedgraph_df(df: pd.DataFrame) -> pd.DataFrame:
    """Convert a TCGA CNV segment table to bedgraph layout.

    The input is left untouched: a new DataFrame is built instead of adding
    helper columns to the caller's object.

    Args:
        df: Segment table with the columns "chr", "startpos", "endpos",
            "nMajor" and "nMinor".

    Returns:
        A new DataFrame with 4 columns (chrom, start, end, has_cnv), sharing
        the index of `df`. `chrom` is the "chr" value prefixed with "chr";
        `has_cnv` is 0 when nMajor == 1 and nMinor == 1, else 1.
    """
    # A segment is "normal" only when both allele copy numbers are exactly 1.
    is_normal = (df["nMajor"] == 1) & (df["nMinor"] == 1)

    return pd.DataFrame(
        {
            "chrom": "chr" + df["chr"].astype(str),
            "start": df["startpos"],
            "end": df["endpos"],
            "has_cnv": (~is_normal).astype(int),
        }
    )

In [75]:
def run_bedtools_genomecov(bg_file: str) -> pd.DataFrame:
    """Return the genome regions that have zero coverage in a bedgraph file.

    Runs `bedtools genomecov -bga` against the hg38 (no chrY) chrom sizes file
    and keeps only the output rows whose fourth column (coverage) equals 0.

    Args:
        bg_file (str): Path to the bedgraph file.

    Returns:
        pd.DataFrame: Columns "chrom", "start", "end" ("start"/"end" as int),
        one row per zero-coverage region. Empty (with the same columns) when
        bedtools reports no zero-coverage region.

    Raises:
        subprocess.CalledProcessError: If bedtools exits with a non-zero status.
        FileNotFoundError: If the bedtools executable cannot be found.
    """
    chrom_sizes_path = (
        Path.home() / "Projects/epiclass/input/chromsizes/hg38.noy.chrom.sizes"
    )
    cmd = [
        "bedtools",
        "genomecov",
        "-g",
        str(chrom_sizes_path),
        "-i",
        str(bg_file),
        "-bga",
    ]
    result = subprocess.run(cmd, capture_output=True, text=True, check=True)

    # Filter in Python rather than piping through awk: no extra process, and an
    # empty result no longer yields a bogus empty row that breaks the
    # DataFrame constructor.
    zero_cov_rows = []
    for line in result.stdout.splitlines():
        fields = line.split()
        if len(fields) >= 4 and float(fields[3]) == 0:
            zero_cov_rows.append(fields[:3])

    df = pd.DataFrame(zero_cov_rows, columns=["chrom", "start", "end"])
    # Fields were parsed from text; coordinates must be integers for sorting.
    df = df.astype({"start": int, "end": int})

    return df

In [79]:
def complete_bedgraph(bg_file: Path, bg_df: pd.DataFrame) -> pd.DataFrame:
    """
    Fill the gaps of a bedgraph with zero-valued regions and save the result.

    The regions without coverage are obtained from `run_bedtools_genomecov`,
    given has_cnv=0, appended to `bg_df`, and the whole table is sorted by
    (chrom, start, end). The sorted table is written, tab-separated and
    without header or index, next to `bg_file` as "<stem>.complete.bg".

    Args:
        bg_file (Path): Path to the bedgraph file on disk.
        bg_df (pd.DataFrame): Content of that bedgraph, with the columns
            chrom, start, end, has_cnv.

    Returns:
        pd.DataFrame: The sorted, gap-filled bedgraph table.
    """
    # Regions bedtools reports as uncovered carry no CNV by definition.
    uncovered = run_bedtools_genomecov(str(bg_file))
    uncovered["has_cnv"] = 0

    merged = pd.concat([bg_df, uncovered], ignore_index=True).sort_values(
        by=["chrom", "start", "end"]
    )

    out_path = bg_file.with_name(f"{bg_file.stem}.complete.bg")
    merged.to_csv(
        out_path,
        sep="\t",
        index=False,
        header=False,
        columns=["chrom", "start", "end", "has_cnv"],
    )

    return merged

In [80]:
def process_one_file(segment_file: Path) -> None:
    """Turn one segment file into a bedgraph, then into a gap-filled bedgraph.

    The intermediate bedgraph is written beside the input with a ".bg" suffix;
    `complete_bedgraph` then adds the uncovered regions with value 0.
    """
    bg_path = segment_file.with_suffix(".bg")
    segments = to_bedgraph_df(pd.read_csv(segment_file, sep="\t"))
    segments.to_csv(bg_path, sep="\t", index=False, header=False)
    complete_bedgraph(bg_path, segments)

In [82]:
def process_all_files(segment_files: list[Path]) -> None:
    """Run `process_one_file` on every segment file using a process pool."""
    workers = multiprocessing.Pool()
    with workers:
        # map() blocks until every file has been handled.
        workers.map(process_one_file, segment_files)

In [83]:
# Convert every segment file collected in `segment_files`.
process_all_files(segment_files)