In [None]:
"""
CNV Analysis Pipeline Module

This module is designed for the analysis of Copy Number Variations (CNVs) in genomic data.
It integrates global TCGA segment information with sample-specific and CNV signature data to generate signature BED files (about 20 of them).
These BED files are then used to intersect with predefined important genomic features to derive the distribution of signature hits versus random expectations.

Key Features:
- Transformation of global CNV segment data, separating segments from different signatures into BED file format for downstream analysis.
- Generation of random BED files for control comparisons, utilizing utility functions for random bed generation.
- Intersecting signature BED files with important genomic features to analyze the distribution of CNV signatures in relation to these features.
- Statistical analysis of intersections to determine the significance of observed distributions compared to random controls.

Output:
The pipeline produces signature BED files, intersects these with important genomic features,
and statistically analyzes the results to provide insights into the distribution of CNV signatures
in the context of SHAP values for NN cancer classifier.

Note: This module assumes access to a predefined set of input files and directories,
including lists of HDF5 files containing CNV data, and outputs data to specified locations for further analysis.
"""

In [None]:
# pylint: disable=import-error, subprocess-run-check, use-dict-literal

In [None]:
import re
import subprocess  # pylint: disable=unused-import
from pathlib import Path

import pandas as pd
import plotly.express as px  # pylint: disable=unused-import
import plotly.graph_objects as go
from IPython.display import display

from epi_ml.utils.bed_utils import create_new_random_bed  # pylint: disable=unused-import

### Matching CNV names to create subset

In [None]:
# Directory holding the CNV hdf5 list and the sample-name subset lists.
base = Path.home().joinpath("Projects/epiclass/input/hdf5_list/CNV")

full_hdf5_list_path = base.joinpath("CNV_100kb_all_none.list")

# subset_names_path = base / "list_EpiAtlas_cancer-type_TCGA_files_onlyLeukemia.txt"
subset_names_path = base.joinpath("list_EpiAtlas_cancer-type_TCGA.txt")

In [None]:
# Read the full hdf5 list: one entry per line, surrounding whitespace removed.
with open(full_hdf5_list_path, mode="r", encoding="utf8") as f:
    full_hdf5_list = [entry.strip() for entry in f]

In [None]:
# Keep the last path component of each entry, truncated at its first dot.
all_names = {
    hdf5_path.rsplit("/", maxsplit=1)[-1].split(".", maxsplit=1)[0]
    for hdf5_path in full_hdf5_list
}

In [None]:
# Read the subset of sample names: one name per line, surrounding whitespace removed.
with open(subset_names_path, mode="r", encoding="utf8") as f:
    subset_names = {entry.strip() for entry in f}

In [None]:
# Subset names that also appear in the full hdf5 list; print matched / unmatched counts.
ok_subset = subset_names.intersection(all_names)
n_matched = len(ok_subset)
print(n_matched, len(subset_names) - n_matched)

In [None]:
# new_100kb_list = [
#     f"/lustre07/scratch/rabyj/other_data/CNV/hdf5/100kb_all_none/{name}.segments.complete_100kb_all_none.hdf5"
#     for name in ok_subset
# ]

# with open(base / "CNV_EpiAtlas_cancer_onlyLeukemia_100kb_all_none.list", "w", encoding="utf8") as f:
#     f.write("\n".join(new_100kb_list))

### CNV signatures per sample

#### new random features = len(cancer_intersection_merge_samplings.bed)

Control BED files for the intersection results.

In [None]:
# Parameters of the random control beds (see the commented create_new_random_bed call).
HDF5_SIZE_100KB = 30321
desired_size = 336
resolution = 100_000

n_beds = 200
output_dir = Path.home().joinpath("Projects/epiclass/input/filter", "random_n336")

# create_new_random_bed(
#     HDF5_SIZE_100KB, desired_size, resolution, output_dir=output_dir, n_bed=n_beds
# )

#### Creating signatures

In [None]:
# Reload the sample-name subset so this section does not depend on the cells above.
base = Path.home().joinpath("Projects/epiclass/input/hdf5_list/CNV")
subset_names_path = base.joinpath("list_EpiAtlas_cancer-type_TCGA.txt")
with open(subset_names_path, mode="r", encoding="utf8") as f:
    subset_names = {entry.strip() for entry in f}

In [None]:
# Number of distinct sample names in the subset.
n_subset_names = len(subset_names)
print(n_subset_names)

In [None]:
# Tab-separated table of CNV signatures per sample.
dir_path = Path.home().joinpath("Projects/epiclass/input/CNV")
filepath = dir_path.joinpath("CNV_signatures_per_sample.txt")

df = pd.read_csv(filepath_or_buffer=filepath, sep="\t")

In [None]:
# Prefix chromosome names with "chr" (e.g. "1" -> "chr1").
# Any existing "chr" prefix is stripped first so that re-running this cell is
# idempotent instead of producing "chrchr1".
df["chr"] = "chr" + df["chr"].astype(str).str.replace(r"^chr", "", regex=True)

In [None]:
# filter for cancer type to match epiatlas training
# (shape is printed before and after to show how many rows were kept)
print(df.shape)
in_subset = df["sample"].isin(subset_names)
df = df.loc[in_subset]
print(df.shape)

In [None]:
# Number of distinct samples, then the total number of rows across all samples.
n_unique_samples = df["sample"].nunique()
rows_per_sample = df.groupby("sample").size()
display(n_unique_samples)
display(rows_per_sample.sum())

In [None]:
# Label of the sample subset; it is embedded in the signature file names and
# in the output paths built further down.
signature_subset_name = "epiatlas_cancer_types"

In [None]:
# Split the table by CNV signature and write, for each signature:
#   - a TSV with every column (header kept, "sample" moved to the last position)
#   - a headerless 3-column BED (chr, startpos, endpos)
grouped_CNV = df.groupby("CNsignatureMapping")
new_cols = list(df.columns)
new_cols.remove("sample")
new_cols = new_cols + ["sample"]

# to_csv does not create missing directories, so make sure the target exists.
signatures_out_dir = dir_path / "signatures"
signatures_out_dir.mkdir(parents=True, exist_ok=True)

for group in grouped_CNV.groups:
    # sort_values returns a new frame, so no explicit copy / inplace sort is needed.
    group_df = grouped_CNV.get_group(group).sort_values(["chr", "startpos", "endpos"])
    file_stem = f"signature_{group}_{signature_subset_name}"
    group_df.to_csv(
        signatures_out_dir / f"{file_stem}.tsv",
        sep="\t",
        index=False,
        header=True,
        columns=new_cols,
    )
    group_df.to_csv(
        signatures_out_dir / f"{file_stem}.bed",
        sep="\t",
        index=False,
        header=False,
        columns=["chr", "startpos", "endpos"],
    )

#### Intersect bed files with signatures

In [None]:
# Signature BED files of the current subset, in sorted path order.
signature_dir = Path.home().joinpath("Projects/epiclass/input/CNV/signatures")
bed_glob = f"*{signature_subset_name}.bed"
all_signatures = sorted(signature_dir.glob(bed_glob))
display(all_signatures)

In [None]:
# Map the 1-based position of each signature file in `all_signatures` to its
# signature name (the "CN<number>" part of the file stem). This mapping is used
# later to translate the `signature_index` column of the intersect results.
# The subset name is escaped so it is always matched literally.
pattern = r"(CN\d+)_" + re.escape(signature_subset_name)
output_index_dict = {}
for i, path in enumerate(all_signatures, start=1):
    match = re.search(pattern=pattern, string=path.stem)
    if match is None:
        # Fail with the offending file name instead of an AttributeError on None.
        raise ValueError(f"Cannot extract a CN signature name from '{path.name}'")
    output_index_dict[i] = match.group(1)
display(output_index_dict)

In [None]:
# Space-separated signature paths, in the same order as `all_signatures`.
joined_signatures = " ".join(map(str, all_signatures))

In [None]:
# Analysis outputs live next to the signatures directory, one folder per subset.
output_dir = signature_dir.parent.joinpath("signature_analysis", signature_subset_name)

In [None]:
# random beds

random_output_dir = output_dir.joinpath("random_n336")

# input_dir = Path.home() / "Projects/epiclass/input/filter" / "random_n336"
# for input_bed_path in input_dir.glob("*random_n336*.bed"):
#     input_bed_name = input_bed_path.name.split(".")[1]

#     output_path = (
#         random_output_dir
#         / f"{input_bed_name}_intersect_CNV_signatures_{signature_subset_name}.tsv"
#     )

#     subprocess.check_call(
#         f"bedtools intersect -C -f 0.5 -a {input_bed_path} -b {joined_signatures} > {output_path}",
#         shell=True,
#     )

In [None]:
# Important cancer features bed

# input_dir = (
#     Path.home()
#     / "scratch/epiclass/join_important_features/hg38_100kb_all_none/global_info/cancer"
# )
# input_bed_path = input_dir / "cancer_intersection_merge_samplings.bed"
# output_path = (
#     output_dir
#     / f"{input_bed_path.stem}_intersect_CNV_signatures_{signature_subset_name}.tsv"
# )

# subprocess.check_call(
#     f"bedtools intersect -C -f 0.5 -a {input_bed_path} -b {joined_signatures} > {output_path}",
#     shell=True,
# )

#### Compute statistics for specified features VS random features

In [None]:
# Per-file summary tables, keyed by intersect file stem (filled by the loop below).
results_dict = {}

intersect_glob = f"*intersect_CNV_signatures_{signature_subset_name}.tsv"

# Intersect results sitting directly in output_dir (Path.glob here is not recursive).
intersect_files = list(output_dir.glob(intersect_glob))
if not intersect_files:
    raise FileNotFoundError("No intersect files found")

# Intersect results of the random control beds. Check them separately: without
# them the random statistics cannot be computed, so fail here with a clear message.
random_intersect_files = list(random_output_dir.glob(intersect_glob))
if not random_intersect_files:
    raise FileNotFoundError(f"No random intersect files found in {random_output_dir}")
intersect_files.extend(random_intersect_files)

In [None]:
# display(intersect_files)

In [None]:
# Summarise every intersect file: total number of hits per signature.
for intersect_file in intersect_files:
    # print(intersect_file)
    # A dedicated name is used so the signature table `df` is not overwritten.
    intersect_df = pd.read_csv(intersect_file, sep="\t", header=None)
    intersect_df.columns = ["chr", "startpos", "endpos", "signature_index", "nb_hits"]
    # display(intersect_df.head())
    results = intersect_df.groupby("signature_index").agg({"nb_hits": "sum"})
    # Translate the signature index into its signature name.
    results["signature_name"] = results.index.map(output_index_dict)
    results = results.sort_values("nb_hits", ascending=False)
    # The summary is written next to its intersect file. The previous
    # `output_dir / <absolute path>` join resolved to this same path, because
    # joining onto an absolute path discards the left-hand side.
    results.to_csv(intersect_file.with_suffix(".summary.tsv"), sep="\t")

    # Graphical representation
    # fig = px.bar(results, x="signature_name", y="nb_hits", title=intersect_file.stem)
    # fig.update_layout(xaxis_title="Signature", yaxis_title="Number of hits")
    # fig.update_layout(yaxis=dict(range=[0,450*1000]))
    # fig.write_html(output_dir / intersect_file.with_suffix(".summary.html"))
    # fig.write_image(output_dir / intersect_file.with_suffix(".summary.png"))
    # fig.show()

    results_dict[str(intersect_file.stem)] = results

In [None]:
# Result sets coming from random beds are recognised by "random" in their name.
random_names = [key for key in results_dict if "random" in key]
n_beds = len(random_names)

# Compute the average of hits and stdev for random beds
random_frames = [results_dict[key] for key in random_names]
combined_df = pd.concat(random_frames, axis=0)
hits_by_signature = combined_df.groupby("signature_name")["nb_hits"]
stats = hits_by_signature.agg(["mean", "std"])

stats_filename = f"random_beds_stats_N{n_beds}_size{desired_size}.tsv"
stats.to_csv(output_dir / stats_filename, sep="\t")

In [None]:
# Compare values of important cancer features bed with random beds
non_random_names = [set_name for set_name in results_dict if "random" not in set_name]
if not non_random_names:
    # Same exception type as the previous bare `[0]`, but with an explanation.
    raise IndexError("No non-random intersect result found in results_dict")
selected_name = non_random_names[0]
cancer_df = results_dict[selected_name].set_index("signature_name")

# Random statistics in the row order of cancer_df. `.loc` with a list of labels
# raises KeyError when a signature has no random statistics.
random_stats = stats.loc[cancer_df.index]
random_mean = random_stats["mean"].to_numpy()
random_std = random_stats["std"].to_numpy()

# z-score of the observed hits against the random distribution, per signature.
cancer_df["z_score"] = (cancer_df["nb_hits"].to_numpy() - random_mean) / random_std
cancer_df["rnd_mean"] = random_mean
cancer_df["rnd_std"] = random_std

# cancer_df.to_csv(output_dir / "important_cancer_features_z_scores.tsv", sep="\t")

In [None]:
# # Graphical representation

# # Assign groups
# CN_groups = [
#     [f"CN{i}" for i in range(1,4)],
#     [f"CN{i}" for i in range(9,13)],
#     [f"CN{i}" for i in range(13,17)],
#     [f"CN{i}" for i in range(17, 18)],
#     [f"CN{i}" for i in range(18, 22)],
#     [f"CN{i}" for i in range(4, 9)],
#     ]

# CN_names = [
#     "CN1-CN3",
#     "CN9-CN12",
#     "CN13-CN16",
#     "CN17",
#     "CN18-CN21",
#     "CN4-CN8",
#     ]


# for i, group in enumerate(CN_groups):
#     cancer_df.loc[cancer_df.index.isin(group), 'group'] = CN_names[i]
# color_map = {name:px.colors.qualitative.Set1[i] for i, name in enumerate(CN_names)}


# # Create the figure
# fig = go.Figure()

# cancer_df = cancer_df.sort_values(['group', 'z_score'], ascending=[True, False])
# for group in cancer_df['group'].unique():
#     group_data = cancer_df[cancer_df['group'] == group]
#     fig.add_trace(go.Bar(
#         x=group_data.index,
#         y=group_data['z_score'],
#         name=group,
#         marker_color=color_map[group]
#     ))

# fig.update_layout(
#     title=f"epiatlas cancer types - Hits on top SHAP features vs {n_beds} Random feature selections",
#     xaxis_title="Signature",
#     yaxis_title="z-score",
#     barmode='group',
#     legend_title="Group"
# )

# # Add vertical lines to separate groups
# group_ends = cancer_df.groupby('group').apply(lambda x: x.index[-1])
# for end in group_ends[:-1]:
#     fig.add_vline(x=cancer_df.index.get_loc(end) + 0.5, line_dash="dash", line_color="gray")

# # Save the figure
# fig.show()
# # fig.write_image(output_dir / "important_cancer_features_z_scores.png")
# # fig.write_image(output_dir / "important_cancer_features_z_scores.svg")

In [None]:
# output_dir

In [None]:
# Graphical representation
# Inclusive (first, last) signature numbers of each group, in the original listing order.
CN_group_bounds = [(1, 3), (9, 12), (13, 16), (17, 17), (18, 21), (4, 8)]

# Signature names of each group, e.g. ["CN1", "CN2", "CN3"].
CN_groups = [
    [f"CN{number}" for number in range(first, last + 1)]
    for first, last in CN_group_bounds
]
# Label of each group: "CN<first>-CN<last>", or "CN<n>" for a single-signature group.
CN_names = [
    f"CN{first}" if first == last else f"CN{first}-CN{last}"
    for first, last in CN_group_bounds
]

# Assign groups to the DataFrame
# (signatures that belong to no listed group are labelled "Other")
group_of_signature = {
    signature: group_name
    for group_name, members in zip(CN_names, CN_groups)
    for signature in members
}
cancer_df["group"] = [
    group_of_signature.get(signature, "Other") for signature in cancer_df.index
]

# Create color map
color_map = dict(zip(CN_names, px.colors.qualitative.Set1))

# Sort groups
# (by decreasing median z-score)
median_by_group = cancer_df.groupby("group")["z_score"].median()
group_medians = median_by_group.sort_values(ascending=False)
sorted_CN_names = list(group_medians.index)

# Create the figure: for each signature group, one outlined box plus the
# individual signature z-scores drawn as red markers on top of it.
fig = go.Figure()

for group in sorted_CN_names:
    group_data = cancer_df[cancer_df["group"] == group]
    # Markers of the "CN17" group are drawn larger than the others.
    marker_size = 4 if group != "CN17" else 6

    # Add the box plot without points
    fig.add_trace(
        go.Box(
            y=group_data["z_score"],
            name=group,
            boxmean=True,
            boxpoints=False,  # Don't show points in the box plot
            line=dict(color="black"),
            fillcolor="rgba(255,255,255,0)",
            showlegend=False,
        )
    )

    # Add scatter plot for individual points
    fig.add_trace(
        go.Scatter(
            x=[group] * len(group_data),
            y=group_data["z_score"],
            mode="markers",
            marker=dict(
                color="red",
                size=marker_size,
            ),
            name=group,
            showlegend=False,
            text=group_data.index,  # Use CN names as hover text
            hoverinfo="text+y",  # Show CN name and y-value on hover
        )
    )

# Update layout
# The feature-set size comes from `desired_size` (also used for the random stats
# file name) instead of being hard-coded in the title.
fig.update_layout(
    title={
        "text": (
            f"Z-scores of top SHAP features (N={desired_size}) vs {n_beds} random "
            "feature sets of same size<br>on epiatlas cancer types"
        )
    },
    # NOTE(review): the x categories are CN signature groups, not cancer types;
    # confirm whether this axis label is intended.
    xaxis_title="Cancer Type Group",
    yaxis_title="Z-score",
)


# Add a horizontal line at y=0 for reference
fig.add_hline(y=0, line_color="grey", line_width=0.8)

# Show and save the figure
fig.show()
name = "important_cancer_features_z_scores_boxplot"
fig.write_image(output_dir / f"{name}.png")
fig.write_image(output_dir / f"{name}.svg")
fig.write_html(output_dir / f"{name}.html")