In [157]:
"""Plot PCA representation for various datasets."""
# pylint: disable=redefined-outer-name,use-dict-literal,import-error

'Plot PCA representation for various datasets.'

## SETUP

In [158]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [159]:
from __future__ import annotations

from pathlib import Path

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import skops.io as skio
from IPython.display import display  # pylint: disable=unused-import

from epi_ml.core.hdf5_loader import Hdf5Loader
from epi_ml.utils.notebooks.paper.paper_utilities import (
    ASSAY_ORDER,
    MetadataHandler,
    merge_epiatlas_CA_metadata,
)

In [160]:
# Core assays: the first seven entries of ASSAY_ORDER.
CORE_ASSAYS = ASSAY_ORDER[:7]

In [161]:
# Root of the paper outputs; data and figures live in its sub-directories.
base_dir = Path.home().joinpath("Projects/epiclass/output/paper")
paper_dir = base_dir
base_data_dir, base_fig_dir = base_dir / "data", base_dir / "figures"

# Stop right away when the figure directory is missing.
fig_dir_found = base_fig_dir.exists()
if not fig_dir_found:
    raise FileNotFoundError(f"Directory {base_fig_dir} does not exist.")

In [162]:
# Load the "v2" metadata through a handler rooted at the paper directory.
metadata_handler = MetadataHandler(paper_dir)
metadata_v2 = metadata_handler.load_metadata("v2")

In [163]:
# Chromosome sizes file handed to the HDF5 loader, with normalization switched on.
# NOTE(review): hdf5_loader is not referenced again in the visible cells — confirm
# it is still needed.
chromsize_path = base_data_dir / "chromsizes" / "hg38.noy.chrom.sizes"
hdf5_loader = Hdf5Loader(chrom_file=chromsize_path, normalization=True)

### Metadata setup

In [164]:
# Location of the tab-separated C-A table.
# NOTE(review): judging by the file name it holds C-A metadata plus predictions — confirm.
ca_pred_path = base_data_dir.joinpath(
    "training_results",
    "predictions",
    "C-A",
    "assay_epiclass",
    "CA_metadata_4DB+all_pred_subset.20240606_mod2.tsv",
)

# low_memory=False: read the whole file at once instead of in chunks.
ca_pred_df = pd.read_csv(ca_pred_path, sep="\t", low_memory=False)

In [165]:
# Name of the column holding each sample's plot label; its values are later split
# on the first "_" into a "DB" part and an "assay" part.
PLOT_LABEL = "plot_label"

In [167]:
# Combine the v2 metadata with the C-A predictions table.
# NOTE(review): later cells rely on the result being indexed by sample id and
# carrying a PLOT_LABEL column — presumably both come from this helper; confirm.
custom_metadata_df = merge_epiatlas_CA_metadata(metadata_v2, ca_pred_df)

### PCA results loading

In [168]:
# Load the two saved PCA artifacts (skops format): one bundle with the fitted
# model and sample names, one bundle with the transformed data.
pca_dir = base_data_dir / "pca"
pca_fit = skio.load(pca_dir / "IPCA_fit_n88777.skops")
pca_results = skio.load(pca_dir / "X_IPCA_n88777.skops")

In [169]:
# Transformed data stored under the "X_ipca" key.
# NOTE(review): presumably a (samples x components) matrix — confirm.
pca_data = pca_results["X_ipca"]

In [170]:
# Unpack the fitted model and the sample names saved alongside it.
ipca_fit = pca_fit["ipca_fit"]
pca_samples = pca_fit["file_names"]
# Per-component explained-variance ratio, used in the plot axis titles.
explained_variance = ipca_fit.explained_variance_ratio_

In [171]:
# Wrap the PCA data in a DataFrame whose columns are named "PC1", "PC2", ...
global_pca_df = pd.DataFrame(pca_data)
n_components = global_pca_df.shape[1]
global_pca_df.columns = [f"PC{rank}" for rank in range(1, n_components + 1)]

# Attach the sample identifiers.
# NOTE(review): assumes pca_samples is in the same row order as pca_data — confirm.
global_pca_df["sample_id"] = pca_samples

In [172]:
# Keep only the PCA rows whose sample id appears in the metadata index.
has_metadata = global_pca_df["sample_id"].isin(custom_metadata_df.index)
filtered_pca_df = global_pca_df.loc[has_metadata]

In [173]:
# Attach the metadata columns to each PCA row by matching "sample_id" against
# the metadata index (merge defaults to an inner join).
final_pca_df = filtered_pca_df.merge(
    custom_metadata_df, left_on="sample_id", right_index=True
)

In [174]:
# Split each plot label once, on its first "_", into a database part and an
# assay part. The vectorized string accessor replaces two row-wise
# apply/lambda passes that each re-split every label.
label_parts = final_pca_df[PLOT_LABEL].str.split("_", n=1)
db_names = label_parts.str[0]
assay_names = label_parts.str[1]

# A missing label, or one without "_", has no second part and shows up as NaN.
# Fail with an explicit message rather than plotting incomplete rows.
if assay_names.isna().any():
    raise ValueError(
        f"Some '{PLOT_LABEL}' values are missing or do not follow '<DB>_<assay>'."
    )

final_pca_df["DB"] = db_names
final_pca_df["assay"] = assay_names

In [175]:
# Sanity check: number of samples per source database.
final_pca_df["DB"].value_counts()

C-A         48669
epiatlas    20922
Name: DB, dtype: int64

## Plotting

In [176]:
# Destination for the PCA figures. It is created if absent so that the
# write_html / write_image calls do not fail on a missing directory.
# Parents are deliberately not created: a missing base_fig_dir should still error.
output_dir = base_fig_dir / "pca"
output_dir.mkdir(exist_ok=True)

In [189]:
# Subset of the samples whose assay is one of the core assays.
core_assay_df = final_pca_df[final_pca_df["assay"].isin(CORE_ASSAYS)]

In [190]:
# 3D scatter (PC1-PC3) of every sample, one trace per source database.
palette = px.colors.qualitative.Dark24
color_dict = {"C-A": palette[0], "epiatlas": palette[1]}

fig = go.Figure()
for db_label, color in color_dict.items():
    in_db = final_pca_df["DB"] == db_label
    filtered_df = final_pca_df.loc[in_db]

    # Hover text per point: "<sample id>: <assay> (<database>)".
    hover_text = [
        f"{sample}: {sample_assay} ({sample_db})"
        for sample, sample_assay, sample_db in zip(
            filtered_df["sample_id"], filtered_df["assay"], filtered_df["DB"]
        )
    ]

    trace = go.Scatter3d(
        x=filtered_df["PC1"],
        y=filtered_df["PC2"],
        z=filtered_df["PC3"],
        mode="markers",
        marker={"size": 1, "color": color, "opacity": 0.5},
        hovertemplate="%{text}",
        text=hover_text,
        name=f"{db_label} (N={len(filtered_df)})",
        showlegend=True,
    )
    fig.add_trace(trace)

# Axis titles include the explained-variance ratio of each component.
axis_titles = [
    f"PC {pc_idx + 1} ({explained_variance[pc_idx]:.2%})" for pc_idx in (0, 1, 2)
]

fig.update_layout(
    title="3D PCA - epiATLAS and ChiP-Atlas - all samples",
    scene={
        "xaxis_title": axis_titles[0],
        "yaxis_title": axis_titles[1],
        "zaxis_title": axis_titles[2],
    },
    # Constant legend marker size, independent of the 1px trace markers.
    legend={"itemsizing": "constant"},
)

fig.write_html(output_dir / "pca_all_samples_C-A_epiatlas_3D.html")

In [191]:
# 3D scatter (PC1-PC3) of the core-assay samples, one trace per source database.
palette = px.colors.qualitative.Dark24
color_dict = {"C-A": palette[0], "epiatlas": palette[1]}

fig = go.Figure()
for db_label, color in color_dict.items():
    in_db = core_assay_df["DB"] == db_label
    filtered_df = core_assay_df.loc[in_db]

    # Hover text per point: "<sample id>: <assay> (<database>)".
    hover_text = [
        f"{sample}: {sample_assay} ({sample_db})"
        for sample, sample_assay, sample_db in zip(
            filtered_df["sample_id"], filtered_df["assay"], filtered_df["DB"]
        )
    ]

    trace = go.Scatter3d(
        x=filtered_df["PC1"],
        y=filtered_df["PC2"],
        z=filtered_df["PC3"],
        mode="markers",
        marker={"size": 1, "color": color, "opacity": 0.5},
        hovertemplate="%{text}",
        text=hover_text,
        name=f"{db_label} (N={len(filtered_df)})",
        showlegend=True,
    )
    fig.add_trace(trace)

# Axis titles include the explained-variance ratio of each component.
axis_titles = [
    f"PC {pc_idx + 1} ({explained_variance[pc_idx]:.2%})" for pc_idx in (0, 1, 2)
]

fig.update_layout(
    title="3D PCA - epiATLAS and ChiP-Atlas - core7 samples",
    scene={
        "xaxis_title": axis_titles[0],
        "yaxis_title": axis_titles[1],
        "zaxis_title": axis_titles[2],
    },
    # Constant legend marker size, independent of the 1px trace markers.
    legend={"itemsizing": "constant"},
)

fig.write_html(output_dir / "pca_core7_C-A_epiatlas_3D.html")

In [192]:
# 3D scatter (PC1-PC3) of the core-assay samples, one trace per plot label.
#
# The color map is built from core_assay_df, the frame that is actually plotted.
# Building it from final_pca_df added an empty "N=0" legend entry for every
# label outside the core assays, and could index past the end of the palette.
palette = px.colors.qualitative.Dark24
color_dict = {
    # Wrap around the palette instead of raising IndexError on many labels.
    plot_label: palette[i % len(palette)]
    for i, plot_label in enumerate(core_assay_df[PLOT_LABEL].unique())
}

fig = go.Figure()
for plot_label, color in color_dict.items():
    filtered_df = core_assay_df[core_assay_df[PLOT_LABEL] == plot_label]
    fig.add_trace(
        go.Scatter3d(
            x=filtered_df["PC1"],
            y=filtered_df["PC2"],
            z=filtered_df["PC3"],
            mode="markers",
            marker=dict(
                size=1,
                color=color,
                opacity=0.5,
            ),
            hovertemplate="%{text}",
            # Hover text per point: "<sample id>: <assay> (<database>)".
            text=[
                f"{id_label}: {assay} ({db_label})"
                for id_label, assay, db_label in zip(
                    filtered_df["sample_id"],
                    filtered_df["assay"],
                    filtered_df["DB"],
                )
            ],
            name=f"{plot_label} (N={filtered_df.shape[0]})",
            showlegend=True,
        )
    )

# Axis titles include the explained-variance ratio of each component.
axis_titles = [f"PC {i+1} ({explained_variance[i]:.2%})" for i in range(3)]

fig.update_layout(
    title="3D PCA - epiATLAS and ChiP-Atlas - core7 samples",
    scene=dict(
        xaxis_title=axis_titles[0],
        yaxis_title=axis_titles[1],
        zaxis_title=axis_titles[2],
    ),
    # Constant legend marker size, independent of the 1px trace markers.
    legend={"itemsizing": "constant"},
)

fig.write_html(output_dir / "pca_core7_per_assay_C-A_epiatlas_3D.html")

In [193]:
# 2D scatter (PC1 vs PC2) of the core-assay samples, one trace per plot label.
#
# The color map is built from core_assay_df, the frame that is actually plotted.
# Building it from final_pca_df added an empty "N=0" legend entry for every
# label outside the core assays, and could index past the end of the palette.
palette = px.colors.qualitative.Dark24
color_dict = {
    # Wrap around the palette instead of raising IndexError on many labels.
    plot_label: palette[i % len(palette)]
    for i, plot_label in enumerate(core_assay_df[PLOT_LABEL].unique())
}

fig = go.Figure()
for plot_label, color in color_dict.items():
    filtered_df = core_assay_df[core_assay_df[PLOT_LABEL] == plot_label]
    fig.add_trace(
        go.Scatter(
            x=filtered_df["PC1"],
            y=filtered_df["PC2"],
            mode="markers",
            marker=dict(
                size=1,
                color=color,
                opacity=0.5,
            ),
            hovertemplate="%{text}",
            # Hover text per point: "<sample id>: <assay> (<database>)".
            text=[
                f"{id_label}: {assay} ({db_label})"
                for id_label, assay, db_label in zip(
                    filtered_df["sample_id"],
                    filtered_df["assay"],
                    filtered_df["DB"],
                )
            ],
            name=f"{plot_label} (N={filtered_df.shape[0]})",
            showlegend=True,
        )
    )

# Axis titles include the explained-variance ratio of each component.
axis_titles = [f"PC {i+1} ({explained_variance[i]:.2%})" for i in range(2)]

fig.update_layout(
    title="2D PCA - epiATLAS and ChiP-Atlas - core7 samples",
    xaxis_title=axis_titles[0],
    yaxis_title=axis_titles[1],
    # Constant legend marker size, independent of the 1px trace markers.
    legend={"itemsizing": "constant"},
)

# Save an interactive HTML version plus static PNG and SVG exports.
name = "pca_core7_per_assay_C-A_epiatlas_2D"
fig.write_html(output_dir / f"{name}.html")
fig.write_image(output_dir / f"{name}.png")
fig.write_image(output_dir / f"{name}.svg")

In [194]:
# 2D scatter (PC1 vs PC2) of the core-assay samples, one trace per source database.
palette = px.colors.qualitative.Dark24
color_dict = {"C-A": palette[0], "epiatlas": palette[1]}

fig = go.Figure()
for db_label, color in color_dict.items():
    in_db = core_assay_df["DB"] == db_label
    filtered_df = core_assay_df.loc[in_db]

    # Hover text per point: "<sample id>: <assay> (<database>)".
    hover_text = [
        f"{sample}: {sample_assay} ({sample_db})"
        for sample, sample_assay, sample_db in zip(
            filtered_df["sample_id"], filtered_df["assay"], filtered_df["DB"]
        )
    ]

    trace = go.Scatter(
        x=filtered_df["PC1"],
        y=filtered_df["PC2"],
        mode="markers",
        marker={"size": 1, "color": color, "opacity": 0.8},
        hovertemplate="%{text}",
        text=hover_text,
        name=f"{db_label} (N={len(filtered_df)})",
        showlegend=True,
    )
    fig.add_trace(trace)

# Axis titles include the explained-variance ratio of each component.
axis_titles = [
    f"PC {pc_idx + 1} ({explained_variance[pc_idx]:.2%})" for pc_idx in (0, 1)
]

fig.update_layout(
    title="2D PCA - epiATLAS and ChiP-Atlas - core7 samples",
    xaxis_title=axis_titles[0],
    yaxis_title=axis_titles[1],
    # Constant legend marker size, independent of the 1px trace markers.
    legend={"itemsizing": "constant"},
)

# Save an interactive HTML version, then static PNG and SVG exports.
name = "pca_core7_C-A_epiatlas_2D"
fig.write_html(output_dir / f"{name}.html")
for image_ext in ("png", "svg"):
    fig.write_image(output_dir / f"{name}.{image_ext}")