In [None]:
"""See markdown"""
# pylint: disable=line-too-long, redefined-outer-name, import-error, pointless-statement

This notebook analyzes the correlation values of a given sample with samples of other targets.

At 1kb, the target with the maximum correlation can generally be taken as the correct prediction, but targets k27ac and k4me3 are often mixed up with each other.

Is there any pattern in the correlation with other targets that could help us differentiate k27ac and k4me3?

In [None]:
from pathlib import Path

# import numpy as np
import pandas as pd
import plotly.graph_objects as go

In [None]:
# Location of the correlation workbook, resolved relative to the user's home directory.
path = Path.home().joinpath(
    "downloads",
    "temp",
    "avr_median_ca_epiatlas_2023_05_31--corr100kb.xlsx",
)

# Read the first sheet; its first row supplies the column labels and its first
# column supplies the row index.
df = pd.read_excel(path, header=0, index_col=0, sheet_name=0)

In [None]:
# Rows whose index is null carry no md5sum (they are extra computation rows),
# so only rows with a non-null index are kept.
has_md5sum = df.index.notna()
df = df.loc[has_md5sum]

In [None]:
# Keep only the rows whose "Source" mentions EpiAtlas.
# na=False makes a missing "Source" count as "no match"; without it the mask
# would contain NaN and the boolean indexing below would fail.
is_epiatlas = df["Source"].str.contains("EpiAtlas", na=False)
df_epiatlas = df[is_epiatlas]

# Remove columns that hold no value at all for the selected rows.
df_epiatlas = df_epiatlas.dropna(axis=1, how="all")

In [None]:
# Drop every column whose name contains "median".
# The mask must be computed on df_epiatlas.columns itself: df_epiatlas has had
# its all-NaN columns removed, so a mask built from df.columns can have a
# different length and either fail or select the wrong columns.
is_median_col = df_epiatlas.columns.str.contains("median", na=False)
to_drop = df_epiatlas.columns[is_median_col]
df_epiatlas = df_epiatlas.drop(to_drop, axis=1, errors="ignore")

In [None]:
# df_epiatlas.columns

In [None]:
# Select the rows whose "assay" value contains "k27ac".
# na=False treats a missing assay as "no match" so the boolean mask never
# contains NaN (which would make the row selection fail).
is_k27ac = df_epiatlas["assay"].str.contains("k27ac", na=False)
df_k27ac = df_epiatlas[is_k27ac]

In [None]:
# Bare last expression of the cell: shows the k27ac subset as the cell output.
df_k27ac

In [None]:
# Per-row difference between the k27ac and k4me3 average correlations.
target_diff = df_k27ac["h3k27ac_average"] - df_k27ac["h3k4me3_average"]

# Every "*average*" column except the two used to compute target_diff.
target_cols = df_k27ac.columns[df_k27ac.columns.str.contains("average")].drop(
    ["h3k4me3_average", "h3k27ac_average"]
)
cutoff = 0

# One figure per condition: rows where the difference is above +cutoff, then
# rows where it is below -cutoff. Rows in between (or with a NaN difference)
# appear in neither figure.
# The loop variable is deliberately NOT called `df`: reusing that name would
# overwrite the notebook-level frame loaded earlier, and re-running an earlier
# cell afterwards would silently operate on a k27ac subset.
for name, subset in zip(
    [f"k27ac-k4me3>{cutoff}", f"k27ac-k4me3<-{cutoff}"],
    [df_k27ac[target_diff > cutoff], df_k27ac[target_diff < -cutoff]],
):
    fig = go.Figure()
    print(subset.shape)

    # One violin per remaining target column.
    for target in target_cols:
        fig.add_trace(
            go.Violin(
                y=subset[target],
                name=target,
                box_visible=True,
                meanline_visible=True,
                points="all",
            )
        )

    fig.update_layout(
        title_text=f"k27ac mean correlation to core assays, {name}",
        yaxis_title="Correlation Value",
        xaxis_title="Target",
    )
    fig.update_yaxes(range=[-0.1, 0.5])

    fig.show()

In [None]:
# Per-row difference between the k27ac and k4me3 average correlations.
target_diff = df_k27ac["h3k27ac_average"] - df_k27ac["h3k4me3_average"]

# Every "*average*" column except the two used to compute target_diff.
target_cols = df_k27ac.columns[df_k27ac.columns.str.contains("average")].drop(
    ["h3k4me3_average", "h3k27ac_average"]
)
cutoff = 0.05

# (label, selected rows, line colour) for the two conditions being compared.
conditions = [
    (f"k27ac-k4me3>{cutoff}", df_k27ac[target_diff > cutoff], "red"),
    (f"k27ac-k4me3<-{cutoff}", df_k27ac[target_diff < -cutoff], "blue"),
]

# Map each trace label to what it plots. Keeping the pieces together avoids
# re-parsing the label with split(", "), which would break on a column name
# that itself contains ", " or leading/trailing whitespace.
trace_specs = {
    f"{target}, {label}": (target, rows, color)
    for label, rows, color in conditions
    for target in target_cols
}
combined_names = list(trace_specs)
# Sorting the labels places the two conditions of a target next to each other.
sorted_combined_names = sorted(combined_names)

fig = go.Figure()

for name in sorted_combined_names:
    target, data_frame, color = trace_specs[name]
    fig.add_trace(
        go.Violin(
            y=data_frame[target],
            name=name,
            box_visible=True,
            meanline_visible=True,
            points="all",
            line_color=color,
        )
    )

# NOTE(review): the title says "1kb" while the workbook loaded earlier in this
# notebook is named "...corr100kb.xlsx" - confirm which one is intended.
fig.update_layout(
    title_text="1kb: k27ac mean correlation to core assays",
    yaxis_title="Correlation Value",
    xaxis_title="Target",
    xaxis={"categoryorder": "array", "categoryarray": sorted_combined_names},
)
# Upper bound = largest value over all plotted columns. The pandas reductions
# skip NaN, unlike the builtin max(), whose result depends on element order
# when a NaN is present.
fig.update_yaxes(range=[-0.1, df_k27ac[target_cols].max().max()])

fig.show()