## Load data

In [None]:
import pandas as pd
# Map the report's column names (presumably a DIA-NN main report — confirm) onto
# the long-format names used throughout the rest of the notebook.
report_to_long = {
    "Protein.Group": "protein_id",
    "Run": "filename",
    "PG.MaxLFQ": "intensity",
    "Protein.Names": "protein_name",
}

df = pd.read_parquet("/Users/isabell/Documents/projects/MAPKprot/full_proteome/result.parquet")

# One row per (protein group, run): the report repeats the protein-group level
# values on several rows, so identical rows are collapsed.
df_long = (
    df[list(report_to_long)]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns=report_to_long)
)

# The sample number is the last "_"-separated token of the run name.
# NOTE(review): if any token is not numeric, `errors="coerce"` turns the whole
# column into floats and the strings become "12.0" / "nan" instead of "12" —
# verify that these still match `Sample_ID` in the annotation table.
run_suffix = df_long["filename"].str.split("_").str[-1]
df_long["filename"] = pd.to_numeric(run_suffix, errors="coerce").astype(str)
df_long

In [None]:
# Read the sample annotation sheet and reduce it to the columns used downstream.
annotation_cols = ["type", "alteration", "sex", "age", "mapk", "location"]

mapk_df = (
    pd.read_excel("/Users/isabell/Documents/projects/MAPKprot/annotation.xlsx")
    .drop(columns=["Unnamed: 0"], errors="ignore")
    # normalise headers: trim whitespace, spaces -> underscores
    .rename(columns=lambda x: x.strip().replace(" ", "_"))
    [["Sample_ID"] + annotation_cols]
    # `filename` is the join key against the intensity table (string sample id)
    .assign(filename=lambda d: d["Sample_ID"].astype(str))
    [["filename"] + annotation_cols]
)
mapk_df

In [None]:
mapk_df.type.unique()

## Load CoPro

In [None]:
import os
import random
from pathlib import Path
import warnings
import numpy as np
import pandas as pd
import scipy as sp
import anndata as ad
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl

import copro as cp

# Seed every RNG this notebook may touch implicitly. `random.seed` alone does
# not affect NumPy-based code.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Move the working directory to the parent of the notebook directory.
# The notebook directory is resolved only once per kernel: without this guard
# every re-run of the cell would climb one directory further up, and all
# relative output paths written later would silently move.
if "cwd" not in globals():
    cwd = Path('.').resolve()
root = cwd.parent
os.chdir(root)

In [None]:
adata = cp.read.proteins_long_from_df(intensities_df=df_long, filename_annotation_df=mapk_df) #, protein_annotation_df=Karayel_2020_meta_var)

In [None]:
adata.obs

In [None]:
adata = adata[~adata.obs["type"].isna(), :].copy()
adata.obs

In [None]:
adata.var.head()

In [None]:
set(adata.obs.type)

In [None]:
# Colour palette for the `type` annotation (hex RGB per tumour type).
# The insertion order of this dict is also used as the group order by the
# plotting cells below (they pass `adata.uns['colors_type'].keys()`).
adata.uns['colors_type'] = {
    #"DMG": "#377EB8",       # blue (entry disabled)
    #"DMG_MAPK": "#109ABC",  # cyan-blue (entry disabled)
    "DNT": "#C850D3",       # purple
    "EVNCYT": "#3A1C0B",    # dark brown
    "GG": "#F781BF",        # pink
    "PA_CORT": "#66C2A5",   # teal
    "PA_INF": "#117E5A",  # dark green
    "PA_MID": "#1B759E",  # steel blue
    "PA_SPINE":"#184D7E",  # dark blue
    "PXA": "#471F7F",       # dark purple
    "CN": "#E41A1C",        # red
    "MNG": "#999999",       # grey
    "EPN_PFA": "#FF7F00",   # orange
    "EPN_PFB": "#FFAA00",   # amber
 }

In [None]:
cp.pl.n_samples_by_category(
        adata, 
        category_cols='type',
        color_scheme=adata.uns['colors_type'],
        sort_by_counts=False
        )

In [None]:
cp.pl.n_detected_proteins_per_sample(
    adata,
    zero_to_na=True,
    group_by='type',
    group_by_order=adata.uns['colors_type'].keys(),
    color_scheme=adata.uns['colors_type'],
    xlabel_rotation=90,
    group_by_label_rotation=40,
    )

In [None]:
cp.pp.filter_obs_by_min_nr_var(adata, min_nr=6000)

In [None]:
cp.pl.n_detected_proteins_per_sample(
    adata,
    zero_to_na=True,
    group_by='type',
    group_by_order=adata.uns['colors_type'].keys(),
    color_scheme=adata.uns['colors_type'],
    xlabel_rotation=90,
    group_by_label_rotation=40,
    )

In [None]:
cp.pl.var_completeness(adata, zero_to_na=True)

In [None]:
cp.pp.filter_var_completeness(adata, min_fraction=1, group_by='type')

In [None]:
cp.pl.var_completeness(adata, zero_to_na=True)

In [None]:
cp.pp.filter_var_completeness(adata, min_fraction=0.33, zero_to_na=True)

In [None]:
cp.pl.var_completeness(adata, zero_to_na=True)

In [None]:
cp.pl.n_detected_proteins_per_sample(
    adata,
    zero_to_na=True,
    group_by='type',
    group_by_order=adata.uns['colors_type'].keys(),
    color_scheme=adata.uns['colors_type'],
    xlabel_rotation=90,
    group_by_label_rotation=40,
    )

In [None]:
cp.pp.calculate_groupwise_cv(
    adata,
    groupby='type'
    )

In [None]:
cp.pl.cv_distribution(
    adata,
    figsize=(6, 4),
    group_label_rotation=55,
    order=adata.uns['colors_type'].keys(),
    color_scheme=adata.uns['colors_type'],
    hline=0.5
)

In [None]:
# Log2-transform the intensity matrix, overwriting adata.X.
# NOTE: this cell is not idempotent — running it a second time applies log2 to
# already log-transformed values. Re-run the notebook from the top instead.
# NOTE(review): zero intensities become -inf here; presumably missing values
# are stored as NaN at this point — confirm before relying on the imputation.
adata.X = np.log2(adata.X)

In [None]:
cp.pl.intensity_distribution_per_obs(
    adata,
    group_by='type',
    zero_to_na=True,
    group_by_order=adata.uns['colors_type'].keys(),
    color_scheme=adata.uns['colors_type'],
    group_by_label_rotation=45,
    xlabel_rotation=55,
    show=True,
    figsize=(9, 6)
    )

In [None]:
cp.pp.median_normalize(adata)

In [None]:
cp.pl.intensity_distribution_per_obs(
    adata,
    group_by='type',
    zero_to_na=True,
    group_by_order=adata.uns['colors_type'].keys(),
    color_scheme=adata.uns['colors_type'],
    group_by_label_rotation=45,
    xlabel_rotation=55,
    show=True,
    figsize=(9, 6)
    )

In [None]:
cp.pp.impute_downshift(adata, width=0.3, downshift=1.8, random_state=123)

In [None]:
measured_n = int((adata.layers["bool_imputed"] == 0).sum())
imputed_n  = int((adata.layers["bool_imputed"] == 1).sum())
print("Measured count:", measured_n, " | Imputed count:", imputed_n)

In [None]:
# 1) Single combined histogram
cp.pl.intensity_hist_imputed(adata, density=False)

# 2) Per-sample small multiples for all samples
#cp.pl.intensity_hist_imputed(adata, per_sample=True, ncols=5, legend_loc="upper right", density=False, figsize=(17, 12))

In [None]:
# PCA on the current adata.X, followed by the scree plot.
sc.tl.pca(adata)
sc.pl.pca_variance_ratio(adata, n_pcs=50, log=True)

# Options shared by every PC1/PC2 scatter below.
pca_plot_kwargs = dict(dimensions=(0, 1), ncols=2, size=90)

# Tumour type uses the project palette ...
sc.pl.pca(
    adata,
    color='type',
    palette=adata.uns['colors_type'],
    **pca_plot_kwargs,
)

# ... every other annotation falls back to scanpy's default colours.
for obs_key in ('alteration', 'sex', 'age', 'mapk', 'location'):
    sc.pl.pca(adata, color=obs_key, **pca_plot_kwargs)

In [None]:
# Neighbour graph and UMAP embedding.
sc.pp.neighbors(adata, n_neighbors=5)
sc.tl.umap(adata)

# Tumour type uses the project palette ...
sc.pl.umap(
    adata,
    color='type',
    size=100,
    palette=adata.uns['colors_type'],
)

# ... every other annotation falls back to scanpy's default colours.
for obs_key in ('alteration', 'sex', 'age', 'mapk', 'location'):
    sc.pl.umap(adata, color=obs_key, size=100)

In [None]:
#sc.pp.combat(
#    adata, 
#    key='batch_id', 
#    #covariates='tumor_type', 
#    inplace=True
#    )

In [None]:
cp.pl.obs_correlation_matrix(
    adata,
    method="pearson",
    groupby="type",
    color_scheme=adata.uns["colors_type"],
    xticklabels=True,
    figsize=(13, 9),
)

In [None]:
cp.pl.obs_correlation_matrix(
    adata,
    method="pearson",
    groupby="mapk",
    #color_scheme=adata.uns["colors_type"],
    xticklabels=True,
    figsize=(13, 9),
)

In [None]:
# Mark the 250 most variable proteins (HVG-style)
sc.pp.highly_variable_genes(
    adata,
    n_top_genes=250,
    inplace=True
)

In [None]:
# Get those variable features
adata_hvg = adata[:, adata.var['highly_variable']].copy()

# Clustered heatmap (Scanpy style)
sc.pl.clustermap(
    adata_hvg,
    obs_keys='mapk',
    z_score=1,
    figsize=(8,6),
    dendrogram_ratio=0.15,
    show=True,
    xticklabels=False
)

In [None]:
adata_hvg.var.head()

## What's missing:

- (optional) log10 intensity vs. abundance-rank plot, with an option to highlight specific proteins (cf. Fig. EV1D)
- (optional) clustering of selected variables (e.g. most variable or differentially expressed proteins) (cf. Fig. 2B)
- (optional) plotting intensity 'profiles' for each 'cluster' (cf. Fig. 2B)
- differential analysis (t-test)
- (optional) differential analysis (anova)
- volcano plot

## Differential analysis

In [None]:
import numpy as np
import pandas as pd
from scipy import stats

def ttest_ind_all_vars(
    adata,
    group1_query: str,
    group2_query: str,
    equal_var: bool = True,
    log_transformed: bool = False,
):
    """
    Two-sample t-test (scipy.stats.ttest_ind) for every variable in adata.X,
    with fold changes and Benjamini–Hochberg adjusted p-values.

    Parameters
    ----------
    adata
        Object exposing `.obs` (DataFrame), `.X` (n_obs × n_vars, dense or with
        a `.toarray()` method) and `.var_names`.
    group1_query, group2_query : str
        `DataFrame.query` expressions evaluated on `adata.obs` that select the
        observations of group 1 and group 2.
    equal_var : bool
        Forwarded to `scipy.stats.ttest_ind` (False = Welch's t-test).
    log_transformed : bool
        If False (default, original behaviour) the fold change is
        `log2(mean1 / mean2)`, which is only meaningful for linear-scale data.
        If True, `adata.X` is taken to hold log2 values already and the fold
        change is the difference of means, `mean1 - mean2`.

    Raises
    ------
    ValueError
        If a query selects no observations, if either group contains NA
        values, or (linear mode only) if a group mean is exactly zero.

    Returns
    -------
    pandas.DataFrame
        Indexed by `adata.var_names` with columns
        mean1, mean2, log2FC, t_stat, p_val, p_adj.
        Variables whose p-value is undefined (e.g. constant in both groups)
        keep NaN in `p_val` and `p_adj` and are not counted as tests in the
        BH correction.
    """

    # select obs indices
    idx1 = adata.obs.query(group1_query).index
    idx2 = adata.obs.query(group2_query).index

    # An empty group would otherwise only surface as an all-NaN result table.
    if len(idx1) == 0:
        raise ValueError(f"No observations match group1_query: {group1_query!r}")
    if len(idx2) == 0:
        raise ValueError(f"No observations match group2_query: {group2_query!r}")

    X = adata.X

    # slice rows for each group
    X1 = X[adata.obs.index.isin(idx1), :]
    X2 = X[adata.obs.index.isin(idx2), :]

    # convert to dense if sparse
    if not isinstance(X1, np.ndarray):
        X1 = X1.toarray()
    if not isinstance(X2, np.ndarray):
        X2 = X2.toarray()

    # strict NA validation
    if np.isnan(X1).any() or np.isnan(X2).any():
        raise ValueError("Input expression matrix contains NA values — clean your data first.")

    # compute means
    mean1 = X1.mean(axis=0)
    mean2 = X2.mean(axis=0)

    if log_transformed:
        # values are log2 already: a ratio on the linear scale is a difference here
        log2fc = mean1 - mean2
    else:
        # check for zeros that would break log2 fold-change
        if np.any(mean1 == 0) or np.any(mean2 == 0):
            raise ValueError("Zero mean detected in one of the groups; log2 fold change undefined.")
        log2fc = np.log2(mean1 / mean2)

    # t-test
    t_stats, p_vals = stats.ttest_ind(
        X1, X2,
        axis=0,
        equal_var=equal_var
    )
    p_vals = np.asarray(p_vals, dtype=float)

    # BH-FDR (Benjamini–Hochberg), computed over the finite p-values only.
    # A single NaN would otherwise propagate through the cumulative minimum
    # and turn every adjusted p-value into NaN.
    p_adj = np.full(p_vals.shape, np.nan, dtype=float)
    finite = np.isfinite(p_vals)
    n_tests = int(finite.sum())
    if n_tests > 0:
        p_ok = p_vals[finite]
        order = np.argsort(p_ok)
        ranks = np.arange(1, n_tests + 1, dtype=float)
        scaled = (p_ok[order] * n_tests) / ranks
        # enforce monotonicity from the largest p-value downwards
        scaled = np.minimum.accumulate(scaled[::-1])[::-1]
        adjusted = np.empty(n_tests, dtype=float)
        adjusted[order] = scaled
        p_adj[finite] = adjusted

    return pd.DataFrame({
        "mean1": mean1,
        "mean2": mean2,
        "log2FC": log2fc,
        "t_stat": t_stats,
        "p_val": p_vals,
        "p_adj": p_adj,
    }, index=adata.var_names)


In [None]:
import numpy as np
import matplotlib.pyplot as plt

def volcano_plot(
    res_df,
    fc_col: str = "log2FC",
    p_col: str = "p_adj",   # or "p_val" if you prefer
    fc_thresh: float = 0.3,
    p_thresh: float = 0.01,
    top_labels: int = 10,
    label_col: str | None = None,
    title: str | None = None,
    ax=None,
):
    """
    Volcano plot (log2 fold change vs. -log10 p) of a differential-analysis table.

    Parameters
    ----------
    res_df : pandas.DataFrame
        One row per feature, e.g. the result of ttest_ind_all_vars.
    fc_col : str
        Column holding the log2 fold change (x axis).
    p_col : str
        Column holding the p-values (y axis), e.g. 'p_adj' or 'p_val'.
    fc_thresh : float
        Absolute log2FC a point must reach to count as significant.
    p_thresh : float
        Largest p-value a point may have to count as significant.
    top_labels : int
        Number of points with the smallest p-values to annotate (0 = none).
    label_col : str or None
        Column providing the label text; the index is used when None.
    title : str or None
        Plot title; a default describing the thresholds is used when None.
    ax : matplotlib Axes, optional
        Axes to draw into. A new 6×5 inch figure is created when None.

    Returns
    -------
    matplotlib Axes
        The axes that were drawn into.
    """

    if ax is None:
        _, ax = plt.subplots(figsize=(6, 5))

    # extract values
    log2fc = res_df[fc_col].values
    pvals = res_df[p_col].values

    # -log10 p; p-values of exactly 0 are floored to a tiny positive number so
    # they plot at a large finite height instead of +inf (NaN stays NaN)
    neg_log10_p = -np.log10(np.clip(pvals.astype(float), 1e-300, None))

    # significance masks (based on the unmodified p-values)
    sig_mask = (np.abs(log2fc) >= fc_thresh) & (pvals <= p_thresh)
    up_mask = sig_mask & (log2fc > 0)
    down_mask = sig_mask & (log2fc < 0)
    non_sig_mask = ~sig_mask

    # scatter plot: one layer per class so the legend gets one entry each
    ax.scatter(
        log2fc[non_sig_mask],
        neg_log10_p[non_sig_mask],
        s=10,
        alpha=0.5,
        label="n.s.",
    )
    ax.scatter(
        log2fc[up_mask],
        neg_log10_p[up_mask],
        s=12,
        alpha=0.8,
        label="up",
    )
    ax.scatter(
        log2fc[down_mask],
        neg_log10_p[down_mask],
        s=12,
        alpha=0.8,
        label="down",
    )

    # threshold lines
    ax.axvline(fc_thresh, color="grey", linestyle="--", linewidth=1)
    ax.axvline(-fc_thresh, color="grey", linestyle="--", linewidth=1)
    ax.axhline(-np.log10(p_thresh), color="grey", linestyle="--", linewidth=1)

    # axis labels
    ax.set_xlabel("log2 fold change")
    ax.set_ylabel(f"-log10({p_col})")

    if title is None:
        title = f"Volcano plot ({p_col}, |log2FC| ≥ {fc_thresh}, p ≤ {p_thresh})"
    ax.set_title(title)

    # label top hits
    if top_labels > 0:
        # argsort places NaN p-values last, so real hits are labelled first
        order = np.argsort(pvals)
        top_idx = order[:top_labels]

        if label_col is None:
            labels = res_df.index.to_numpy()
        else:
            labels = res_df[label_col].to_numpy()

        for i in top_idx:
            # a point without finite coordinates cannot be annotated
            if not (np.isfinite(log2fc[i]) and np.isfinite(neg_log10_p[i])):
                continue
            ax.text(
                log2fc[i],
                neg_log10_p[i],
                str(labels[i]),
                fontsize=8,
                ha="center",
                va="bottom",
            )

    ax.legend(frameon=False)
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)

    return ax


In [None]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

def group_difference_with_covariates(
    adata,
    group: str,
    covariates: list,
    strict_na: bool = True,
):
    """
    Fit, for every variable in adata.X, the OLS model
    `y ~ C(group) + covariates` and report the group effect.

    Parameters
    ----------
    adata : AnnData
        Expression matrix (n_obs × n_vars) with annotations in `adata.obs`.
    group : str
        Column name in adata.obs defining the main group (tested effect).
    covariates : list[str]
        Additional covariates to adjust for. Columns of object/categorical
        dtype enter the formula as `C(col)`, all others as numeric terms.
    strict_na : bool
        If True, raise on NA values in the expression matrix.

    Returns
    -------
    DataFrame
        index = var_names, columns = ['coef', 'p_val', 'p_adj'].
        `coef` / `p_val` belong to the FIRST group term of the fitted model,
        i.e. one non-reference level against the reference level chosen by the
        formula engine. This is a complete test only for a two-level group; a
        warning is issued when more levels are present.
        `p_adj` is the Benjamini–Hochberg adjustment over all finite p-values.

    Raises
    ------
    ValueError
        On NA expression values (strict_na) or when the fitted model contains
        no term for `group` (fewer than two levels).

    Notes
    -----
    Missing annotation values are kept as missing (not converted to the string
    "nan"), so that they are handled by the formula engine's missing-data
    policy instead of forming a spurious extra group level.
    """
    import warnings

    # Build data frame with covariates
    meta = adata.obs[[group] + covariates].copy()

    # Categorical columns -> plain objects. `astype(str)` would turn missing
    # entries into a literal "nan" level that could even become the reference.
    for col in meta:
        if meta[col].dtype.name == "category":
            meta[col] = meta[col].astype(object)

    n_levels = meta[group].nunique(dropna=True)
    if n_levels > 2:
        warnings.warn(
            f"'{group}' has {n_levels} levels; only the first group coefficient "
            "of each model is reported, which is a single contrast against the "
            "reference level. Subset to two levels for a two-group comparison.",
            stacklevel=2,
        )

    X = adata.X
    if not isinstance(X, np.ndarray):
        X = X.toarray()

    if strict_na and np.isnan(X).any():
        raise ValueError("Expression matrix contains NA values.")

    results = []

    # Build formula string (identical for every variable)
    rhs = " + ".join(
        [f"C({group})"] +
        [f"C({c})" if meta[c].dtype == object else c for c in covariates]
    )
    formula = f"y ~ {rhs}"
    group_prefix = f"C({group})"

    # Loop over variables
    for idx, var_name in enumerate(adata.var_names):
        meta_local = meta.assign(y=X[:, idx])

        model = smf.ols(formula, data=meta_local).fit()

        # extract the main group comparison
        group_terms = [t for t in model.params.index if t.startswith(group_prefix)]

        if len(group_terms) == 0:
            raise ValueError("Group variable must have at least 2 levels.")

        # For a binary group this is the single group coefficient
        term = group_terms[0]
        results.append((var_name, model.params[term], model.pvalues[term]))

    df = pd.DataFrame(results, columns=["var", "coef", "p_val"]).set_index("var")

    # BH correction over finite p-values only; a NaN p-value would otherwise
    # propagate through the cumulative minimum into every adjusted value.
    p = np.asarray(df["p_val"], dtype=float)
    p_adj = np.full(p.shape, np.nan, dtype=float)
    finite = np.isfinite(p)
    n_tests = int(finite.sum())
    if n_tests > 0:
        p_ok = p[finite]
        order = np.argsort(p_ok)
        scaled = (p_ok[order] * n_tests) / np.arange(1, n_tests + 1, dtype=float)
        scaled = np.minimum.accumulate(scaled[::-1])[::-1]
        adjusted = np.empty(n_tests, dtype=float)
        adjusted[order] = scaled
        p_adj[finite] = adjusted
    df["p_adj"] = p_adj

    return df


In [None]:
import numpy as np
import matplotlib.pyplot as plt

def volcano_plot(res, covar_threshold=0.05, p_adj_threshold=0.05, log2fc_threshold=0.3):
    """
    Volcano plot of `log2FC` vs. -log10(`p_adj`), highlighting features that
    are also significant in the covariate-adjusted model.

    Parameters
    ----------
    res : pandas.DataFrame
        Needs columns 'log2FC' and 'p_adj'. If a column 'covar_p_adj' is
        present, points with covar_p_adj <= covar_threshold are drawn in
        crimson; without that column (plain t-test results) all points are
        drawn in grey instead of failing with a KeyError.
    covar_threshold : float
        Cut-off applied to 'covar_p_adj' for highlighting.
    p_adj_threshold : float
        Position of the horizontal guide line (drawn at -log10 of this value).
    log2fc_threshold : float
        Position of the two vertical guide lines (± this value).

    Returns
    -------
    None
        The figure is shown with `plt.show()`.
    """
    # Copy to avoid modifying original
    df = res.copy()

    # Avoid log10(0) → inf: replace exact zeros by the smallest non-zero
    # p-value, or by a tiny constant if there is none.
    tiny = 1e-300
    p = df["p_adj"].replace(0, np.nan)
    min_nonzero = p[p > 0].min()
    df["p_adj_safe"] = df["p_adj"].replace(0, min_nonzero if not np.isnan(min_nonzero) else tiny)

    # Compute -log10(p_adj)
    df["neg_log10_p"] = -np.log10(df["p_adj_safe"])

    # Boolean mask for covariate-significant points
    has_covar = "covar_p_adj" in df.columns
    if has_covar:
        sig = (df["covar_p_adj"] <= covar_threshold).to_numpy()
        background_label = f"covar_p_adj > {covar_threshold}"
    else:
        sig = np.zeros(len(df), dtype=bool)
        background_label = "all features"

    fig, ax = plt.subplots(figsize=(6, 5))

    # Non-significant points
    ax.scatter(
        df.loc[~sig, "log2FC"],
        df.loc[~sig, "neg_log10_p"],
        s=10,
        color="lightgray",
        alpha=0.7,
        label=background_label,
    )

    # Significant points (only meaningful when the covariate model was merged in)
    if has_covar:
        ax.scatter(
            df.loc[sig, "log2FC"],
            df.loc[sig, "neg_log10_p"],
            s=10,
            color="crimson",
            alpha=0.8,
            label=f"covar_p_adj ≤ {covar_threshold}",
        )

    ax.axhline(-np.log10(p_adj_threshold), linestyle="--", color="black", linewidth=1, alpha=0.5)
    ax.axvline(-log2fc_threshold, linestyle="--", color="black", linewidth=1, alpha=0.5)
    ax.axvline(log2fc_threshold, linestyle="--", color="black", linewidth=1, alpha=0.5)

    ax.set_xlabel("log2 fold change")
    ax.set_ylabel("-log10(p_adj)")
    ax.set_title("Volcano plot")
    ax.legend(frameon=False)
    fig.tight_layout()
    plt.show()


In [None]:
res_covariates = group_difference_with_covariates(
    adata,
    group="sex",
    covariates=["type","age"],
)
res_ttest = ttest_ind_all_vars(
    adata,
    group1_query='sex == "Female"',
    group2_query='sex == "Male"'
)

res_covariates = res_covariates.add_prefix("covar_")
res_merged = res_ttest.join(res_covariates, how="inner")
res_merged.head()

In [None]:
volcano_plot(res_merged, covar_threshold=0.01)

In [None]:
res_covariates = group_difference_with_covariates(
    adata,
    group="mapk",
    covariates=["age","sex"], # "age", "type"
)
res_ttest = ttest_ind_all_vars(
    adata,
    group1_query='mapk == "positive"',
    group2_query='mapk == "negative"'
)

res_covariates = res_covariates.add_prefix("covar_")
res_merged = res_ttest.join(res_covariates, how="inner")
res_merged.head()

In [None]:
mapk_all = res_merged.index.tolist()
with open("mapk_all.txt", "w") as f:
    for item in mapk_all:
        f.write(str(item) + "\n")

mapk_up = res_merged[(res_merged["p_adj"] < 0.05) & (res_merged["log2FC"] > 0.3)].index.tolist()
with open("mapk_up.txt", "w") as f:
    for item in mapk_up:
        f.write(str(item) + "\n")

mapk_down = res_merged[(res_merged["p_adj"] < 0.05) & (res_merged["log2FC"] < -0.3)].index.tolist()
with open("mapk_down.txt", "w") as f:
    for item in mapk_down:
        f.write(str(item) + "\n")


In [None]:
volcano_plot(res_merged, covar_threshold=0.05)

In [None]:
adata_sub = adata[adata.obs["type"].isin(["PA_SPINE", "PA_INF","PA_CORT", "PA_MID"]), :].copy()

res_covariates = group_difference_with_covariates(
    adata_sub,
    group="location",
    covariates=["sex","age"],
)
res_ttest = ttest_ind_all_vars(
    adata_sub,
    group1_query='location == "Spinal"',
    group2_query='location == "Posterior fossa"'
)

res_covariates = res_covariates.add_prefix("covar_")
res_merged = res_ttest.join(res_covariates, how="inner")
res_merged.sort_values("p_adj").head()

In [None]:
volcano_plot(res_merged, covar_threshold=0.05)

In [None]:
adata_sub = adata[adata.obs["alteration"].isin(["BRAF","KIAA1549::BRAF"]), :].copy()

res_covariates = group_difference_with_covariates(
    adata_sub,
    group="alteration",
    covariates=["sex","age"], #,"type"],
)
res_ttest = ttest_ind_all_vars(
    adata_sub,
    group1_query='alteration == "BRAF"',
    group2_query='alteration == "KIAA1549::BRAF"'
)

res_covariates = res_covariates.add_prefix("covar_")
res_merged = res_ttest.join(res_covariates, how="inner")
res_merged.sort_values("p_adj").head()

In [None]:
res_merged[(res_merged["p_adj"] < 0.05) & (np.abs(res_merged["log2FC"]) > 0.3)].sort_values("covar_p_adj").head()

In [None]:
volcano_plot(res_merged, covar_threshold=0.01)

In [None]:
braf_up = res_merged[(res_merged["p_adj"] < 0.05) & (res_merged["log2FC"] > 0.3)].index.tolist()
with open("braf_up.txt", "w") as f:
    for item in braf_up:
        f.write(str(item) + "\n")

braf_down = res_merged[(res_merged["p_adj"] < 0.05) & (res_merged["log2FC"] < -0.3)].index.tolist()
with open("braf_down.txt", "w") as f:
    for item in braf_down:
        f.write(str(item) + "\n")


In [None]:
#! pip install gseapy

In [None]:
import gseapy as gp

# Download KEGG gene sets (one-time)
lib = gp.get_library(name="KEGG_2016", organism="Human")

# Pick the MAPK pathway
# (exact key name may vary slightly; print(list(lib.keys())[:20]) to inspect)
mapk_key = "MAPK signaling pathway Homo sapiens hsa04010"
mapk_genes = lib[mapk_key]

# Wrap in dict for ssGSEA
gene_sets = {mapk_key: mapk_genes}


In [None]:
import mygene
import pandas as pd

mg = mygene.MyGeneInfo()

def symbols_to_uniprot(gene_symbols, species="human"):
    """
    Convert gene symbols to UniProt accessions via `mg.querymany`.

    Parameters
    ----------
    gene_symbols : iterable of str
        Gene symbols to look up.
    species : str
        Species passed through to the query (default "human").

    Returns
    -------
    list[str]
        Sorted, de-duplicated accessions collected from the 'Swiss-Prot' and
        'TrEMBL' entries of every hit. Symbols that were not found or that
        carry no UniProt annotation contribute nothing; the result is an empty
        list if nothing could be mapped.
    """
    gene_symbols = list(gene_symbols)
    if not gene_symbols:
        return []

    hits = mg.querymany(
        gene_symbols,
        scopes="symbol",
        fields="uniprot.Swiss-Prot,uniprot.TrEMBL",
        species=species
    )

    def normalize_uniprot(u):
        """Flatten one 'uniprot' field (str, dict, list or missing) to a list of str."""
        if isinstance(u, str):
            return [u]
        if isinstance(u, dict):
            vals = []
            for key in ("Swiss-Prot", "TrEMBL"):
                v = u.get(key)
                if isinstance(v, str):
                    vals.append(v)
                elif isinstance(v, list):
                    vals.extend(x for x in v if isinstance(x, str))
            return vals
        if isinstance(u, list):
            return [x for x in u if isinstance(x, str)]
        # anything else (None, NaN, ...)
        return []

    # Walk the hit records directly: no reliance on a 'uniprot' column existing
    # or on pandas turning a True/NaN 'notfound' column into booleans.
    accessions = set()
    for hit in hits:
        if not isinstance(hit, dict):
            continue
        # unmapped symbols are reported with notfound=True
        if hit.get("notfound") is True:
            continue
        accessions.update(normalize_uniprot(hit.get("uniprot")))

    return sorted(accessions)


In [None]:
mapk_genes = symbols_to_uniprot(gene_sets[mapk_key])

In [None]:
mapk_genes

In [None]:
# mapk_uniprot is your UniProt list from symbols_to_uniprot(...)
# make sure they exist in adata
mapk_genes_in_data = sorted(set(mapk_genes).intersection(set(adata.var_names)))
print(f"{len(mapk_genes_in_data)} MAPK genes overlap with adata features.")

gene_sets = {"MAPK": mapk_genes_in_data}


In [None]:
import gseapy as gp

# expression matrix: samples x features
expr = adata.to_df()      # uses .X by default
# transpose to features x samples for gseapy
expr_t = expr.T

ss = gp.ssgsea(
    data=expr_t,
    gene_sets=gene_sets,   # {"MAPK": [UniProt IDs]}
    outdir=None,           # no output to disk
    sample_norm=True,
    min_size=1,            # or 5, but 1 is safer if overlap is small
    max_size=5000,
    permutation_num=0      # standard for ssGSEA
)

scores_df = ss.res2d       # rows = pathways, cols = samples


In [None]:
scores_df

In [None]:
# 1. Make a mapping: filename → NES
nes_map = dict(zip(scores_df["Name"], scores_df["NES"]))

# 2. Map it into adata.obs
adata.obs["MAPK_ssGSEA"] = adata.obs["filename"].map(nes_map)


In [None]:
adata.obs

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Use a dedicated name: `df` already holds the raw report loaded in the first
# cell, and rebinding it here would break a later re-run of that cell.
obs_scores = adata.obs

# ssGSEA score per sample, split by annotated MAPK status.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=obs_scores, x="mapk", y="MAPK_ssGSEA", ax=ax)
sns.stripplot(data=obs_scores, x="mapk", y="MAPK_ssGSEA", color="black", alpha=0.5, ax=ax)

ax.set_title("MAPK ssGSEA scores by mapk")
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Work on a copy under its own name (`df` is the raw report from the first cell).
obs_by_type = adata.obs.copy()

# ---- 1) Order and colours come from the palette stored in adata.uns ----
type_to_color = dict(adata.uns["colors_type"])
type_order = list(type_to_color)

# ---- 2) Sort samples by type, in palette order ----
obs_by_type["type"] = pd.Categorical(obs_by_type["type"], categories=type_order, ordered=True)
obs_by_type = obs_by_type.sort_values("type")

# ---- 3) One colour per sample; types missing from the palette become grey
#         instead of producing NaN colours ----
sample_colors = [type_to_color.get(t, "lightgray") for t in obs_by_type["type"]]

# ---- 4) Bar plot (each bar = one sample) ----
fig, ax = plt.subplots(figsize=(14, 5))
ax.bar(
    x=obs_by_type.index,
    height=obs_by_type["MAPK_ssGSEA"],
    color=sample_colors,
)

ax.tick_params(axis="x", rotation=90)
ax.set_ylabel("MAPK ssGSEA")
ax.set_xlabel("Samples")
ax.set_title("MAPK ssGSEA per sample grouped by type")
fig.tight_layout()

# Legend built by hand: one patch per tumour type
handles = [plt.Rectangle((0, 0), 1, 1, color=type_to_color[t]) for t in type_order]
ax.legend(handles, type_order, title="Type", bbox_to_anchor=(1.05, 1), loc="upper left")

plt.show()


In [None]:
from scipy.stats import ranksums

group1 = adata.obs.loc[adata.obs["mapk"] == "positive", "MAPK_ssGSEA"]
group2 = adata.obs.loc[adata.obs["mapk"] == "negative", "MAPK_ssGSEA"]

stat, pval = ranksums(group1, group2)
print("Wilcoxon rank-sum p-value:", pval)


In [None]:
res = ttest_ind_all_vars(
    adata,
    group1_query='mapk == "positive"',
    group2_query='mapk == "negative"'
)

res[(res["p_adj"] < 0.05) & (abs(res["log2FC"]) > 0.4)] #.sort_values("log2FC", ascending=False).head(10)


In [None]:
volcano_plot(res)

In [None]:
res.loc["A0A024RBG1"]

In [None]:
sp.stats.ttest_ind(
    adata[adata.obs["type"] == "MNG", "A0A024RBG1"].X.toarray().flatten(),
    adata[adata.obs["type"] == "GG", "A0A024RBG1"].X.toarray().flatten(),
    #equal_var=False,
    nan_policy='omit'
    )

In [None]:
adata.obs

In [None]:
res = group_difference_with_covariates(
    adata,
    group="sex",
    covariates=["type","age"],
)

res.sort_values("p_adj").head()


In [None]:
res[res["p_adj"] < 0.05]
# all 3 on y chromosome

In [None]:
# Keep only samples with a definite MAPK status. The labels must match the
# ones used by the other mapk comparisons in this notebook
# ("positive" / "negative"); filtering on "pos" / "neg" selects no samples.
adata_sub = adata[adata.obs["mapk"].isin(["positive", "negative"])].copy()

res = group_difference_with_covariates(
    adata_sub,
    group="mapk",
    covariates=["type","age","sex"],
)

res.sort_values("p_adj").head()

In [None]:
res[res["p_adj"] < 0.01]

In [None]:
import numpy as np
import pandas as pd
from scipy import stats

def anova_all_vars(
    adata,
    groupby: str,
    groups=None,
    strict_na: bool = True,
):
    """
    One-way ANOVA for all variables in `adata.X` across groups defined in `adata.obs[groupby]`.

    Parameters
    ----------
    adata : AnnData
        Object exposing `.obs`, `.X` (n_obs × n_vars, dense or with a
        `.toarray()` method) and `.var_names`.
    groupby : str
        Column name in adata.obs defining the groups (e.g. 'cell_type').
    groups : list-like, optional
        Specific group labels to include (subset/order). By default all labels
        are used in order of first appearance; observations with a missing
        label are ignored.
    strict_na : bool, default True
        If True, raise ValueError if any NA is present in the data.

    Returns
    -------
    pandas.DataFrame
        Index: adata.var_names
        Columns:
            - F        : F-statistic (NaN where the within-group variance is 0)
            - p_val    : raw p-value (NaN where F is NaN)
            - p_adj    : BH-FDR over the finite p-values (NaN where p_val is NaN)
            - mean_<g> : mean expression in group g for each variable

    Raises
    ------
    ValueError
        If fewer than two groups are available, a requested group has no
        observations, or (strict_na) a group contains NA values.
    """

    obs_col = adata.obs[groupby]

    if groups is None:
        # preserve order of appearance; a missing label is not a group
        groups = pd.unique(obs_col.dropna())
    else:
        groups = pd.Index(groups)

    if len(groups) < 2:
        raise ValueError(
            f"ANOVA needs at least 2 groups, found {len(groups)} in column '{groupby}'."
        )

    # collect group sizes, means and within-group sums of squares
    X = adata.X
    group_sizes = []
    group_means = []
    group_ss_within = []

    for g in groups:
        idx = (obs_col == g).values
        if not np.any(idx):
            raise ValueError(f"No observations found for group '{g}' in column '{groupby}'.")

        Xg = X[idx, :]
        # to dense if sparse
        if not isinstance(Xg, np.ndarray):
            Xg = Xg.toarray()

        if strict_na and np.isnan(Xg).any():
            raise ValueError(
                f"Input expression matrix for group '{g}' contains NA values — clean your data first."
            )

        mean_g = Xg.mean(axis=0)

        group_sizes.append(Xg.shape[0])
        group_means.append(mean_g)
        # within-group SS: sum (x_ig - mean_g)^2
        group_ss_within.append(((Xg - mean_g) ** 2).sum(axis=0))

    group_sizes = np.array(group_sizes)[:, None]   # shape (k, 1)
    group_means = np.vstack(group_means)          # shape (k, n_vars)
    group_ss_within = np.vstack(group_ss_within)  # shape (k, n_vars)

    # total N and grand mean per variable
    N_total = group_sizes.sum(axis=0)[0]
    grand_mean = (group_sizes * group_means).sum(axis=0) / N_total

    # SS_between and SS_within
    ss_between = (group_sizes * (group_means - grand_mean) ** 2).sum(axis=0)
    ss_within = group_ss_within.sum(axis=0)

    k = len(groups)
    df_between = k - 1
    df_within = N_total - k

    # mean squares and F; where the ratio is undefined (zero within-group
    # variance or no residual degrees of freedom) F and p become NaN
    with np.errstate(divide="ignore", invalid="ignore"):
        ms_between = ss_between / df_between
        ms_within = ss_within / df_within
        F = ms_between / ms_within
    F[~np.isfinite(F)] = np.nan

    # p-values from F-distribution
    p_vals = np.asarray(stats.f.sf(F, df_between, df_within), dtype=float)
    p_vals[np.isnan(F)] = np.nan

    # Benjamini–Hochberg FDR over the finite p-values only: undefined tests
    # neither receive an adjusted value nor inflate the number of tests.
    p_adj = np.full(p_vals.shape, np.nan, dtype=float)
    finite = np.isfinite(p_vals)
    n_tests = int(finite.sum())
    if n_tests > 0:
        p_ok = p_vals[finite]
        order = np.argsort(p_ok)
        ranks = np.arange(1, n_tests + 1, dtype=float)
        scaled = (p_ok[order] * n_tests) / ranks
        scaled = np.minimum.accumulate(scaled[::-1])[::-1]
        adjusted = np.empty(n_tests, dtype=float)
        adjusted[order] = scaled
        p_adj[finite] = adjusted

    # build result DataFrame
    data = {
        "F": F,
        "p_val": p_vals,
        "p_adj": p_adj,
    }

    # add per-group means: mean_<group>
    for i, g in enumerate(groups):
        data[f"mean_{g}"] = group_means[i, :]

    res = pd.DataFrame(data, index=adata.var_names)

    return res


In [None]:
# One-way ANOVA across tumour types. The annotation column is named "type"
# (lower case), as in every other cell of this notebook.
res_anova = anova_all_vars(
    adata,
    groupby="type",
    # optionally restrict / order the groups: groups=["GG", "MNG", "PXA"]
)

# top ANOVA hits
res_anova.sort_values("p_adj").head()


In [None]:
sp.stats.ttest_ind(
    adata[adata.obs["cell_type"] == "Ortho", "A0A024RBG1"].X.toarray().flatten(),
    adata[adata.obs["cell_type"] == "Progenitor", "A0A024RBG1"].X.toarray().flatten(),
    equal_var=False,
    nan_policy='omit'
    )

In [None]:
# NOTE(review): `ttest_groups_equalvar` is not defined in the visible part of
# this notebook, and the annotation loaded above provides the columns
# type / alteration / sex / age / mapk / location — no "cell_type" with
# "Progenitor" / "Ortho" levels. Presumably a leftover from another dataset;
# confirm, then remove the cell or port it to `ttest_ind_all_vars`.
ttest_groups_equalvar(adata, groupby="cell_type", group1="Progenitor", group2="Ortho", layer=None)

In [None]:
# scanpy t-test of group "Ortho" against reference "Progenitor", with
# Benjamini–Hochberg correction; results are stored in adata.uns['rank_genes_groups'].
# NOTE(review): the annotation loaded in this notebook has no "cell_type"
# column (columns are type / alteration / sex / age / mapk / location), so this
# cell presumably belongs to a different dataset — confirm before running.
sc.tl.rank_genes_groups(
        adata,
        groupby="cell_type",
        groups=["Ortho"],
        reference="Progenitor",   
        method="t-test",
        corr_method="benjamini-hochberg",
        n_jobs=1
    )

In [None]:
adata.uns['rank_genes_groups']

In [None]:
diff_Ortho_Progenitor = sc.get.rank_genes_groups_df(adata, group="Ortho", gene_symbols="gene_name")
diff_Ortho_Progenitor.sort_values(by="logfoldchanges", ascending=False)

In [None]:
diff_Ortho_Progenitor[diff_Ortho_Progenitor["names"].str.contains("P02730", case=False, na=False)]

In [None]:
diff_Ortho_Progenitor[diff_Ortho_Progenitor["names"].str.contains("Q08495", case=False, na=False)]

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def volcano_plot(
    res: pd.DataFrame,
    fc_col: str = "log2fc",
    p_col: str = "qval",          # will fall back to 'pval' if missing
    fc_thresh: float = 1.0,       # |log2FC| cutoff
    p_thresh: float = 0.05,       # FDR (or p) cutoff
    label_top: int = 10,          # annotate top N most significant hits
    title: str | None = None,
    ax: bool = False,
    show: bool = True,
    save: str | None = None,
):
    """
    Simple volcano plot:
      x = log2 fold-change
      y = -log10(q-value) (or p-value if qval not present)

    Expects columns:
      - fc_col (default 'log2fc')
      - p_col (default 'qval', falls back to 'pval' if not found)
      - 'protein_id' for labels (falls back to 'names' or index)
    """
    df = res.copy()

    # choose p column
    if p_col not in df.columns:
        if "pval" in df.columns:
            p_col = "pval"
        else:
            raise ValueError(f"Neither '{p_col}' nor 'pval' found in columns: {list(df.columns)}")

    # coerce numeric and clean
    df[fc_col] = pd.to_numeric(df[fc_col], errors="coerce")
    df[p_col]  = pd.to_numeric(df[p_col],  errors="coerce")

    # guard against zeros -> inf
    eps = 1e-300
    p_vals = df[p_col].clip(lower=eps)
    df["_mlog10p"] = -np.log10(p_vals)

    # masks
    sig_mask  = (p_vals <= p_thresh) & (df[fc_col].abs() >= fc_thresh)
    up_mask   = sig_mask & (df[fc_col] > 0)
    down_mask = sig_mask & (df[fc_col] < 0)
    ns_mask   = ~sig_mask

    # axis
    fig, ax = plt.subplots(figsize=(6, 5))

    # scatter (no explicit colors -> matplotlib defaults)
    ax.scatter(df.loc[ns_mask, fc_col],   df.loc[ns_mask, "_mlog10p"], s=10, alpha=0.3,c='#B2AEA9')
    ax.scatter(df.loc[up_mask, fc_col],   df.loc[up_mask, "_mlog10p"], s=14, alpha=0.4, c='#FF4C00')
    ax.scatter(df.loc[down_mask, fc_col], df.loc[down_mask, "_mlog10p"], s=14, alpha=0.4, c='#466CE7')

    # thresholds
    ax.axvline(+fc_thresh, linestyle="--", linewidth=1, c='gray')
    ax.axvline(-fc_thresh, linestyle="--", linewidth=1, c='gray')
    ax.axhline(-np.log10(p_thresh), linestyle="--", linewidth=1, c='gray')

    # labels & title
    ax.set_xlabel("log2 fold change")
    ax.set_ylabel("-log10(p)")
    if title:
        ax.set_title(title)

    # annotate top hits by significance
    name_col = "gene_name" if "gene_name" in df.columns else ("names" if "names" in df.columns else None)
    if label_top and name_col:
        top = df.loc[sig_mask].sort_values(by=[p_col, fc_col], ascending=[True, False]).head(label_top)
        for _, r in top.iterrows():
            ax.annotate(
                str(r[name_col]),
                (r[fc_col], r["_mlog10p"]),
                xytext=(3, 3),
                textcoords="offset points",
                fontsize=8
            )

    # make x-lims symmetric around 0 for a nicer look
    xmax = np.nanpercentile(df[fc_col].abs(), 99)
    if np.isfinite(xmax) and xmax > 0:
        ax.set_xlim(-xmax, xmax)

    plt.tight_layout()
    if save:
        plt.savefig(save, dpi=300, bbox_inches="tight")
    if show:
        plt.show()
    if ax:
        return ax


In [None]:
# Volcano plot of the scanpy Ortho-vs-Progenitor table: adjusted p-values on the y axis,
# hits defined as adjusted p <= 0.01 and |log fold change| >= 1, up to 50 hits labelled.
volcano_plot(
    diff_Ortho_Progenitor,
    fc_col="logfoldchanges",
    p_col="pvals_adj",
    fc_thresh=1.0,
    p_thresh=0.01,
    label_top=50,
    title="Ortho vs Progenitor Volcano Plot",
)

In [None]:
# Proteins higher in Ortho: adjusted p < 0.01 and log fold change > 1. For each hit only
# the first ';'-separated identifier is kept, stripped of surrounding whitespace.
prot_diff_Ortho_Progenitor_up = (
    diff_Ortho_Progenitor
    .query("pvals_adj < 0.01 and logfoldchanges > 1")["names"]
    .str.split(";").str[0]
    .str.strip()
    .tolist()
)
# Export the identifiers as a one-column CSV and report how many there are.
pd.Series(prot_diff_Ortho_Progenitor_up).to_csv("/Users/isabell/Documents/projects/protypy/Karayel2020/prot_diff_Ortho_Progenitor_up.csv", index=False)
len(prot_diff_Ortho_Progenitor_up)

In [None]:
# Proteins lower in Ortho: adjusted p < 0.01 and log fold change < -1. For each hit only
# the first ';'-separated identifier is kept, stripped of surrounding whitespace.
prot_diff_Ortho_Progenitor_down = (
    diff_Ortho_Progenitor
    .query("pvals_adj < 0.01 and logfoldchanges < -1")["names"]
    .str.split(";").str[0]
    .str.strip()
    .tolist()
)
# Export the identifiers as a one-column CSV and report how many there are.
pd.Series(prot_diff_Ortho_Progenitor_down).to_csv("/Users/isabell/Documents/projects/protypy/Karayel2020/prot_diff_Ortho_Progenitor_down.csv", index=False)
len(prot_diff_Ortho_Progenitor_down)

## ANOVA

In [None]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.multitest import multipletests
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import scanpy as sc

def anova_statsmodels(adata, groupby, covariates=None, layer=None, typ=2, q_thresh=0.05):
    """
    One-way (or ANCOVA) per-feature ANOVA using statsmodels, followed by post-hoc
    Tukey HSD on the features that pass the FDR threshold.

    Parameters
    ----------
    adata : AnnData-like object; `.X` / `.layers`, `.obs` and `.var_names` are read.
    groupby : obs column holding the group labels (treated as categorical).
    covariates : optional list of obs columns added to the model formula as-is.
    layer : layer name to test; None uses `.X`.
    typ : sum-of-squares type passed to `anova_lm`.
    q_thresh : features with BH q-value <= q_thresh get a Tukey HSD post-hoc test.

    Returns
    -------
    aov_res : DataFrame with columns feature, F, pval, qval, sorted by qval.
        Features whose model could not be fitted have NaN in F/pval/qval.
    tukey_res : DataFrame with one row per (feature, group pair): feature, group1,
        group2, meandiff, p_adj, lower, upper, reject, mean_g1, mean_g2, direction.
        Tukey ignores the covariates (pure one-way comparison of the groups).
    """
    import warnings  # local import: only needed for the skipped-feature notice below

    covariates = list(covariates) if covariates else []

    # dense matrix from the chosen layer or .X
    X = adata.layers[layer] if layer is not None else adata.X
    X = X.toarray() if hasattr(X, "toarray") else np.asarray(X)

    df_obs = adata.obs[covariates + [groupby]].copy()
    df_obs[groupby] = df_obs[groupby].astype("category")

    # ---- ANOVA per feature (the formula is the same for every feature)
    term = f"C({groupby})"
    formula = "y ~ " + " + ".join([term] + covariates)
    rows = []
    for j, feat in enumerate(adata.var_names):
        df = df_obs.assign(y=X[:, j])
        try:
            model = smf.ols(formula, data=df, missing="drop").fit()
            aov = anova_lm(model, typ=typ)
            fval = aov.loc[term, "F"]
            pval = aov.loc[term, "PR(>F)"]
        except Exception:
            # best effort: a feature that cannot be fitted is reported as NaN
            fval, pval = np.nan, np.nan
        rows.append((feat, fval, pval))

    aov_res = pd.DataFrame(rows, columns=["feature", "F", "pval"])

    # BH correction over the finite p-values only: passing NaNs to multipletests can
    # turn every adjusted value into NaN, which would hide all hits.
    pvals = aov_res["pval"].to_numpy(dtype=float)
    qvals = np.full(pvals.shape, np.nan)
    finite = np.isfinite(pvals)
    if finite.any():
        qvals[finite] = multipletests(pvals[finite], method="fdr_bh")[1]
    aov_res["qval"] = qvals
    aov_res = aov_res.sort_values("qval").reset_index(drop=True)

    # ---- Post-hoc: Tukey HSD on significant features
    sig_feats = aov_res.loc[aov_res["qval"] <= q_thresh, "feature"].tolist()
    tukey_cols = ["group1", "group2", "meandiff", "p_adj", "lower", "upper", "reject"]
    tukey_rows = []
    tukey_failed = []
    if sig_feats:
        has_label = df_obs[groupby].notna().to_numpy()
        labels = df_obs[groupby].astype(object).to_numpy()

        for feat in sig_feats:
            y = np.asarray(X[:, adata.var_names.get_loc(feat)], dtype=float)
            # Tukey has no missing-value handling: drop samples without a value or label
            keep = has_label & np.isfinite(y)
            y_ok, g_ok = y[keep], labels[keep]
            try:
                tk = pairwise_tukeyhsd(endog=y_ok, groups=g_ok, alpha=0.05)
                table = tk.summary().data  # first row is the header
                tk_df = pd.DataFrame(table[1:], columns=tukey_cols)
                tk_df.insert(0, "feature", feat)
                # per-group means of this feature, computed on the samples used above
                means = pd.Series(y_ok).groupby(g_ok).mean()
                tk_df["mean_g1"] = tk_df["group1"].map(means)
                tk_df["mean_g2"] = tk_df["group2"].map(means)
                # Direction is taken from the group means themselves. statsmodels
                # documents meandiff as group2 - group1, so its sign must not be read
                # as "group1 > group2".
                tk_df["direction"] = np.where(
                    tk_df["mean_g1"] > tk_df["mean_g2"], "group1>group2", "group2>group1"
                )
                tukey_rows.append(tk_df)
            except Exception:
                # keep going, but do not hide the failure
                tukey_failed.append(feat)

    if tukey_failed:
        warnings.warn(
            f"Tukey HSD failed for {len(tukey_failed)} feature(s) "
            f"(e.g. {tukey_failed[:5]}); they are missing from the post-hoc table.",
            RuntimeWarning,
        )

    tukey_res = pd.concat(tukey_rows, ignore_index=True) if tukey_rows else pd.DataFrame(
        columns=["feature","group1","group2","meandiff","p_adj","lower","upper","reject","mean_g1","mean_g2","direction"]
    )

    return aov_res, tukey_res


In [None]:
# 1) Run ANOVA + post-hoc
# 1) Run ANOVA + post-hoc
aov_res, tukey_res = anova_statsmodels(
    adata,
    groupby="cell_type",     # your grouping in adata.obs
    covariates=None,         # optional list of obs columns; None or [] if none
    layer=None,              # or a layer name like "log1p"
    typ=2,                   # Type II SS (robust with imbalance)
    q_thresh=0.05
)

# 2) Top ANOVA hits. Only the last expression of a cell is rendered automatically, so
#    this table is displayed explicitly.
display(aov_res.head())

# 3) Post-hoc table: which groups differ, by how much, and in which direction
tukey_res.head()


In [None]:
import numpy as np
from scipy import sparse
import scanpy as sc
import pandas as pd

def diff_proteins_scanpy(
    adata,
    groupby: str,           # e.g. 'tumor_type' in adata.obs
    group1: str,            # e.g. 'GBM'
    group2: str = 'rest',   # or a specific label, e.g. 'LGG'
    method: str = 'wilcoxon',   # 't-test', 'wilcoxon', 'logreg', 't-test_overestim_var'
    layer: str | None = None,   # choose a layer or use .X
    impute: str = 'col_median'  # or 'none'
) -> pd.DataFrame:
    """
    Differential analysis on AnnData using Scanpy.
    Returns a DataFrame with protein_id, log2fc, stat, pval, qval, and n_in/n_out.
    """
    ad = adata.copy()
    if layer is not None:
        ad.X = adata.layers[layer]

    # ensure dense float and handle NaNs simply (Scanpy tests expect finite values)
    X = ad.X.toarray() if sparse.issparse(ad.X) else np.asarray(ad.X, dtype=float)
    if impute == 'col_median':
        col_med = np.nanmedian(X, axis=0)
        r, c = np.where(~np.isfinite(X))
        if r.size:
            X[r, c] = col_med[c]
    elif impute == 'none':
        # replace remaining non-finite with 0 just to avoid crashes (conservative)
        X[~np.isfinite(X)] = 0.0
    ad.X = X

    # run DE
    sc.tl.rank_genes_groups(
        ad,
        groupby=groupby,
        groups=[group1],
        reference=group2,      # 'rest' or a specific label
        method=method,
        n_jobs=1
    )

    # collect results
    df = sc.get.rank_genes_groups_df(ad, group=group1)
    df = df.rename(columns={
        'names': 'protein_id',
        'logfoldchanges': 'log2fc',
        'scores': 'stat',
        'pvals': 'pval',
        'pvals_adj': 'qval'
    })
    # add sample counts for each side (helpful QC)
    n_in = (ad.obs[groupby] == group1).sum()
    n_out = (ad.obs[groupby] != group1).sum() if group2 == 'rest' else (ad.obs[groupby] == group2).sum()
    df['n_in'] = n_in
    df['n_out'] = n_out
    return df


In [None]:
# Negativefrac vs P1andP2 (t-test) through the scanpy wrapper.
res_n_p1 = diff_proteins_scanpy(
    adata,
    groupby='cell_type',
    group1='Negativefrac',
    group2='P1andP2',
    method='t-test',
)

# 20 rows with the smallest adjusted p-value
res_n_p1.sort_values(by='qval').head(20)

In [None]:
# Volcano plot of the wrapper output; the default fc_col="log2fc" and p_col="qval" match
# the column names produced by diff_proteins_scanpy.
# NOTE(review): this table carries its identifiers in 'protein_id' — check that the five
# requested labels are actually drawn.
volcano_plot(
    res_n_p1,
    fc_thresh=1.0,
    p_thresh=0.05,
    label_top=5,
    title="Negativefrac vs P1andP2 (t-test)",
)

In [None]:
import numpy as np
from scipy import sparse
import scanpy as sc
import pandas as pd

def diff_proteins_scanpy(
    adata,
    groupby: str,           # e.g. 'tumor_type' in adata.obs
    group1: str,            # e.g. 'GBM'
    group2: str = 'rest',   # or a specific label, e.g. 'LGG'
    method: str = 'wilcoxon',   # 't-test', 'wilcoxon', 'logreg', 't-test_overestim_var'
    layer: str | None = None,   # choose a layer or use .X
    impute: str = 'col_median'  # or 'none'
) -> pd.DataFrame:
    """
    Differential analysis on AnnData using Scanpy.
    Returns a DataFrame with protein_id, log2fc, stat, pval, qval, and n_in/n_out.
    """
    ad = adata.copy()
    if layer is not None:
        ad.X = adata.layers[layer]

    # ensure dense float and handle NaNs simply (Scanpy tests expect finite values)
    X = ad.X.toarray() if sparse.issparse(ad.X) else np.asarray(ad.X, dtype=float)
    if impute == 'col_median':
        col_med = np.nanmedian(X, axis=0)
        r, c = np.where(~np.isfinite(X))
        if r.size:
            X[r, c] = col_med[c]
    elif impute == 'none':
        # replace remaining non-finite with 0 just to avoid crashes (conservative)
        X[~np.isfinite(X)] = 0.0
    ad.X = X

    # run DE
    sc.tl.rank_genes_groups(
        ad,
        groupby=groupby,
        groups=[group1],
        reference=group2,      # 'rest' or a specific label
        method=method,
        n_jobs=1
    )

    # collect results
    df = sc.get.rank_genes_groups_df(ad, group=group1)
    df = df.rename(columns={
        'names': 'protein_id',
        'logfoldchanges': 'log2fc',
        'scores': 'stat',
        'pvals': 'pval',
        'pvals_adj': 'qval'
    })
    # add sample counts for each side (helpful QC)
    n_in = (ad.obs[groupby] == group1).sum()
    n_out = (ad.obs[groupby] != group1).sum() if group2 == 'rest' else (ad.obs[groupby] == group2).sum()
    df['n_in'] = n_in
    df['n_out'] = n_out
    return df


In [None]:
# Negativefrac vs P1andP2 (t-test) through the scanpy wrapper; the full result table is
# the cell output.
diff_neg_p1 = diff_proteins_scanpy(
    adata,
    groupby='cell_type',
    group1='Negativefrac',
    group2='P1andP2',
    method='t-test',
)

diff_neg_p1

In [None]:
# Volcano plot of the wrapper output, with the fold-change and q-value columns named
# explicitly; hits are q <= 0.05 and |log2fc| >= 1.
volcano_plot(
    diff_neg_p1,
    fc_col="log2fc",
    p_col="qval",
    fc_thresh=1.0,
    p_thresh=0.05,
    label_top=5,
    title="Negativefrac vs P1andP2 (t-test)",
)

In [None]:
# Score plot of the top 25 variables per group, with an independent y axis per panel.
# NOTE(review): reads the result stored under key "t-test"; the rank_genes_groups calls
# visible in this part of the notebook do not pass key_added — confirm that key exists.
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False, key = "t-test")

In [None]:
# Heatmap of the top 10 variables per cell_type group, with variable labels shown.
# NOTE(review): reads the result stored under key "t-test"; the rank_genes_groups calls
# visible in this part of the notebook do not pass key_added — confirm that key exists.
sc.pl.rank_genes_groups_heatmap(adata, n_genes=10, key="t-test", groupby="cell_type", show_gene_labels=True)

In [None]:
# Matrix plot of the top 5 variables per cell_type group.
# NOTE(review): reads the result stored under key "t-test"; the rank_genes_groups calls
# visible in this part of the notebook do not pass key_added — confirm that key exists.
sc.pl.rank_genes_groups_matrixplot(adata, n_genes=5, key="t-test", groupby="cell_type")

In [None]:
# Dot plot of the top 5 variables per cell_type group.
# NOTE(review): reads the result stored under key "t-test"; the rank_genes_groups calls
# visible in this part of the notebook do not pass key_added — confirm that key exists.
sc.pl.rank_genes_groups_dotplot(adata, n_genes=5, key="t-test", groupby="cell_type")

In [None]:
# Install if needed
!pip install h5py

# Import in your Python script or notebook
import h5py


In [None]:
# Load a previously saved AnnData object from disk. This rebinds `adata`, so the cells
# below no longer operate on the object used in the cells above.
# NOTE(review): absolute, user-specific path — consider a configurable data directory.
adata = ad.read_h5ad("/Users/isabell/Downloads/adata_analyzed.h5ad")

In [None]:
# Summary repr of the AnnData object currently bound to `adata`.
adata

In [None]:
# Per-observation annotation table of the AnnData object currently bound to `adata`.
adata.obs

In [None]:
# Unique protein ids that have at least one adata.var row with proteoform_score > 0.4.
set(adata.var.loc[adata.var["proteoform_score"] > 0.4, "protein_id"])

In [None]:
# All adata.var rows of protein Q9Y2J2, reduced to the identifier, sequence, cluster and
# proteoform-score columns and ordered by cluster.
(
    adata.var
    .loc[
        adata.var["protein_id"] == "Q9Y2J2",
        ["protein_id", "Genes", "Stripped.Sequence", "cluster_id", "proteoform_score", "proteoform_score_pval_adj"],
    ]
    .sort_values(by="cluster_id")
)

In [None]:
# CoPro intensity plot for the selected protein(s), grouped by the obs column
# 'tumor_family'; values are not z-transformed and zeros are shown.
# NOTE(review): the original trailing comment read "RNT4, LIMA1", but the list holds a
# single accession — confirm which protein Q9Y2J2 is and whether more were intended.
prots_bludauI = ['Q9Y2J2']
cp.pl.proteoform_intensities(
    adata,
    prots_bludauI,
    group_by='tumor_family',
    xlab_rotation=45,
    #log_transform=True,
    z_transform=False,
    show_zeros=True,
    group_by_label_rotation=30,
    figsize=(100,10),  # very wide canvas, presumably to fit every sample on the x axis
)

In [None]:
# CoPro intensity plot for the selected protein(s), this time grouped by the obs column
# 'batch_id' instead of 'tumor_family'; values are not z-transformed and zeros are shown.
# NOTE(review): the original trailing comment read "RNT4, LIMA1", but the list holds a
# single accession — confirm which protein Q9Y2J2 is and whether more were intended.
prots_bludauI = ['Q9Y2J2']
cp.pl.proteoform_intensities(
    adata,
    prots_bludauI,
    group_by='batch_id', #'tumor_family',
    xlab_rotation=45,
    #log_transform=True,
    z_transform=False,
    show_zeros=True,
    group_by_label_rotation=30,
    figsize=(100,10),  # very wide canvas, presumably to fit every sample on the x axis
)