In [1]:
# conda activate anndata

import os
import re
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
from scipy import sparse
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

pd.set_option('display.max_columns', None)

In [2]:
# def _col_idx_for(df, stat):
#     return df.columns.str.contains(stat)

def _safe(name):
    # simple filename sanitizer
    return re.sub(r'[^A-Za-z0-9_.-]+', '_', str(name))

def _barplot_colors(ctypes, working_ctype):
    colors = ["tab:blue"] * len(ctypes)
    idx = pd.Index(ctypes).get_indexer([working_ctype])[0]
    colors[idx] = "tab:red"
    return colors

def _flatten_1d(x):
    if sparse.issparse(x):   # AnnData/sparse safe
        return x.toarray().ravel()
    return np.asarray(x, dtype=float).ravel()

def plot_barplot_by_cell_type(means, errs, ctypes, working_ctype, title, ylabel, show=True):
    """Draw one bar per cell type, with the focus cell type colored differently.

    Parameters
    ----------
    means : array-like
        Bar heights, ordered like ``ctypes``.
    errs : array-like
        Per-bar errors; they are doubled before being drawn as error bars.
    ctypes : sequence
        Cell-type labels used for the x tick labels.
    working_ctype : object
        Label handed to ``_barplot_colors`` to pick the highlighted bar.
    title, ylabel : str
        Axes title and y-axis label.
    show : bool
        When true, ``plt.show()`` is called before returning.

    Returns
    -------
    matplotlib.figure.Figure
        The figure that holds the bar chart.
    """
    heights = np.asarray(means)
    bar_errors = 2.0 * np.asarray(errs)
    bar_colors = _barplot_colors(ctypes, working_ctype)
    positions = np.arange(len(ctypes))

    fig, ax = plt.subplots()
    ax.bar(positions, heights, yerr=bar_errors, capsize=3, color=bar_colors)
    ax.set_xticks(positions)
    ax.set_xticklabels(ctypes, rotation=45, ha="right")
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    fig.tight_layout()

    if show:
        plt.show()
    return fig

def plot_ME_vs_PSI(mod_eig, exon_psi, title, show=True):
    """Scatter exon PSI (y) against a module eigengene (x).

    The two inputs are handed to ``ax.scatter`` as x and y, so they must have
    the same length. Returns the created figure; ``plt.show()`` is called
    first when ``show`` is true.
    """
    fig, ax = plt.subplots()
    ax.scatter(mod_eig, exon_psi, s=18)
    ax.set(xlabel="Module eigengene", ylabel="Exon PSI", title=title)
    plt.tight_layout()

    if show:
        plt.show()
    return fig

def violin_with_points(values_by_ct, ctypes, focus=None,
                       base_fc='tab:blue', highlight_fc='tab:red',
                       ylabel="Value", title=None,
                       jitter=0.08, point_size=8, point_alpha=0.35,
                       max_points_per_ct=None, seed=0, violin_alpha=0.5,
                       show=False):                         # <- default False when saving
    """Violin plot per cell type with the individual values overlaid as jittered points.

    Parameters
    ----------
    values_by_ct : iterable of array-like
        One collection of values per cell type, ordered like ``ctypes``.
        Non-finite entries (NaN/inf) are dropped before plotting.
    ctypes : sequence
        Cell-type labels; also used as x tick labels.
    focus : object, optional
        Label drawn with ``highlight_fc``; all others use ``base_fc``.
    base_fc, highlight_fc : color
        Fill/point colors for regular and highlighted groups.
    ylabel, title : str
        Axis label and optional title.
    jitter : float
        Half-width of the uniform horizontal jitter applied to the points.
    point_size, point_alpha : float
        Marker size and opacity of the points.
    max_points_per_ct : int, optional
        If set, at most this many points are drawn per group (random subsample
        without replacement); the violin still uses all finite values.
    seed : int
        Seed for the jitter/subsampling random generator.
    violin_alpha : float
        Opacity of the violin bodies.
    show : bool
        When true, ``plt.show()`` is called before returning.

    Returns
    -------
    matplotlib.figure.Figure
    """
    rng = np.random.default_rng(seed)
    pos = np.arange(1, len(ctypes)+1)

    def _group_color(idx):
        return highlight_fc if (focus is not None and ctypes[idx] == focus) else base_fc

    # Clean every group once so the violins and the points are built from the
    # same finite values. violinplot does not drop NaNs itself (a single NaN
    # blanks the whole violin) and cannot handle an empty group.
    cleaned = []
    for v in values_by_ct:
        v = np.asarray(v, float)
        cleaned.append(v[np.isfinite(v)])
    non_empty = [i for i, v in enumerate(cleaned) if v.size > 0]

    fig, ax = plt.subplots()                                # <- create fig

    if non_empty:
        vp = ax.violinplot([cleaned[i] for i in non_empty], positions=pos[non_empty],
                           showmeans=True, showextrema=True)

        # color violins (no outlines)
        for i, b in zip(non_empty, vp['bodies']):
            b.set_facecolor(_group_color(i))
            b.set_edgecolor('none')
            b.set_alpha(violin_alpha)
        # hide the mean/extrema/bar line collections; only the bodies are shown
        for k in ('cmeans','cmins','cmaxes','cbars','cmedians'):
            if k in vp: vp[k].set_visible(False)

    # jitter points (groups visited in order, so the random draws are reproducible)
    for i in non_empty:
        v = cleaned[i]
        if max_points_per_ct and v.size > max_points_per_ct:
            v = v[rng.choice(v.size, size=max_points_per_ct, replace=False)]
        x = pos[i] + (rng.random(v.size) - 0.5) * 2 * jitter
        ax.scatter(x, v, s=point_size, alpha=point_alpha, color=_group_color(i),
                   edgecolors='none', rasterized=True)

    ax.set_xticks(pos); ax.set_xticklabels(ctypes, rotation=45, ha='right')
    ax.set_ylabel(ylabel)
    if title: ax.set_title(title)
    fig.tight_layout()

    if show:
        plt.show()

    return fig

In [5]:
# Run configuration. These identifiers are interpolated into the input CSV
# paths below and into the per-cell-type output directory used by the
# plotting loop.
pseudobulk_str = "20pcntCells_30pcntVar_200samples"
psi_data = f"SyntheticDataset1_{pseudobulk_str}_SJ_pseudobulk_min_observed0.05_minPsi0.05_PSI"
merge_param = "0.9"
# Output path prefix; not referenced by the other cells visible in this notebook.
pdf_path_prefix = f"figures/tasic_2018_ALM_STAR_{psi_data}"
# PSI table; row labels come from the first CSV column and are looked up by exon ID later.
psi = pd.read_csv(f"data/tasic_2018_ALM_STAR_{psi_data}.csv", index_col=0)
# Per-exon table with a 'Gene' column; the remaining columns are used as per-cell-type scores.
corr_df = pd.read_csv(f"data/tasic_2018_ALM_STAR_{psi_data}_exon_corr.csv", index_col=0)
# Per-cell-type module table; later cells read its 'Cell_type', 'ME_path' and 'Module' columns.
top_qval_mods_df = pd.read_csv(f"data/tasic_2018_ALM_STAR_donor_cell_type_pseudobulk_pairwise_DE_genes_dream_{pseudobulk_str}_log2_pseudobulk_PosBC_top_Qval_modules.csv")

In [13]:
# pseudobulk_str = "25pcntCells_100SD_200samples"
# psi_data = f"SyntheticDataset1_{pseudobulk_str}_SJ_pseudobulk_min_observed0.05_PSI"
# merge_param = "0.9"
# pdf_path_prefix = f"figures/tasic_2018_ALM_STAR_{psi_data}"
# psi = pd.read_csv(f"data/tasic_2018_ALM_STAR_{psi_data}.csv", index_col=0)
# corr_df = pd.read_csv(f"data/tasic_2018_ALM_STAR_{psi_data}_exon_corr.csv", index_col=0)
# top_qval_mods_df = pd.read_csv(f"data/tasic_2018_ALM_STAR_donor_cell_type_pseudobulk_pairwise_DE_genes_dream_{pseudobulk_str}_log2_pseudobulk_PosBC_top_Qval_modules.csv")

In [14]:
# Index the module table by cell type (the 'Cell_type' column is kept as well)
# so a row can be fetched with .loc[<cell type>].
top_qval_mods_df = top_qval_mods_df.set_index('Cell_type', drop=False)
# Exon ID -> gene lookup (a Series sharing corr_df's index).
gene_exon_df = corr_df.loc[:, 'Gene']

In [15]:
# Load single-cell PSI data (its var_names are matched against exon IDs downstream).
# NOTE(review): the file extension is ".hd5" although it is read with read_h5ad — confirm the file name.
sdata = ad.read_h5ad("data/tasic_2018_ALM_STAR_SJ_counts_annotated_PSI.hd5")
# Load single-cell gene expression data; the `.raw` slot of this object is used later for the full gene space.
adata = ad.read_h5ad("data/tasic_2018_ALM_STAR_model/tasic_2018_ALM_STAR_gene_counts_scVI.h5ad")

In [16]:
# Normalize cell-subclass labels in both objects: cast to str, then turn
# "/" and " " into "_" (literal replacement, no regex).
adata.obs['cell_subclass'] = (
    adata.obs['cell_subclass']
    .astype(str)
    .str.replace("/", "_", regex=False)
    .str.replace(" ", "_", regex=False)
)
sdata.obs['cell_subclass'] = (
    sdata.obs['cell_subclass']
    .astype(str)
    .str.replace("/", "_", regex=False)
    .str.replace(" ", "_", regex=False)
)

In [17]:
# Work with full gene space: rebuild an AnnData from the `.raw` slot.
adata_raw = adata.raw.to_adata()
# Densify X only when it is actually sparse; an already-dense ndarray has no
# .toarray() and would raise AttributeError here.
if sparse.issparse(adata_raw.X):
    adata_raw.X = adata_raw.X.toarray()

In [18]:
# Cell-type names = every corr_df column after the first one.
# NOTE(review): assumes the first column is 'Gene' — confirm against the CSV header.
ctypes = corr_df.columns[1:]

In [19]:
top_n = 15          # number of top-ranked exons plotted per cell type
ascending = False   # False -> exons with the largest score for the cell type come first


def _finite_mean_se(values):
    """Return (mean, standard error) over the finite entries of ``values``.

    The standard error is sqrt(population variance / n) with n the number of
    finite entries. Returns (nan, nan) when there are no finite entries.
    """
    vals = np.asarray(values, dtype=float).ravel()
    vals = vals[np.isfinite(vals)]
    if vals.size == 0:
        return np.nan, np.nan
    return vals.mean(), np.sqrt(vals.var() / vals.size)


# One boolean cell mask (built from adata_raw.obs) is used to index the rows of
# BOTH sdata and adata_raw, so the two objects must hold the same cells in the
# same order.
if sdata.n_obs != adata_raw.n_obs:
    raise ValueError(
        f"sdata has {sdata.n_obs} cells but adata_raw has {adata_raw.n_obs}; "
        "a shared cell mask cannot be applied to both"
    )
if not sdata.obs_names.equals(adata_raw.obs_names):
    print("WARNING: sdata and adata_raw cell names/order differ; per-cell-type PSI may be misassigned")

# Loop-invariant work, hoisted: the per-cell-type mask and the mean of
# adata_raw.X over all genes and all cells of that type depend only on the
# cell type, not on the exon or the focus cell type.
subclass_labels = adata_raw.obs['cell_subclass'].to_numpy()
cell_masks = {ct: subclass_labels == ct for ct in ctypes}
mean_count_by_ct = {ct: np.mean(adata_raw.X[mask, :]) for ct, mask in cell_masks.items()}

for w_ctype in ctypes:
    print(w_ctype)
    outdir = f"figures/{w_ctype}/{pseudobulk_str}/{merge_param}"
    os.makedirs(outdir, exist_ok=True)

    # Module eigengene for this cell type: column row['Module'] of the CSV at
    # row['ME_path'], indexed by its "Sample" column.
    row = top_qval_mods_df.loc[w_ctype]
    mod_df = pd.read_csv(row['ME_path'])
    mod_eig_df = mod_df.set_index("Sample")[row['Module']]
    mod_eig = pd.to_numeric(mod_eig_df, errors="coerce")

    # Rank exons by this cell type's column. A local name is used so the
    # notebook-level corr_df is not re-sorted on every iteration.
    corr_sorted = corr_df.sort_values(w_ctype, ascending=ascending)
    top_exons = corr_sorted.index.tolist()[:top_n]

    for exon in top_exons:
        gene = gene_exon_df.loc[exon]
        exon_mask = sdata.var_names.isin([exon])
        gene_mask = adata_raw.var_names.isin([gene])
        if not (exon_mask.any() and gene_mask.any()):
            # Nothing to plot: empty selections would only yield NaN statistics
            # and make the violin plots fail.
            print("Skipped", exon, "(exon or gene missing from single-cell data)")
            print("")
            continue
        sdata_sub = sdata[:, exon_mask].copy()
        adata_sub = adata_raw[:, gene_mask].copy()

        psi_vals_by_ct = []
        expr_vals_by_ct = []
        mean_psi_by_ct = []
        mean_psi_se_by_ct = []
        mean_expr_by_ct = []
        mean_expr_se_by_ct = []

        for ct in ctypes:
            cell_mask = cell_masks[ct]
            psi_per_cell = _flatten_1d(sdata_sub.X[cell_mask, :])
            expr_per_cell = _flatten_1d(adata_sub.X[cell_mask, :])

            psi_vals_by_ct.append(psi_per_cell)
            expr_vals_by_ct.append(np.log1p(expr_per_cell))

            # NaN-aware statistics: non-finite PSI values are ignored and the
            # SE uses the number of finite values.
            psi_mean, psi_se = _finite_mean_se(psi_per_cell)
            mean_psi_by_ct.append(psi_mean)
            mean_psi_se_by_ct.append(psi_se)

            # Normalize BOTH the mean and its SE by the cell type's mean over
            # all genes so bar heights and error bars share one unit.
            # NOTE(review): previously only the SE was divided, leaving the
            # bars unnormalized; confirm this normalization is the intended one.
            norm = mean_count_by_ct[ct]
            expr_mean, expr_se = _finite_mean_se(expr_per_cell)
            mean_expr_by_ct.append(expr_mean / norm)
            mean_expr_se_by_ct.append(expr_se / norm)

        corr = round(corr_df.loc[exon, w_ctype], 2)
        # NOTE(review): the scatter below pairs mod_eig and exon_psi by position;
        # confirm the ME file's samples are in the same order as psi's columns.
        exon_psi = psi.loc[exon]
        exon_label = ''.join(str(exon).split("_")[1:])
        pdf_path = f"{outdir}/{_safe(w_ctype)}_{_safe(gene)}_{_safe(exon_label)}_ascending{ascending}.pdf"

        # One PDF per exon, one page per figure; each figure is closed right
        # after saving so figures do not accumulate across iterations.
        with PdfPages(pdf_path) as pdf:
            fig = plot_barplot_by_cell_type(mean_psi_by_ct, mean_psi_se_by_ct,
                                            ctypes, w_ctype,
                                            title=f"PSI for {gene} {exon_label} exon",
                                            ylabel="Mean PSI", show=False)
            pdf.savefig(fig)
            plt.close(fig)

            fig = plot_barplot_by_cell_type(mean_expr_by_ct, mean_expr_se_by_ct,
                                            ctypes, w_ctype,
                                            title=f"Gene expression for {gene}",
                                            ylabel="Mean expression (normalized)",
                                            show=False)
            pdf.savefig(fig)
            plt.close(fig)

            fig = plot_ME_vs_PSI(mod_eig, exon_psi,
                                 title=f"{w_ctype} ME vs. PSI for {gene} {exon_label} exon\nCorr: {corr}",
                                 show=False)
            pdf.savefig(fig)
            plt.close(fig)

            fig = violin_with_points(psi_vals_by_ct, ctypes, focus=w_ctype,
                                     ylabel="PSI",
                                     title=f"PSI distribution for {gene} {exon_label} exon",
                                     jitter=0.2, max_points_per_ct=3000, violin_alpha=0.2,
                                     show=False)
            pdf.savefig(fig)
            plt.close(fig)

            # The plotted values are np.log1p(x) (natural log), so the axis is
            # labeled log1p rather than log2.
            fig = violin_with_points(expr_vals_by_ct, ctypes, focus=w_ctype,
                                     ylabel="Expression (log1p)",
                                     title=f"Count distribution for {gene}",
                                     jitter=0.2, max_points_per_ct=3000, violin_alpha=0.2,
                                     show=False)
            pdf.savefig(fig)
            plt.close(fig)

        print("Saved", exon)
        print("")
    

Astro
Saved ENSMUSG00000027574_ProteinCoding_1

Saved ENSMUSG00000032076_ProteinCoding_2

Saved ENSMUSG00000032076_ProteinCoding_1

Saved ENSMUSG00000066456_ProteinCoding_1

Saved ENSMUSG00000031342_ProteinCoding_2

Saved ENSMUSG00000022564_ProteinCoding_3

Saved ENSMUSG00000024302_ProteinCoding_1

Saved ENSMUSG00000040407_ProteinCoding_1

Saved ENSMUSG00000037697_ProteinCoding_1

Saved ENSMUSG00000053332_other_32

Saved ENSMUSG00000033981_NMD_1

Saved ENSMUSG00000039178_NMD_1

Saved ENSMUSG00000022253_ProteinCoding_2

Saved ENSMUSG00000047454_ProteinCoding_1

Saved ENSMUSG00000005871_ProteinCoding_1

Endo
Saved ENSMUSG00000025085_ProteinCoding_2

Saved ENSMUSG00000037936_ProteinCoding_1

Saved ENSMUSG00000035863_ProteinCoding_1

Saved ENSMUSG00000025085_ProteinCoding_1

Saved ENSMUSG00000011958_ProteinCoding_1

Saved ENSMUSG00000025006_ProteinCoding_8

Saved ENSMUSG00000028559_ProteinCoding_1

Saved ENSMUSG00000028613_ProteinCoding_5

Saved ENSMUSG00000002504_ProteinCoding_1

Saved EN