In [None]:
# https://www.sc-best-practices.org/conditions/gsea_pathway.html#id380
# Kang HM, Subramaniam M, Targ S, et al. Multiplexed droplet single-cell RNA-sequencing using natural genetic variation
#   [published correction appears in Nat Biotechnol. 2020 Nov;38(11):1356]. Nat Biotechnol. 2018;36(1):89-94. doi:10.1038/nbt.4042

In [None]:
%load_ext autoreload
%autoreload 2

import scanpy as sc
import tensorflow as tf
from tensorflow.keras.models import Model
from ivae_scorer.datasets import load_kang
from tensorflow.keras import callbacks
import shutil
from ivae_scorer.utils import set_all_seeds
from ivae_scorer.bio import get_adj_matrices, sync_gexp_adj, build_hipathia_renamers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import minmax_scale
import matplotlib.pyplot as plt
import seaborn as sns
import dotenv
from pathlib import Path
import pandas as pd
import numpy as np


# Project root = directory containing the .env file located by python-dotenv.
project_path = Path(dotenv.find_dotenv()).parent

# Input/output locations used throughout the notebook.
results_path = project_path / "results"
data_path = project_path / "data"
figs_path = results_path / "figs"
tables_path = results_path / "tables"

# Create every directory up front (no-op when it already exists).
for _directory in (results_path, data_path, figs_path, tables_path):
    _directory.mkdir(parents=True, exist_ok=True)

# Reproducibility: fix the global seed (project helper) and ask TensorFlow
# to run its ops deterministically.
set_all_seeds(seed=42)
tf.config.experimental.enable_op_determinism()

# Scanpy plotting/logging defaults; the header prints the package versions.
sc.set_figure_params(color_map="inferno", dpi=300)
sc.settings.verbosity = 1
sc.logging.print_header()

In [None]:
# Load the Kang et al. dataset through the project loader, reading from /
# caching into `data_path`.
# NOTE(review): `normalize` and `n_genes` semantics live in
# ivae_scorer.datasets.load_kang — presumably normalised counts restricted
# to 4000 genes; confirm there.
adata = load_kang(
    n_genes=4000,
    normalize=True,
    data_folder=data_path,
)

In [None]:
# Expression matrix as a DataFrame: its columns are used as the gene list
# below and its rows are split per cell for training.
x_trans = adata.to_df()

In [None]:
# Adjacency matrices for the genes present in the expression table:
# one linking genes to circuits, one linking circuits to pathways.
circuit_adj, circuit_to_pathway_adj = get_adj_matrices(gene_list=x_trans.columns.to_list())

# Lookup tables translating hipathia identifiers into readable names.
circuit_renamer, pathway_renamer, circuit_to_effector = build_hipathia_renamers()

# Readable column labels, later attached to the layer activations.
# NOTE(review): these are taken before `sync_gexp_adj` runs in the next cell;
# they stay valid only if that call leaves the circuit columns untouched.
circuit_names = circuit_adj.rename(circuit_renamer, axis="columns").columns
pathway_names = circuit_to_pathway_adj.rename(pathway_renamer, axis="columns").columns

circuit_adj.head()

In [None]:
# Make the expression table and the gene-circuit adjacency consistent with
# each other (project helper). Both names are rebound in place, so re-running
# this cell feeds it the already-synced objects.
x_trans, circuit_adj = sync_gexp_adj(adj=circuit_adj, gexp=x_trans)

# Sanity check of the resulting dimensions.
(x_trans.shape, circuit_adj.shape, circuit_to_pathway_adj.shape)

In [None]:
from ivae_scorer.models import build_kegg_vae


def get_importances(data, abs=False):
    """Average `data` along axis 0.

    Parameters
    ----------
    data : array-like exposing ``.mean(axis=0)`` (e.g. ndarray, DataFrame)
        Values to aggregate; axis 0 is collapsed.
    abs : bool, default False
        When true, average the absolute values instead of the signed ones.

    Returns
    -------
    The per-column mean, in whatever container ``data.mean`` produces.
    """
    # The `abs` parameter shadows the builtin, hence the explicit np.abs.
    values = np.abs(data) if abs else data
    return values.mean(axis=0)


def get_activations(act_model, layer_id, data):
    """Run `data` through `act_model` and return one of its outputs.

    Parameters
    ----------
    act_model : object with a ``predict(data)`` method returning an
        indexable collection of outputs.
    layer_id : index/key selecting which output to return.
    data : input forwarded unchanged to ``act_model.predict``.

    Returns
    -------
    The element at position `layer_id` of the prediction result.
    """
    all_outputs = act_model.predict(data)
    return all_outputs[layer_id]

In [None]:
# Accumulator for per-seed results (not appended to in the cells shown here).
results = []

# Start each run from an empty output directory so encodings written by a
# previous run cannot be mixed with the ones produced below.
results_path_model = results_path.joinpath("ivae_kegg")
# shutil.rmtree raises FileNotFoundError on a missing path, which is the case
# the first time the notebook runs on a fresh checkout — only wipe if present.
if results_path_model.exists():
    shutil.rmtree(results_path_model)
results_path_model.mkdir(exist_ok=True, parents=True)

# Column-wise min-max scaling does not depend on the seed, so compute it once
# instead of once for the split and once per exported layer in every iteration.
# NOTE(review): scaling is fitted on all cells before the train/test split,
# exactly as in the original code — the held-out cells influence the scaler.
x_scaled = x_trans.apply(minmax_scale)

# Train one model per seed and dump the activations of its first two encoder
# layers (circuits, pathways) for every cell.
for seed in range(100):
    # Reset Keras global state so models from previous seeds do not pile up.
    tf.keras.backend.clear_session()
    obs = adata.obs.copy()

    # Seed-dependent split, stratified so each cell type is represented in
    # both partitions.
    x_train, x_test = train_test_split(
        x_scaled,
        test_size=0.33,
        stratify=obs["cell_type"],
        random_state=seed,
    )
    x_train = x_train.astype("float32")
    x_test = x_test.astype("float32")

    vae, encoder, decoder = build_kegg_vae(
        circuits=circuit_adj, pathways=circuit_to_pathway_adj, seed=seed
    )

    batch_size = 32

    # NOTE(review): with `epochs=10` in the fit call below, a patience of 100
    # epochs can never elapse, so this callback currently never stops
    # training early. Confirm whether `epochs` or `patience` is the intended
    # value.
    callback = callbacks.EarlyStopping(
        monitor="val_loss",  # watch the validation loss
        min_delta=1e-1,  # improvements smaller than 0.1 count as no improvement
        patience=100,  # stop after 100 consecutive epochs without improvement
        verbose=0,
    )

    # The model is fitted on inputs only (no targets are passed); the test
    # partition doubles as the validation set.
    history = vae.fit(
        x_train.values,
        shuffle=True,
        verbose=0,
        epochs=10,
        batch_size=batch_size,
        callbacks=[callback],
        validation_data=(x_test.values, None),
    )

    # Auxiliary model exposing the output of every encoder layer at once.
    layer_outputs = [layer.output for layer in encoder.layers]
    activation_model = Model(inputs=encoder.input, outputs=layer_outputs)

    for layer_id in [1, 2]:
        # Layer 1 activations are labelled with circuit names, layer 2 with
        # pathway names.
        colnames = circuit_names if layer_id == 1 else pathway_names

        # Activations are computed for all cells (train and test together).
        encodings = get_activations(
            act_model=activation_model,
            layer_id=layer_id,
            data=x_scaled,
        )
        encodings = pd.DataFrame(encodings, index=x_trans.index, columns=colnames)
        # Bookkeeping columns: partition membership, layer and seed.
        encodings["is_train"] = 1
        encodings.loc[x_test.index, "is_train"] = 0
        encodings["layer"] = layer_id
        encodings["seed"] = seed
        encodings.to_pickle(
            results_path_model.joinpath(
                f"encodings_layer-{layer_id:02d}_seed-{seed:02d}.pkl"
            )
        )