In [77]:
from pathlib import Path

import glob
import pickle



import dotenv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import tensorflow as tf
import argparse
from scipy.stats import weightedtau
from sklearn.model_selection import train_test_split

from isrobust_TFG.bio import (
    build_hipathia_renamers,
    get_adj_matrices,
    get_random_adj,
    get_reactome_adj,
    sync_gexp_adj,
    get_importances,
    get_activations,
    train_val_test_split,
)

from isrobust_TFG.datasets import load_kang
from isrobust_TFG.utils import set_all_seeds


from multiprocessing import cpu_count

from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics.cluster import adjusted_mutual_info_score

In [78]:
# Notebook configuration: values read from the .env file, experiment selectors,
# and the filesystem locations derived from them.
config = dotenv.dotenv_values()

# Model whose results are analysed, and the fraction handed to get_random_adj
# when a random model is selected.
model_kind = "ivae_random"
frac = 0.25

# Seed range; seed_stop is treated as inclusive when the seed list is built.
seed_start, seed_step, seed_stop = 0, 1, 2

# Root folder for all results (created on demand).
results_path = Path(config["RESULTS_FOLDER"])
results_path.mkdir(parents=True, exist_ok=True)

# The project root is the folder holding the .env file; data lives beneath it.
project_path = Path(dotenv.find_dotenv()).parent
data_path = project_path / "data"


In [79]:
# Display the resolved results folder.
results_path

PosixPath('path')

In [80]:
# Output locations for figures and tables, both nested under the results root.
figs_path = results_path / "figs"
tables_path = results_path / "tables"

# Create the root first, then each sub-folder (no error if they already exist).
for folder in (results_path, figs_path, tables_path):
    folder.mkdir(parents=True, exist_ok=True)

In [81]:
# Random models get one results folder per `frac` value ("<model_kind>-<frac>");
# every other model uses a folder named after the model itself.
results_path_model = results_path / (
    f"{model_kind}-{frac}" if "ivae_random" in model_kind else model_kind
)
results_path_model.mkdir(parents=True, exist_ok=True)
print(results_path_model)

path/ivae_random-0.25


In [82]:
# Load the Kang dataset from `data_path` with normalisation requested and no gene
# limit (n_genes=None). NOTE(review): the exact preprocessing is defined by
# load_kang, which is not visible here.
adata = load_kang(data_folder=data_path, normalize=True, n_genes=None)
# Keep a private copy of the per-cell annotations and a DataFrame view of the
# expression matrix for the steps below.
obs = adata.obs.copy()
x_trans = adata.to_df()

# Ask TensorFlow to run ops deterministically.
tf.config.experimental.enable_op_determinism()

# Scanpy plotting/logging defaults; print_header() reports the package versions.
sc.set_figure_params(dpi=300, color_map="inferno")
sc.settings.verbosity = 1
sc.logging.print_header()


  adata.obs["label"] = adata.obs["label"].replace(


scanpy==1.9.8 anndata==0.11.3 umap==0.5.7 numpy==1.26.4 scipy==1.10.1 pandas==2.2.3 scikit-learn==1.5.2 statsmodels==0.14.4 igraph==0.10.8 pynndescent==0.5.13


In [83]:
# Seeds to analyse; seed_stop is inclusive, hence the +1 on the range bound.
seeds = list(range(seed_start, seed_stop + 1, seed_step))
N_ITERS = len(seeds)

# Debug mode selects 2 epochs instead of the full 300.
debug = False
N_EPOCHS = 2 if debug else 300

In [84]:
# Build the prior-knowledge adjacency for the selected model and align the
# expression matrix (`x_trans`) with it via sync_gexp_adj. Every branch sets
# `n_encoding_layers` and `layer_ids`, the latter listing the encoding layers
# whose exported encodings are analysed below.
if model_kind == "ivae_kegg":
    n_encoding_layers = 3
    circuit_adj, circuit_to_pathway_adj = get_adj_matrices(
        gene_list=x_trans.columns.to_list()
    )
    circuit_renamer, pathway_renamer, circuit_to_effector = build_hipathia_renamers()
    kegg_circuit_names = circuit_adj.rename(columns=circuit_renamer).columns
    kegg_pathway_names = circuit_to_pathway_adj.rename(columns=pathway_renamer).columns
    x_trans, circuit_adj = sync_gexp_adj(gexp=x_trans, adj=circuit_adj)
    layer_ids = [1, 2, 3]

elif model_kind == "ivae_reactome":
    n_encoding_layers = 2
    reactome = get_reactome_adj()
    reactome_pathway_names = reactome.columns
    x_trans, reactome = sync_gexp_adj(x_trans, reactome)
    layer_ids = [1, 2]

elif "ivae_random" in model_kind:
    # The random layer borrows its shape, size and index from the Reactome adjacency.
    reactome = get_reactome_adj()
    n_encoding_layers = 2
    # The other models do not set this because the default is already None.
    # NOTE(review): nothing in this notebook reads n_genes after this point
    # (the data were already loaded with n_genes=None) - confirm it is still needed.
    n_genes = 3000
    # get_random_adj is called with a fixed seed; snapshot the global NumPy RNG
    # state and restore it afterwards (even on failure) so the rest of the
    # notebook sees the same random stream as before the call.
    state = np.random.get_state()
    try:
        random_layer, random_layer_names = get_random_adj(
            frac,
            shape=reactome.shape,
            size=reactome.size,
            index=reactome.index,
            seed=0,
        )
    finally:
        np.random.set_state(state)
    x_trans, random_layer = sync_gexp_adj(x_trans, random_layer)
    layer_ids = [1, 2]

else:
    raise NotImplementedError(f"{model_kind} not implemented yet.")

In [85]:
print(f"{debug=} {model_kind=}")

# Metadata columns stored alongside the encodings; they are dropped before any
# scoring so that only the encoding columns remain.
non_layer_names = ["split", "layer", "seed", "cell_type", "condition", "model"]

# Gather the per-seed metric tables into a single frame, persist it next to the
# per-seed files, and preview the first rows.
scores_metrics = pd.concat(
    [
        pd.read_pickle(results_path_model / f"metrics-seed-{seed:02d}.pkl")
        for seed in seeds
    ],
    axis=0,
    ignore_index=True,
)
scores_metrics.to_pickle(results_path_model / "scores_metrics.pkl")
scores_metrics.head()

debug=False model_kind='ivae_random'


Unnamed: 0,seed,metric,split,score,model
0,0,loss,train,47.510967,ivae_random
1,0,mse,train,0.003121,ivae_random
2,0,loss,val,47.599075,ivae_random
3,0,mse,val,0.003128,ivae_random
4,0,loss,test,47.534157,ivae_random


In [86]:
# Agreement of the encoding importances across seeds.
# scores_informed[layer_id][split] holds one weighted-tau correlation for every
# unordered pair of seeds, computed on that layer's importances for that split.
scores_informed = {}

for layer_id in layer_ids:
    encoding_files = [
        results_path_model.joinpath(
            f"encodings_layer-{layer_id:02d}_seed-{seed:02d}.pkl"
        )
        for seed in seeds
    ]
    # Skip layers whose encodings were never exported. The probe uses the first
    # configured seed rather than a hard-coded "seed-00", so it also works when
    # the seed range does not start at 0.
    if not encoding_files or not encoding_files[0].exists():
        continue
    results_layer = [pd.read_pickle(path) for path in encoding_files]

    scores_informed[layer_id] = {}
    for split in ["train", "test", "val"]:
        # One importance vector per seed, computed once and reused for every pair.
        importances = [
            get_importances(
                data=x.loc[x["split"] == split].drop(non_layer_names, axis=1),
                abs=True,
            )
            for x in results_layer
        ]
        # Pair by list position, not by seed value: `importances` is ordered like
        # `seeds`, so indexing it with a raw seed value is only correct when the
        # seeds happen to be 0..N-1.
        scores_informed[layer_id][split] = [
            weightedtau(importances[i], importances[j])[0]
            for i in range(len(importances))
            for j in range(i + 1, len(importances))
        ]


In [87]:
# Flatten the nested {layer: {split: [scores]}} dict into a long table with one
# row per score, tag it with the model name, and save it.
# NOTE: this rebinds `scores_informed` from a dict to a DataFrame, so the cell
# that builds the dict must be re-run before re-running this one.
scores_informed = (
    pd.DataFrame.from_dict(scores_informed)
    .melt(var_name="layer", value_name="score", ignore_index=False)
    .reset_index(names=["split"])
    .explode("score")
    .assign(
        score=lambda table: table["score"].astype("float"),
        model=model_kind,
    )
)
scores_informed.to_pickle(results_path_model / "scores_informed.pkl")

# Show where the table was written.
results_path_model / "scores_informed.pkl"

PosixPath('path/ivae_random-0.25/scores_informed.pkl')

In [88]:
# Preview the long-format seed-agreement scores.
scores_informed.head()

Unnamed: 0,split,layer,score,model
0,train,1,-0.017844,ivae_random
0,train,1,-0.008756,ivae_random
0,train,1,0.009547,ivae_random
1,test,1,-0.017664,ivae_random
1,test,1,-0.00116,ivae_random


In [89]:
# Clustering quality of the control-cell encodings.
# For every layer and seed, a MiniBatchKMeans model (one cluster per cell type
# seen in the training split) is fitted on the training encodings; its labels or
# predictions are then compared with the true cell types via adjusted mutual
# information. clust_scores[layer_id][split] collects one score per seed.
clust_scores = {}

# Loop-invariant: depends only on the number of CPU cores of this machine.
kmeans_batch_size = 256 * cpu_count() + 1

for layer_id in layer_ids:
    # Read the files of the configured seeds (not range(N_ITERS)), so the seeds
    # analysed here match the ones used for the other scores.
    encoding_files = [
        results_path_model.joinpath(
            f"encodings_layer-{layer_id:02d}_seed-{seed:02d}.pkl"
        )
        for seed in seeds
    ]
    # Skip layers whose encodings were never exported (probe the first seed).
    if not encoding_files or not encoding_files[0].exists():
        continue
    results_layer = [pd.read_pickle(path) for path in encoding_files]

    clust_scores[layer_id] = {"train": [], "val": [], "test": []}

    for seed, encodings in zip(seeds, results_layer):
        # Only control cells are clustered.
        control = encodings.loc[encodings["condition"] == "control"]

        labels = {}
        features = {}
        for split in ("train", "val", "test"):
            split_rows = control.loc[control["split"] == split]
            labels[split] = split_rows["cell_type"]
            features[split] = split_rows.drop(non_layer_names, axis=1)

        # Seed the clustering with the run's seed so the scores are reproducible
        # across notebook executions.
        model = MiniBatchKMeans(
            n_clusters=labels["train"].nunique(),
            batch_size=kmeans_batch_size,
            random_state=seed,
        )
        model.fit(features["train"])

        # Train uses the labels assigned during fitting; val/test are predicted.
        clust_scores[layer_id]["train"].append(
            adjusted_mutual_info_score(labels["train"], model.labels_)
        )
        for split in ("val", "test"):
            clust_scores[layer_id][split].append(
                adjusted_mutual_info_score(
                    labels[split], model.predict(features[split])
                )
            )

In [90]:
# Flatten the nested {layer: {split: [scores]}} dict into a long table with one
# row per score, tag it with the model name, and save it.
# NOTE: this rebinds `clust_scores` from a dict to a DataFrame, so the cell that
# builds the dict must be re-run before re-running this one.
clust_scores = (
    pd.DataFrame.from_dict(clust_scores)
    .melt(var_name="layer", value_name="score", ignore_index=False)
    .reset_index(names=["split"])
    .explode("score")
)
clust_scores["score"] = clust_scores["score"].astype("float")
clust_scores["model"] = model_kind
clust_scores.to_pickle(results_path_model.joinpath("scores_clustering.pkl"))

# Show where the table was written. As the cell's last expression it is actually
# displayed; as the first statement its value was silently discarded.
results_path_model.joinpath("scores_clustering.pkl")

In [91]:
# Preview the long-format clustering scores.
clust_scores.head()

Unnamed: 0,split,layer,score,model
0,train,1,0.665108,ivae_random
0,train,1,0.668041,ivae_random
0,train,1,0.617996,ivae_random
1,val,1,0.665219,ivae_random
1,val,1,0.678493,ivae_random
