In [7]:
%load_ext autoreload
%autoreload 2

from pathlib import Path

import dotenv
import os
import matplotlib.pyplot as plt
import scanpy as sc
import tensorflow as tf
import pandas as pd

from keras import callbacks
from keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import minmax_scale

from isrobust_TFG.bio import get_reactome_adj, sync_gexp_adj
from isrobust_TFG.datasets import load_kang
from isrobust_TFG.utils import set_all_seeds
from isrobust_TFG.CI_VAE_CLASS import InformedVAE

# Project root = directory containing the .env file located by python-dotenv.
# NOTE(review): find_dotenv() returns an empty string when no .env is found,
# which would make the root resolve to the current directory — confirm that
# a .env file is always present.
project_path = Path(dotenv.find_dotenv()).parent
results_path = project_path / "results"
data_path = project_path / "data"
figs_path = results_path / "figs"
tables_path = results_path / "tables"

# Create every output/input directory up front; existing directories are fine.
for required_dir in (results_path, data_path, figs_path, tables_path):
    required_dir.mkdir(exist_ok=True, parents=True)

# Ask TensorFlow for deterministic op implementations.
tf.config.experimental.enable_op_determinism()

# Scanpy plotting/logging defaults used by every figure below.
sc.set_figure_params(dpi=300, color_map="inferno")
sc.settings.verbosity = 1
sc.logging.print_header()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
scanpy==1.10.4 anndata==0.11.3 umap==0.5.7 numpy==1.26.4 scipy==1.15.1 pandas==2.2.3 scikit-learn==1.6.1 statsmodels==0.14.4 igraph==0.10.8 pynndescent==0.5.13


In [8]:
# Sanity check: print the directory that holds the .env file. This is the same
# directory the setup cell uses as `project_path`.
# NOTE(review): the printed value is an absolute local path; consider clearing
# this output before sharing the notebook.
env_path = dotenv.find_dotenv()
dir_path = os.path.dirname(env_path)

print(dir_path)


/home/sfernandez/TFG/robustness_informed_TFG


In [9]:
# Load the Kang dataset through the project helper, reading from `data_path`
# with normalization enabled and n_genes=4000 (presumably the number of genes
# retained — TODO confirm against load_kang).
adata = load_kang(data_folder=data_path, normalize=True, n_genes=4000)

  adata.obs["label"] = adata.obs["label"].replace(


In [10]:
# Expression matrix as a DataFrame (assuming `adata` is an AnnData: rows are
# observations, columns are variables/genes).
x_trans = adata.to_df()

In [11]:
# Reactome adjacency table from the project helper: rows are gene symbols and
# columns are REACTOME_* gene sets (see the preview below).
reactome = get_reactome_adj()
reactome.head()

geneset,REACTOME_2_LTR_CIRCLE_FORMATION,REACTOME_ABACAVIR_METABOLISM,REACTOME_ABACAVIR_TRANSMEMBRANE_TRANSPORT,REACTOME_ABACAVIR_TRANSPORT_AND_METABOLISM,REACTOME_ABC_FAMILY_PROTEINS_MEDIATED_TRANSPORT,REACTOME_ABC_TRANSPORTERS_IN_LIPID_HOMEOSTASIS,REACTOME_ABC_TRANSPORTER_DISORDERS,REACTOME_ABERRANT_REGULATION_OF_MITOTIC_EXIT_IN_CANCER_DUE_TO_RB1_DEFECTS,REACTOME_ABERRANT_REGULATION_OF_MITOTIC_G1_S_TRANSITION_IN_CANCER_DUE_TO_RB1_DEFECTS,REACTOME_ABORTIVE_ELONGATION_OF_HIV_1_TRANSCRIPT_IN_THE_ABSENCE_OF_TAT,...,REACTOME_WNT5A_DEPENDENT_INTERNALIZATION_OF_FZD2_FZD5_AND_ROR2,REACTOME_WNT5A_DEPENDENT_INTERNALIZATION_OF_FZD4,REACTOME_WNT_LIGAND_BIOGENESIS_AND_TRAFFICKING,REACTOME_WNT_MEDIATED_ACTIVATION_OF_DVL,REACTOME_XENOBIOTICS,REACTOME_YAP1_AND_WWTR1_TAZ_STIMULATED_GENE_EXPRESSION,REACTOME_ZBP1_DAI_MEDIATED_INDUCTION_OF_TYPE_I_IFNS,REACTOME_ZINC_EFFLUX_AND_COMPARTMENTALIZATION_BY_THE_SLC30_FAMILY,REACTOME_ZINC_INFLUX_INTO_CELLS_BY_THE_SLC39_GENE_FAMILY,REACTOME_ZINC_TRANSPORTERS
genesymbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1CF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A4GNT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAAS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Align the expression table and the adjacency table with the project helper;
# afterwards the expression columns and adjacency rows have the same length
# (checked by the shape cell that follows).
x_trans, reactome = sync_gexp_adj(x_trans, reactome)

In [13]:
# Sanity check: number of expression columns must equal number of adjacency rows.
x_trans.shape, reactome.shape

((24673, 2160), (2160, 1615))

In [14]:
# Snapshot of the per-cell annotations; reused later to label the embeddings.
obs = adata.obs.copy()

# Min-max scale every column, then hold out 33% of the rows, stratified by
# cell type, and cast both partitions to float32 for the network.
# NOTE(review): scaling is done on the full table before splitting, so the
# held-out rows contribute to each column's min/max — confirm this is intended.
x_scaled = x_trans.apply(minmax_scale)
x_train, x_test = (
    partition.astype("float32")
    for partition in train_test_split(
        x_scaled,
        test_size=0.33,
        stratify=obs["cell_type"],
        random_state=42,
    )
)

In [15]:
# Sanity check: training columns must still match the adjacency rows after the split.
x_train.shape, reactome.shape

((16530, 2160), (2160, 1615))

In [16]:
# Build the informed VAE from the Reactome adjacency table.
model = InformedVAE(
    adjacency_matrices=reactome,
    adjacency_names="reactome",
    adjacency_activation="tanh",
    seed=42,
)
model._build_vae()

batch_size = 32

# NOTE(review): patience (100) equals the number of epochs (100), so this
# callback can never stop the run early — confirm the intended patience.
callback = callbacks.EarlyStopping(
    monitor="val_loss", min_delta=1e-1, patience=100, verbose=0
)

# Autoencoder-style fit: the input is also the target.
# NOTE(review): the held-out split doubles as validation data here.
fit_options = dict(
    shuffle=True,
    verbose=1,
    epochs=100,
    batch_size=batch_size,
    callbacks=[callback],
    validation_data=(x_test, x_test),
)
history = model.fit(x_train, x_train, **fit_options)

W0000 00:00:1738833931.868129 1504912 gpu_device.cc:2344] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


Epoch 1/100


2025-02-06 09:25:32.453942: E tensorflow/core/framework/node_def_util.cc:676] NodeDef mentions attribute use_unbounded_threadpool which is not in the op definition: Op<name=MapDataset; signature=input_dataset:variant, other_arguments: -> handle:variant; attr=f:func; attr=Targuments:list(type),min=0; attr=output_types:list(type),min=1; attr=output_shapes:list(shape),min=1; attr=use_inter_op_parallelism:bool,default=true; attr=preserve_cardinality:bool,default=false; attr=force_synchronous:bool,default=false; attr=metadata:string,default=""> This may be expected if your graph generating binary is newer  than this binary. Unknown attributes will be ignored. NodeDef: {{node ParallelMapDatasetV2/_15}}


[1m517/517[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - loss: 24.0822 - mse: 0.0111

2025-02-06 09:25:55.340164: E tensorflow/core/framework/node_def_util.cc:676] NodeDef mentions attribute use_unbounded_threadpool which is not in the op definition: Op<name=MapDataset; signature=input_dataset:variant, other_arguments: -> handle:variant; attr=f:func; attr=Targuments:list(type),min=0; attr=output_types:list(type),min=1; attr=output_shapes:list(shape),min=1; attr=use_inter_op_parallelism:bool,default=true; attr=preserve_cardinality:bool,default=false; attr=force_synchronous:bool,default=false; attr=metadata:string,default=""> This may be expected if your graph generating binary is newer  than this binary. Unknown attributes will be ignored. NodeDef: {{node ParallelMapDatasetV2/_15}}


[1m517/517[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 45ms/step - loss: 24.0791 - mse: 0.0111 - val_loss: 19.3501 - val_mse: 0.0088
Epoch 2/100
[1m517/517[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 44ms/step - loss: 18.6099 - mse: 0.0085 - val_loss: 16.8375 - val_mse: 0.0077
Epoch 3/100
[1m517/517[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 43ms/step - loss: 16.3758 - mse: 0.0075 - val_loss: 15.1081 - val_mse: 0.0069
Epoch 4/100
[1m517/517[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 45ms/step - loss: 14.7158 - mse: 0.0067 - val_loss: 13.6397 - val_mse: 0.0062
Epoch 5/100
[1m517/517[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 47ms/step - loss: 13.3415 - mse: 0.0061 - val_loss: 12.4681 - val_mse: 0.0057
Epoch 6/100
[1m517/517[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 45ms/step - loss: 12.2303 - mse: 0.0055 - val_loss: 11.6037 - val_mse: 0.0053
Epoch 7/100
[1m428/517[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1

KeyboardInterrupt: 

In [None]:
# Handles to the trained sub-models. Only `encoder` is used by the cells
# visible below; `decoder` is kept for convenience.
encoder = model.encoder
decoder = model.decoder

In [None]:
# Encode the training cells and keep the first encoder output as the embedding
# (presumably the latent mean — TODO confirm against InformedVAE).
encoder_outputs = encoder.predict(x_train, batch_size=batch_size)
x_train_encoded = pd.DataFrame(encoder_outputs[0], index=x_train.index)

# NOTE: `adata` is rebound here; from this point on it holds the embedding of
# the training cells, not the dataset returned by load_kang.
adata = sc.AnnData(X=x_train_encoded)
adata.obs = obs.loc[x_train.index]

# Neighbour graph, Leiden clustering and UMAP computed directly on the embedding.
sc.pp.neighbors(adata, use_rep="X", random_state=42)
sc.tl.leiden(adata, random_state=42)
sc.tl.umap(adata, random_state=42)

In [None]:
# Auxiliary model that shares the encoder's input but returns the output of
# every encoder layer, so intermediate activations can be read in one predict call.
layer_outputs = [layer.output for layer in encoder.layers]
activation_model = Model(inputs=encoder.input, outputs=layer_outputs)

In [None]:
# Index (within the encoder's layer list) of the layer whose activations are
# read as pathway activities; its outputs are labelled below with the
# adjacency table's column names.
layer_id = 1

# Readable pathway labels: drop the "REACTOME_" prefix, underscores become spaces.
entitie_names = reactome.columns.str.replace("REACTOME_", "").str.replace("_", " ")

# NOTE: rebinds `x_train_encoded`, which previously held the latent embedding.
per_layer_activations = activation_model.predict(x_train, batch_size=batch_size)
x_train_encoded = pd.DataFrame(
    per_layer_activations[layer_id],
    index=x_train.index,
    columns=entitie_names,
)

# AnnData over the absolute activations, annotated with the training cells' metadata.
entities_adata = sc.AnnData(X=x_train_encoded.abs())
entities_adata.obs = obs.loc[x_train.index]

sc.pp.neighbors(entities_adata, use_rep="X", random_state=42)
sc.tl.leiden(entities_adata, random_state=42)
sc.tl.umap(entities_adata, random_state=42)

In [None]:
# Differential pathway-activity test between conditions, ranking every pathway
# for the "stimulated" cells against the "control" cells.
method = "wilcoxon"
sc.tl.rank_genes_groups(
    entities_adata,
    "condition",
    # Fixed spelling: the misspelled keyword was absorbed by **kwds and ignored,
    # so the comparison silently used the default reference ("rest").
    reference="control",
    key_added=method,
    method=method,
)

# Results are stored under the key given by `key_added`.
result = entities_adata.uns[method]
groups = ["stimulated"]
# One column per (group, statistic) pair, e.g. "stimulated_names".
dacs = pd.DataFrame(
    {
        f"{group}_{key}": result[key][group]
        for group in groups
        for key in ["names", "scores", "pvals", "pvals_adj", "logfoldchanges"]
    }
)

dacs.head(10)

In [None]:
# Build a presentation copy of the ten top-ranked rows and export it to LaTeX.
dacs_to_write = dacs.head(10).copy()
dacs_to_write.columns = dacs_to_write.columns.str.replace("stimulated_", "")
dacs_to_write = dacs_to_write.rename(columns={"names": "pathways"})
dacs_to_write["pathways"] = dacs_to_write["pathways"].str.title()
dacs_to_write = dacs_to_write.drop("pvals", axis=1)

# Title-casing mangles acronyms; restore them. Replacements are applied in
# this order as literal substrings (regex=False is explicit so the backslashes
# in the LaTeX replacement are never interpreted as regex escapes).
# NOTE(review): "IFIH1 " carries a trailing space in the original — confirm
# whether that is intentional.
acronym_fixes = {
    "Alpha Beta": r"$\alpha, \beta$",
    "Ifn": "IFN",
    "Dna": "DNA",
    "Oas": "OAS",
    "Ddx58": "DDX58",
    "Ns1": "NS1",
    "Ifih1": "IFIH1 ",
}
for mangled, restored in acronym_fixes.items():
    dacs_to_write["pathways"] = dacs_to_write["pathways"].str.replace(
        mangled, restored, regex=False
    )

# Shorten the one overly long pathway name. Assign through a single .loc call
# on the frame: the previous chained form (`df["pathways"].loc[mask] = ...`)
# writes to an intermediate Series and is not guaranteed to update the frame.
long_name_query = dacs_to_write["pathways"].str.contains(
    "Post Translational Modification Synthesis Of G", case=False
)
dacs_to_write.loc[long_name_query, "pathways"] = "Post Translational Modification..."

# NOTE(review): escape=True also escapes the "$\alpha, \beta$" inserted above,
# so it will appear literally in the .tex file — confirm that is the intent.
dacs_to_write.to_latex(
    tables_path.joinpath("ivae_scorer_reactome.tex"),
    float_format="%.2f",
    index=False,
    escape=True,
)

In [None]:
# Display the formatted top-10 table.
dacs_to_write

In [None]:
# Unformatted names of the ten top-ranked pathways for the stimulated group;
# these match the column labels of the pathway-activity table.
dacs_top = dacs["stimulated_names"].head(10)
dacs_top

In [None]:
# Store the absolute pathway activations alongside the latent-space AnnData.
adata.obsm["pathways"] = x_train_encoded.abs()
# Copy the top-10 pathway activities into obs under their formatted names.
# This relies on `dacs_to_write.pathways` and `dacs_top` being in the same
# order; both are derived from the first 10 rows of `dacs`.
adata.obs[dacs_to_write.pathways] = adata.obsm["pathways"][dacs_top]

In [None]:
# UMAP of the latent space coloured by condition, cell type and the top-10
# pathway activities; saved as both PDF and 300-dpi PNG.
umap_colors = ["condition", "cell_type", *dacs_to_write.pathways.tolist()]
sc.pl.umap(adata, color=umap_colors, frameon=False, ncols=2, wspace=0.3, show=False)

pdf_target = figs_path.joinpath("ivae_scorer_reactome_ifn_top10.pdf")
png_target = figs_path.joinpath("ivae_scorer_reactome_ifn_top10.png")
plt.savefig(pdf_target, bbox_inches="tight")
plt.savefig(png_target, bbox_inches="tight", dpi=300)

In [None]:
# Spanish-labelled annotation columns for the figures below.
# These are added as new columns instead of renaming in place, so the original
# "condition" / "cell_type" columns survive and earlier cells that colour by
# them can still be re-run.
adata.obs["Estado"] = adata.obs["condition"].str.replace("stimulated", "estimulada")
adata.obs["Tipo de célula"] = adata.obs["cell_type"]

In [None]:
# UMAP of the latent space coloured by the Spanish-labelled condition and cell
# type columns; saved as both PDF and 300-dpi PNG.
label_columns = ["Estado", "Tipo de célula"]
sc.pl.umap(adata, color=label_columns, frameon=False, ncols=2, wspace=0.3, show=False)

pdf_target = figs_path.joinpath("ivae_scorer_reactome.pdf")
png_target = figs_path.joinpath("ivae_scorer_reactome.png")
plt.savefig(pdf_target, bbox_inches="tight")
plt.savefig(png_target, bbox_inches="tight", dpi=300)

In [None]:
# UMAP of the latent space coloured by each of the top-10 pathway activities;
# saved as both PDF and 300-dpi PNG.
pathway_columns = dacs_to_write.pathways.tolist()
sc.pl.umap(adata, color=pathway_columns, frameon=False, ncols=2, wspace=0.3, show=False)

pdf_target = figs_path.joinpath("ivae_scorer_reactome_top10.pdf")
png_target = figs_path.joinpath("ivae_scorer_reactome_top10.png")
plt.savefig(pdf_target, bbox_inches="tight")
plt.savefig(png_target, bbox_inches="tight", dpi=300)