In [None]:
import os
#os.chdir('../../TLS_data/')

In [None]:
from sklearn.metrics import auc
from tqdm import tqdm
import plotnine as p9
import pandas as pd
import numpy as np
import yaml
import shutil
#from plotnine_prism import *
import sys
sys.path.append('../')
from src.utils import load_data
from src.utils import bootstrapping
from src.utils import compute_pearson_top_n
from src.utils import compute_area_under_pearson_top_n

In [None]:
# Root directory of the benchmark outputs: the gene table, the per-model evaluation
# files and the CSVs exported below are all addressed relative to this folder.
out_folder = "out_benchmark"

In [None]:
# Parse the dataset configuration and keep the two settings used further down:
# the models to iterate over and the top-n value used to filter the final summary.
with open("config_dataset.yaml") as config_file:
    config_dataset = yaml.safe_load(config_file)

models = config_dataset["MODEL"]
top_n_genes_to_evaluate = config_dataset["top_n_genes_to_evaluate"]

# Echo both settings as the cell output
(models, top_n_genes_to_evaluate)

In [None]:
# Gene table written by an earlier step; `isPredicted` is used as a row mask
# to keep only the genes that were predicted.
genes = pd.read_csv(f"{out_folder}/info_highly_variable_genes.csv")
selected_genes_bool = genes["isPredicted"].to_numpy()
genes_predict = genes[selected_genes_bool]
genes_predict

In [None]:
# For every model, read `top_model_per_test_sample.csv` and load the per-gene test
# score file of the entry listed there for each test sample.
score_frames = []
for model in models:
    top_model = pd.read_csv(f"{out_folder}/evaluation/{model}/top_model_per_test_sample.csv")
    # test sample -> value of the `model` column, used as the score file's base name
    model_dict = top_model.set_index("test_sample")["model"].to_dict()

    for test, selected_run in model_dict.items():
        file = f"{out_folder}/evaluation/{test}/gene_scores/test/{model}/{selected_run}.csv"
        score = pd.read_csv(file).set_index("index")
        # Requires exactly one remaining column; it is relabelled as the Pearson score
        score.columns = ["pearson"]
        score = score.assign(test_sample=test, model=model)
        score_frames.append(score)

# Stack everything, copy the index (the files' `index` column) into a `gene` column
# and switch to a plain positional index.
scores = (
    pd.concat(score_frames)
    .assign(gene=lambda frame: frame.index)
    .reset_index(drop=True)
)
scores = compute_pearson_top_n(scores, "model", genes_predict)
scores

In [None]:
# Mean Pearson score across all rows sharing a (gene, model, top_n) key
df_plot = (
    scores
    .groupby(["gene", "model", "top_n"])["pearson"]
    .mean()
    .reset_index()
)

# Density of those means: one curve per model, one stacked panel per top_n value
g = (
    p9.ggplot(df_plot, p9.aes("pearson", color="model"))
    + p9.geom_density()
    + p9.facet_wrap("~top_n", ncol=1, scales="free_y")
    + p9.theme_bw()
    + p9.theme(figure_size=(8, 12))
    # + scale_color_prism(palette = "colors")
)
# Uncomment to export the figure:
# g.save(f"{out_folder}/evaluation/pearson_score_test_distribution.png", dpi=300)
g

In [None]:
# Per-gene mean Pearson score for every (model, top_n) combination;
# this table feeds the AUC computation and the bootstrap further down.
tab = (
    scores
    .groupby(["gene", "model", "top_n"])["pearson"]
    .mean()
    .reset_index()
)
tab

In [None]:
# Area under the Pearson-vs-top_n curve per model (computed by the project helper)
auc_scores = compute_area_under_pearson_top_n(tab, "model", "pearson")

# Turn `model` into a categorical whose category order follows increasing mean AUC,
# so plots list the models in that order.
models_by_auc = auc_scores.sort_values("auc_mean", ascending=True)["model"]
auc_scores["model"] = pd.Categorical(auc_scores["model"], categories=models_by_auc)

# model -> {"auc_mean": ..., "auc_std": ...}, looked up later when building legend labels
auc_scores_dict = (
    auc_scores
    .set_index("model")[["auc_mean", "auc_std"]]
    .to_dict(orient="index")
)
auc_scores

In [None]:
# Persist the per-model AUC summary next to the other evaluation outputs
auc_scores_path = f"{out_folder}/evaluation/auc_scores.csv"
auc_scores.to_csv(auc_scores_path, index=False)

In [None]:
position_dodge_width = 0.5

# Point = mean AUC per model, error bar = mean +/- std; axes are flipped so the
# model names run along the vertical axis.
auc_point_aes = p9.aes("model", "auc_mean", color="model", group='model')
auc_errorbar_aes = p9.aes(
    x="model",
    ymin="auc_mean-auc_std",
    ymax="auc_mean+auc_std",
)

g = (
    p9.ggplot(auc_scores, auc_point_aes)
    + p9.geom_point(position=p9.position_dodge(width=position_dodge_width))
    + p9.geom_errorbar(
        auc_errorbar_aes,
        alpha=1,
        size=0.5,
        width=0.2,
        position=p9.position_dodge(width=position_dodge_width),
    )
    # + scale_color_prism(palette = "colors")
    + p9.ylab("AU under pearson's highly variable genes curve")
    + p9.xlab("Model")
    + p9.ggtitle("Test set")
    + p9.coord_flip()
    + p9.theme_bw()
    + p9.theme(legend_position = "none")
)
# Uncomment to export the figure:
# g.save(f"{out_folder}/evaluation/pearson_score_top_top_n.png", dpi=300)
g

In [None]:
# Rebuild the per-gene mean table from `scores` rather than reading `tab`.
# This cell rebinds `tab` to the bootstrapped summary, so using `tab` as its own
# input made a second execution bootstrap the summaries instead of the gene-level
# scores. Starting from `scores` keeps the cell re-runnable with the same result.
gene_means = scores.groupby(["gene", "model", "top_n"]).pearson.agg("mean").reset_index()

# One bootstrap summary per (model, top_n) group.
# NOTE(review): `bootstrapping` is assumed to return a (median, std) pair, as the
# column names below suggest - confirm against src.utils.
tab = gene_means.groupby(["model", "top_n"]).pearson.apply(bootstrapping).reset_index()

# Unpack the two-value summaries into separate columns and re-attach the group keys
# (both frames carry a fresh 0..n-1 index, so the assignments align row by row).
df_plot = pd.DataFrame(tab["pearson"].to_list(), columns=['pearson_median', 'pearson_std'])
df_plot["model"] = tab.model
df_plot["top_n"] = tab.top_n
df_plot

In [None]:
# Order the models by increasing mean AUC so the legend and colours follow that ranking
df_plot["model"] = pd.Categorical(df_plot["model"], auc_scores.sort_values("auc_mean", ascending=True).model)

# Legend label "<model>: <auc_mean>±<auc_std>". The plus-minus sign is written as the
# \u00b1 escape: the literal character had been mis-decoded into "Â±" in this file,
# and the escape survives any future encoding round-trip.
df_plot["model_AUC"] = df_plot.model.apply(
    lambda x: f"{x}: {auc_scores_dict[x]['auc_mean']:.2f}\u00b1{auc_scores_dict[x]['auc_std']:.2f}"
)

In [None]:
# Persist the bootstrapped Pearson summary for every (model, top_n) pair
pearson_variation_path = f"{out_folder}/evaluation/pearson_variation.csv"
df_plot.to_csv(pearson_variation_path, index=False)

In [None]:
position_dodge_width = 0.5

# Make `top_n` a string categorical whose category order is the sorted order of the
# original values, so the x axis is discrete but keeps that order.
top_n_levels = df_plot["top_n"].drop_duplicates().sort_values().astype(str)
df_plot["top_n"] = pd.Categorical(df_plot["top_n"].astype(str), categories=top_n_levels)

# One dashed line per model (labelled with its AUC); error bars are median +/- std
line_aes = p9.aes("top_n", "pearson_median", color="model_AUC", group='model_AUC')
errorbar_aes = p9.aes(
    x="top_n",
    ymin="pearson_median-pearson_std",
    ymax="pearson_median+pearson_std",
)

g = (
    p9.ggplot(df_plot, line_aes)
    + p9.geom_line(
        linetype="dashed",
        alpha=0.8,
        position=p9.position_dodge(width=position_dodge_width),
    )
    + p9.geom_point(position=p9.position_dodge(width=position_dodge_width))
    + p9.geom_errorbar(
        errorbar_aes,
        alpha=0.5,
        size=0.3,
        width=1,
        position=p9.position_dodge(width=position_dodge_width),
    )
    # + scale_color_prism(palette = "colors")
    + p9.ylab("Pearson correlation")
    + p9.xlab("Top highly variable genes")
    + p9.theme_bw()
    + p9.theme(figure_size=(16, 8))
)
# Uncomment to export the figure:
# g.save(f"{out_folder}/evaluation/pearson_score_per_top_n.png", dpi=300)
g

In [None]:
# Keep only the rows for the configured top_n value. The explicit .copy() makes the
# result an independent frame, so the column assignments below no longer write to a
# slice of the previous `df_plot` (which raised SettingWithCopyWarning).
is_selected_top_n = df_plot.top_n.astype(int) == top_n_genes_to_evaluate
df_plot = df_plot[is_selected_top_n].copy()

# Re-rank the models for this subset: plain strings first (drops the earlier
# categorical ordering), then categories ordered by increasing median Pearson.
df_plot["model"] = df_plot["model"].astype(str)
model_order = df_plot.groupby("model").pearson_median.agg('median').sort_values(ascending=True).index
df_plot["model"] = pd.Categorical(df_plot["model"], model_order)

# Point = bootstrapped median per model, error bar = median +/- std, axes flipped.
# `position_dodge_width` is the value set in the preceding plotting cells.
g = (p9.ggplot(df_plot, p9.aes("model", "pearson_median", color="model", group='model'))
 + p9.geom_point(position=p9.position_dodge(width=position_dodge_width))
 + p9.theme_bw()
 + p9.geom_errorbar(p9.aes(x="model", ymin="pearson_median-pearson_std",
                           ymax="pearson_median+pearson_std"),
                    alpha=1, size=0.5, width=0.2, position=p9.position_dodge(width=position_dodge_width))
# + scale_color_prism(palette = "colors")
 + p9.ylab("Pearson correlation")
 + p9.xlab("Model")
 + p9.ggtitle("Test set")
 + p9.coord_flip()
 + p9.theme(legend_position = "none")
)
# NOTE: the commented-out save for the AUC plot uses this same file name; pick a
# distinct name before enabling both, or one figure will overwrite the other.
#g.save(f"{out_folder}/evaluation/pearson_score_top_top_n.png", dpi=300)
g

In [None]:
# Persist the summary restricted to the configured top_n value
pearson_top_n_path = f"{out_folder}/evaluation/pearson_scores_top_n_{top_n_genes_to_evaluate}.csv"
df_plot.to_csv(pearson_top_n_path, index=False)