In [None]:
import os
#os.chdir('../../USZ/')

In [None]:
from sklearn.metrics import auc
from tqdm import tqdm
import plotnine as p9
import pandas as pd
import numpy as np
import glob
import yaml
import shutil
from joblib import Parallel, delayed
#from plotnine_prism import *
import sys
sys.path.append('../')
from src.utils import load_data
from src.utils import bootstrapping
from src.utils import compute_pearson_top_n
from src.utils import compute_area_under_pearson_top_n
import multiprocessing

In [None]:
# Root directory of the benchmark outputs that this notebook reads from and writes to.
out_folder = 'out_benchmark'
# Name of the model sub-folder whose predictions are scored below.
model = 'LinearRegression'

In [None]:
# Dataset-level settings: the list of samples and the gene-rank cutoff used for evaluation.
with open("config_dataset.yaml", mode="r") as dataset_config_file:
    config_dataset = yaml.safe_load(dataset_config_file)

# De-duplicated sample identifiers listed under the SAMPLE key.
all_samples = set(config_dataset["SAMPLE"])
# Rank cutoff (cast to int) applied to `variances_norm_rank` further down.
top_n_genes_to_evaluate = int(config_dataset["top_n_genes_to_evaluate"])
top_n_genes_to_evaluate

In [None]:
# Gene metadata table. The `isPredicted` column is used as a mask: here to keep
# the matching rows, and later to pick the matching columns of the expression matrix.
genes = pd.read_csv(f"{out_folder}/info_highly_variable_genes.csv")
selected_genes_bool = genes["isPredicted"].values
genes_predict = genes[selected_genes_bool]
genes_predict

In [None]:
# Keep the predicted genes whose normalised-variance rank is within the evaluation cutoff.
selected_genes_evaluate = genes_predict["variances_norm_rank"].le(top_n_genes_to_evaluate)
genes_evaluate = genes_predict.loc[selected_genes_evaluate]
genes_evaluate

In [None]:
# Cross-validation layout: one entry per fold, each with "training",
# "validation" and "test" sample lists (as read by the per-fold scoring function).
with open("cross_validation_config.yaml", mode="r") as cv_config_file:
    cross_validation_config = yaml.safe_load(cv_config_file)
cross_validation_config.keys()

In [None]:
def load_predictions(validation_predictions, val_barcode, test_barcode, genes):
    """Yield aligned validation/test prediction tables, one pair per parameter setting.

    For every ``<param>_validation.pkl`` path the sibling ``<param>_test.pkl``
    in the same directory is loaded as well. Only the file name is rewritten
    to find the sibling, so a directory component that happens to contain
    ``_validation`` does not corrupt the test path or the parameter name.

    Parameters
    ----------
    validation_predictions : iterable of str
        Paths of the pickled validation predictions.
    val_barcode, test_barcode : list-like
        Row labels selected (in the given order) from the validation and test
        tables respectively.
    genes : list-like
        Column labels kept (in the given order) in both tables.

    Yields
    ------
    tuple
        ``(val_pred, test_pred, param_name)`` where ``param_name`` is the
        validation file name without its ``_validation.pkl`` suffix.
    """
    validation_suffix = "_validation.pkl"
    for val_path in validation_predictions:
        folder, file_name = os.path.split(val_path)

        if file_name.endswith(validation_suffix):
            param_name = file_name[: -len(validation_suffix)]
            test_path = os.path.join(folder, f"{param_name}_test.pkl")
        else:
            # Unexpected file name: keep the historical substitution rules, but
            # still restrict them to the file name.
            param_name = file_name.split(".pkl")[0].replace("_validation", "")
            test_path = os.path.join(folder, file_name.replace("_validation", "_test"))

        # NOTE: read_pickle can execute arbitrary code; only load trusted files.
        val_pred = pd.read_pickle(val_path).loc[val_barcode][genes]
        test_pred = pd.read_pickle(test_path).loc[test_barcode][genes]

        yield val_pred, test_pred, param_name

In [None]:
def compute_correlations_per_fold(i, fold, out_folder):
    """Score every parameter setting of one cross-validation fold, gene by gene.

    Loads the observed expression of the fold's validation and test samples,
    correlates it (Pearson, per gene) with each stored prediction and writes
    one CSV per parameter setting to
    ``{out_folder}/evaluation/{test_samples}/gene_scores/{validation|test}/{model}/``.
    The output folders are created when missing.

    Uses the notebook-level names ``cross_validation_config``,
    ``selected_genes_bool``, ``genes_predict`` and ``model``.

    Parameters
    ----------
    i : int
        Position of the fold; not used, kept so existing calls keep working.
    fold : hashable
        Key of the fold in ``cross_validation_config``.
    out_folder : str
        Root folder of the benchmark outputs.

    Returns
    -------
    None
    """
    fold_config = cross_validation_config[fold]
    test_samples = "_".join(fold_config["test"])
    validation_samples = "_".join(fold_config["validation"])

    validation_data = load_data(fold_config["validation"],
                                out_folder,
                                load_image_features=False)
    test_data = load_data(fold_config["test"],
                          out_folder,
                          load_image_features=False)

    # Observed expression restricted to the predicted genes, labelled like the predictions.
    gene_names = genes_predict.gene_name.values
    val_true = pd.DataFrame(validation_data["y"][:, selected_genes_bool],
                            index=validation_data["barcode"],
                            columns=gene_names)
    test_true = pd.DataFrame(test_data["y"][:, selected_genes_bool],
                             index=test_data["barcode"],
                             columns=gene_names)

    validation_predictions = glob.glob(f"{out_folder}/evaluation/{test_samples}/{validation_samples}/{model}/prediction/*_validation.pkl")

    # to_csv does not create missing folders; exist_ok keeps this safe when
    # several workers reach the same folder at once.
    score_folders = {
        split: f"{out_folder}/evaluation/{test_samples}/gene_scores/{split}/{model}"
        for split in ("validation", "test")
    }
    for score_folder in score_folders.values():
        os.makedirs(score_folder, exist_ok=True)

    for val_pred, test_pred, param_name in load_predictions(validation_predictions,
                                                            validation_data["barcode"],
                                                            test_data["barcode"],
                                                            gene_names):
        # Genes whose correlation is undefined (NaN) are scored as 0.
        pearson_score_val = val_true.corrwith(val_pred, method="pearson").fillna(0)
        pearson_score_val.reset_index().to_csv(f"{score_folders['validation']}/{param_name}.csv", index=False)

        pearson_score_test = test_true.corrwith(test_pred, method="pearson").fillna(0)
        pearson_score_test.reset_index().to_csv(f"{score_folders['test']}/{param_name}.csv", index=False)

In [None]:
# Number of joblib workers used for the per-fold scoring.
n_threads = 20
# Lazy wrapper so that each fold becomes one joblib task.
delayed_compute_correlations_per_fold = delayed(compute_correlations_per_fold)

In [None]:
# Score all folds in parallel; tqdm reports how many folds have been dispatched.
Parallel(n_jobs=n_threads)(
    delayed_compute_correlations_per_fold(i, fold, out_folder)
    for i, fold in enumerate(tqdm(cross_validation_config))
)

In [None]:
# Every per-gene score file of this model, over all test samples and both performance sets.
files = glob.glob(
    f"{out_folder}/evaluation/*/gene_scores/*/{model}/*.csv"
)
len(files)

In [None]:
# Gather the per-gene score files into one table with one column per performance set.
# Files are laid out as
#   .../evaluation/<test_sample>/gene_scores/<performance_set>/<model>/<param>.csv
# and the components are read from the END of the path, so an `out_folder`
# with several path components does not shift them.
if not files:
    raise ValueError(
        f"No score files found under {out_folder}/evaluation/*/gene_scores/*/{model}/"
    )

score_frames = []
for file_csv in tqdm(files):
    path_parts = os.path.normpath(file_csv).split(os.sep)
    param = os.path.splitext(path_parts[-1])[0]
    performance_set = path_parts[-3]
    test = path_parts[-5]

    df = pd.read_csv(file_csv)
    df.columns = ["gene", "score"]
    df["performance_set"] = performance_set
    df["test_sample"] = test
    df["model"] = param
    score_frames.append(df)

scores = pd.concat(score_frames)
# One row per (gene, test_sample, model); the performance sets become columns.
scores = scores.pivot_table(columns="performance_set", values="score", index=["gene", "test_sample", "model"]).reset_index()
scores

In [None]:
# Expand the scores per top-n gene set (helper from src.utils), grouped by the "model" column.
score_top_n = compute_pearson_top_n(
    scores,
    "model",
    genes_predict,
)
score_top_n.head()

In [None]:
# Distribution of the validation score (averaged per gene/model/top_n), one panel per top_n.
df_plot = (
    score_top_n
    .groupby(["gene", "model", "top_n"])["validation"]
    .mean()
    .reset_index()
)
g = (
    p9.ggplot(df_plot, p9.aes(x="validation", color="model"))
    + p9.geom_density()
    + p9.facet_wrap("~top_n", ncol=1, scales="free_y")
    + p9.theme_bw()
    + p9.theme(figure_size=(20, 12), legend_position="none")
)
# To export: g.save(f"{out_folder}/evaluation/{model}/pearson_score_val_distribution.png", dpi=300)
g

In [None]:
# Mean validation score per (gene, model, top_n), averaged over the test samples.
tab_val = (
    score_top_n
    .groupby(["gene", "model", "top_n"])["validation"]
    .mean()
    .reset_index()
)
tab_val

In [None]:
auc_scores_val = compute_area_under_pearson_top_n(tab_val, "model", "validation")
# Order the model levels by increasing mean AUC so plots list them in that order.
auc_scores_val["model"] = pd.Categorical(
    auc_scores_val["model"],
    categories=auc_scores_val.sort_values("auc_mean")["model"],
)
# Lookup {model: {"auc_mean": ..., "auc_std": ...}} used for the plot labels.
auc_scores_val_dict = (
    auc_scores_val
    .set_index("model")[["auc_mean", "auc_std"]]
    .to_dict(orient="index")
)
auc_scores_val

In [None]:
# Mean AUC per model with +/- one std error bars (validation set).
position_dodge_width = 0.5
dodge = p9.position_dodge(width=position_dodge_width)
g = (
    p9.ggplot(auc_scores_val, p9.aes(x="model", y="auc_mean", color="model", group="model"))
    + p9.geom_point(position=dodge)
    + p9.theme_bw()
    + p9.geom_errorbar(
        p9.aes(x="model", ymin="auc_mean-auc_std", ymax="auc_mean+auc_std"),
        alpha=1,
        size=0.5,
        width=0.2,
        position=dodge,
    )
    + p9.ylab("AU under pearson's highly variable genes curve")
    + p9.xlab("Model")
    + p9.ggtitle("Validation set")
    + p9.coord_flip()
    + p9.theme(legend_position="none", figure_size=(16, 8))
)
# To export: g.save(f"{out_folder}/evaluation/pearson_score_top_top_n.png", dpi=300)
g

In [None]:
# Bootstrap the per-gene validation scores of every (model, top_n) group.
# The result gets its own name instead of overwriting `tab_val`, so this cell
# (and the AUC cell that reads `tab_val`) can be re-run in any order.
tab_val_bootstrap = (
    tab_val
    .groupby(["model", "top_n"])["validation"]
    .apply(lambda x: bootstrapping(x))
    .reset_index()
)
# NOTE(review): assumes `bootstrapping` returns a (median, std) pair per group -
# confirm against src.utils.
df_plot = pd.DataFrame(tab_val_bootstrap["validation"].to_list(), columns=['pearson_median', 'pearson_std'])
df_plot["model"] = tab_val_bootstrap["model"]
df_plot["top_n"] = tab_val_bootstrap["top_n"]
df_plot

In [None]:
# Model levels ordered by decreasing mean validation AUC.
df_plot["model"] = pd.Categorical(
    df_plot["model"],
    categories=auc_scores_val.sort_values("auc_mean", ascending=False)["model"],
)
# Label of the form "<model>: <auc_mean>±<auc_std>" with two decimals.
df_plot["model_AUC"] = df_plot["model"].apply(
    lambda name: f"{name}: {auc_scores_val_dict[name]['auc_mean']:.2f}±{auc_scores_val_dict[name]['auc_std']:.2f}"
)

In [None]:
# Keep the six models with the highest median score at the evaluation cutoff.
rows_at_cutoff = df_plot.loc[df_plot["top_n"].astype(int).eq(top_n_genes_to_evaluate)]
top_model_to_plot = (
    rows_at_cutoff
    .groupby("model")["pearson_median"]
    .median()
    .sort_values()
    .tail(6)
    .index.values
)
df_plot = df_plot.query("model in @top_model_to_plot")
top_model_to_plot

In [None]:
# Bootstrapped Pearson score per top-n gene set for the selected models (validation set).
position_dodge_width = 0.5
dodge = p9.position_dodge(width=position_dodge_width)

# Turn top_n into ordered string levels for the discrete x axis. Levels are
# sorted on their integer value (a plain string sort would put "1000" before
# "50"), and `assign` builds a new frame rather than writing into the filtered
# slice held in `df_plot`, which would raise SettingWithCopyWarning.
top_n_levels = [str(level) for level in sorted(df_plot["top_n"].astype(int).unique())]
df_plot = df_plot.assign(
    top_n=pd.Categorical(df_plot["top_n"].astype(str), categories=top_n_levels)
)

g = (
    p9.ggplot(df_plot, p9.aes("top_n", "pearson_median", color="model_AUC", group="model_AUC"))
    + p9.geom_line(linetype="dashed", alpha=0.8, position=dodge)
    + p9.geom_point(position=dodge)
    + p9.theme_bw()
    + p9.geom_errorbar(
        p9.aes(x="top_n", ymin="pearson_median-pearson_std", ymax="pearson_median+pearson_std"),
        alpha=0.5,
        size=0.3,
        width=1,
        position=dodge,
    )
    + p9.ylab("Pearson correlation")
    + p9.xlab("Top highly variable genes")
    + p9.ggtitle("Validation set")
    + p9.theme(figure_size=(14, 10), legend_position="none")
)
# To export: g.save(f"{out_folder}/evaluation/{model}/pearson_score_per_top_n_validation.png", dpi=300)
g

In [None]:
# Bootstrapped Pearson score of each selected model at the evaluation cutoff (validation set).
# `.copy()` gives an independent frame, so the column assignment below does not
# write into a slice (SettingWithCopyWarning).
df_plot = df_plot[df_plot.top_n.astype(int) == top_n_genes_to_evaluate].copy()

# Order the models by increasing median score. observed=True restricts the
# grouping to models still present in df_plot; without it the filtered-out
# categories come back as NaN groups.
model_order = (
    df_plot
    .groupby("model", observed=True)["pearson_median"]
    .median()
    .sort_values(ascending=True)
    .index.tolist()
)
df_plot["model"] = pd.Categorical(df_plot["model"], categories=model_order)

dodge = p9.position_dodge(width=position_dodge_width)
g = (
    p9.ggplot(df_plot, p9.aes("model", "pearson_median", color="model", group="model"))
    + p9.geom_point(position=dodge)
    + p9.theme_bw()
    + p9.geom_errorbar(
        p9.aes(x="model", ymin="pearson_median-pearson_std", ymax="pearson_median+pearson_std"),
        alpha=1,
        size=0.5,
        width=0.2,
        position=dodge,
    )
    + p9.ylab("Pearson correlation")
    + p9.xlab("model")
    + p9.ggtitle("Validation set")
    + p9.coord_flip()
    + p9.theme(legend_position="none", figure_size=(18, 6))
)
# To export: g.save(f"{out_folder}/evaluation/{model}/pearson_score_top_n_validation.png", dpi=300)
g

In [None]:
# Mean test score per (gene, model, top_n), averaged over the test samples.
tab_test = (
    score_top_n
    .groupby(["gene", "model", "top_n"])["test"]
    .mean()
    .reset_index()
)

In [None]:
auc_scores_test = compute_area_under_pearson_top_n(tab_test, "model", "test")
# Order the model levels by increasing mean AUC so plots list them in that order.
auc_scores_test["model"] = pd.Categorical(
    auc_scores_test["model"],
    categories=auc_scores_test.sort_values("auc_mean")["model"],
)
# Lookup {model: {"auc_mean": ..., "auc_std": ...}} used for the plot labels.
auc_scores_test_dict = (
    auc_scores_test
    .set_index("model")[["auc_mean", "auc_std"]]
    .to_dict(orient="index")
)
auc_scores_test

In [None]:
# Mean AUC per model with +/- one std error bars (test set).
position_dodge_width = 0.5
dodge = p9.position_dodge(width=position_dodge_width)
g = (
    p9.ggplot(auc_scores_test, p9.aes(x="model", y="auc_mean", color="model", group="model"))
    + p9.geom_point(position=dodge)
    + p9.theme_bw()
    + p9.geom_errorbar(
        p9.aes(x="model", ymin="auc_mean-auc_std", ymax="auc_mean+auc_std"),
        alpha=1,
        size=0.5,
        width=0.2,
        position=dodge,
    )
    + p9.ylab("AU under pearson's highly variable genes curve")
    + p9.xlab("Model")
    + p9.ggtitle("Test set")
    + p9.coord_flip()
    + p9.theme(legend_position="none", figure_size=(16, 8))
)
# To export: g.save(f"{out_folder}/evaluation/pearson_score_top_top_n.png", dpi=300)
g

In [None]:
# Bootstrap the per-gene test scores of every (model, top_n) group.
# The result gets its own name instead of overwriting `tab_test`, so this cell
# (and the AUC cell that reads `tab_test`) can be re-run in any order.
tab_test_bootstrap = (
    tab_test
    .groupby(["model", "top_n"])["test"]
    .apply(lambda x: bootstrapping(x))
    .reset_index()
)
# NOTE(review): assumes `bootstrapping` returns a (median, std) pair per group -
# confirm against src.utils.
df_plot = pd.DataFrame(tab_test_bootstrap["test"].to_list(), columns=['pearson_median', 'pearson_std'])
df_plot["model"] = tab_test_bootstrap["model"]
df_plot["top_n"] = tab_test_bootstrap["top_n"]

In [None]:
# Model levels ordered by increasing mean test AUC.
df_plot["model"] = pd.Categorical(
    df_plot["model"],
    categories=auc_scores_test.sort_values("auc_mean", ascending=True)["model"],
)
# Label of the form "<model>: <auc_mean>±<auc_std>" with two decimals.
df_plot["model_AUC"] = df_plot["model"].apply(
    lambda name: f"{name}: {auc_scores_test_dict[name]['auc_mean']:.2f}±{auc_scores_test_dict[name]['auc_std']:.2f}"
)

In [None]:
# Keep the six models with the highest median score at the evaluation cutoff.
rows_at_cutoff = df_plot.loc[df_plot["top_n"].astype(int).eq(top_n_genes_to_evaluate)]
top_model_to_plot = (
    rows_at_cutoff
    .groupby("model")["pearson_median"]
    .median()
    .sort_values()
    .tail(6)
    .index.values
)
df_plot = df_plot.query("model in @top_model_to_plot")
top_model_to_plot

In [None]:
# Bootstrapped Pearson score per top-n gene set for the selected models (test set).
position_dodge_width = 0.5
dodge = p9.position_dodge(width=position_dodge_width)

# Turn top_n into ordered string levels for the discrete x axis. Levels are
# sorted on their integer value (a plain string sort would put "1000" before
# "50"), and `assign` builds a new frame rather than writing into the filtered
# slice held in `df_plot`, which would raise SettingWithCopyWarning.
top_n_levels = [str(level) for level in sorted(df_plot["top_n"].astype(int).unique())]
df_plot = df_plot.assign(
    top_n=pd.Categorical(df_plot["top_n"].astype(str), categories=top_n_levels)
)

g = (
    p9.ggplot(df_plot, p9.aes("top_n", "pearson_median", color="model_AUC", group="model_AUC"))
    + p9.geom_line(linetype="dashed", alpha=0.8, position=dodge)
    + p9.geom_point(position=dodge)
    + p9.theme_bw()
    + p9.geom_errorbar(
        p9.aes(x="top_n", ymin="pearson_median-pearson_std", ymax="pearson_median+pearson_std"),
        alpha=0.5,
        size=0.3,
        width=1,
        position=dodge,
    )
    + p9.ylab("Pearson correlation")
    + p9.xlab("Top highly variable genes")
    + p9.ggtitle("Test set")
    + p9.theme(figure_size=(14, 10), legend_position="none")
)
# To export: g.save(f"{out_folder}/evaluation/{model}/pearson_score_per_top_n_test.png", dpi=300)
g

In [None]:
# Bootstrapped Pearson score of each selected model at the evaluation cutoff (test set).
# `.copy()` gives an independent frame, so the column assignment below does not
# write into a slice (SettingWithCopyWarning).
df_plot = df_plot[df_plot.top_n.astype(int) == top_n_genes_to_evaluate].copy()

# Order the models by increasing median score. observed=True restricts the
# grouping to models still present in df_plot; without it the filtered-out
# categories come back as NaN groups.
model_order = (
    df_plot
    .groupby("model", observed=True)["pearson_median"]
    .median()
    .sort_values(ascending=True)
    .index.tolist()
)
df_plot["model"] = pd.Categorical(df_plot["model"], categories=model_order)

dodge = p9.position_dodge(width=position_dodge_width)
g = (
    p9.ggplot(df_plot, p9.aes("model", "pearson_median", color="model", group="model"))
    + p9.geom_point(position=dodge)
    + p9.theme_bw()
    + p9.geom_errorbar(
        p9.aes(x="model", ymin="pearson_median-pearson_std", ymax="pearson_median+pearson_std"),
        alpha=1,
        size=0.5,
        width=0.2,
        position=dodge,
    )
    + p9.ylab("Pearson correlation")
    + p9.xlab("Model")
    + p9.ggtitle("Test set")
    + p9.coord_flip()
    + p9.theme(legend_position="none", figure_size=(14, 6))
)
# To export: g.save(f"{out_folder}/evaluation/{model}/pearson_score_top_n_test.png", dpi=300)
g

In [None]:
# Rows of the top-n score table that sit exactly at the evaluation cutoff.
top_n_scores_val_evaluate_decentile = score_top_n.loc[
    score_top_n["top_n"].astype(int).eq(top_n_genes_to_evaluate)
]
top_n_scores_val_evaluate_decentile

In [None]:
# Per test sample, keep the model with the highest mean validation score at the
# cutoff: sort ascending, then keep the last row of each test sample.
top_model_per_sample = (
    top_n_scores_val_evaluate_decentile
    .groupby(["test_sample", "model", "top_n"])["validation"]
    .mean()
    .reset_index()
    .sort_values("validation")
    .drop_duplicates(subset=["test_sample"], keep="last")
)
top_model_per_sample

In [None]:
# Validation scores kept per test sample this time, then the model with the
# highest mean AUC is picked within each test sample.
tab_val = (
    score_top_n
    .groupby(["gene", "model", "top_n", "test_sample"])["validation"]
    .mean()
    .reset_index()
)
top_model_per_sample = (
    tab_val
    .groupby("test_sample")
    .apply(
        lambda fold_scores: compute_area_under_pearson_top_n(fold_scores, "model", "validation")
        .sort_values("auc_mean", ascending=False)
        .head(1)
    )
    .reset_index()
)
top_model_per_sample

In [None]:
# Raw array view of the selection table, for a quick look at every column.
top_model_per_sample.to_numpy()

In [None]:
# Persist the per-test-sample selection. to_csv does not create missing
# folders, so make sure the model-level output folder exists first.
os.makedirs(f"{out_folder}/evaluation/{model}", exist_ok=True)
top_model_per_sample.to_csv(f"{out_folder}/evaluation/{model}/top_model_per_test_sample.csv", index=False)

In [None]:
# Count how often each model was selected across test samples, most frequent first.
best_model = (
    top_model_per_sample["model"]
    .value_counts()
    .sort_values(ascending=False)
    .reset_index()
)
best_model.columns = ["model_name", "num"]
best_model.to_csv(
    f"{out_folder}/evaluation/{model}/top_model_overall.csv",
    index=False,
)
best_model

In [None]:
# First row of the ranking = the model selected most often.
top_model_overall = best_model["model_name"].iloc[0]
top_model_overall

In [None]:
# Locate the parameter file of the overall best model and move it to the
# model-level folder. Candidates are filtered in Python rather than in the glob
# pattern (the original author reported problems doing it in the pattern);
# sorting makes the choice deterministic.
parameter_files = sorted(glob.glob(f"{out_folder}/evaluation/*/*/{model}/parameters/*.yaml"))

# Prefer an exact file-name match: a substring test alone would also accept
# e.g. "alpha_10.yaml" when the winner is "alpha_1". The historical substring
# match on the whole path is kept as a fallback in case the files carry an
# extra prefix or suffix.
matching_files = [
    path for path in parameter_files
    if os.path.splitext(os.path.basename(path))[0] == top_model_overall
] or [path for path in parameter_files if top_model_overall in path]

if not matching_files:
    raise FileNotFoundError(
        f"No parameter file matching '{top_model_overall}' under "
        f"{out_folder}/evaluation/*/*/{model}/parameters/"
    )
top_model_file = matching_files[0]

os.makedirs(f"{out_folder}/evaluation/{model}", exist_ok=True)
# NOTE(review): shutil.move removes the source file, so a re-run picks a
# different fold's copy (or none). Consider shutil.copy if the per-fold files
# must stay in place.
shutil.move(top_model_file, f"{out_folder}/evaluation/{model}/top_param_overall.yaml")