## Imports

In [1]:
import pandas as pd
import sys; sys.path.insert(0, '..'); sys.path.insert(0, '../utils')
from utils.find_max_similiarity import find_max_similiarity
from utils.tanimoto_baseline import tanimoto_bioactivity_predictor
from utils.baseline import similarity_bioactivity_predictor, similarity_softmax_bioactivity_predictor, aupr_scorer, max_similarity_scorer, auroc_scorer, mean_similarity_scorer, fingerprint_generator_rdkit, fingerprint_generator_ASP, fingerprint_generator_LSTAR, fingerprint_generator_RAD2D, upper_cuartile_scorer, upper_decile_scorer
from utils.plots import roc, scores_label_histogram, tanimoto_datasets_comparison, tanimoto_similarity_2D_plot
from rdkit import DataStructs
import plotly.express as px
import sys; sys.path.insert(0, '/home/sjinich/disco/jcompoundmapper_pywrapper/src')
from jCompoundMapper_pywrapper import JCompoundMapper
from sklearn.metrics import roc_auc_score


## Baseline

In [2]:
# Fingerprint generators included in the grid search. The keys are the labels
# that end up in the result column names ("<scorer>_<fingerprint>_<similarity>").
fingerprinters = dict(
    RDKIT=fingerprint_generator_rdkit,
    LSTAR=fingerprint_generator_LSTAR,
    ASP=fingerprint_generator_ASP,
    RAD2D=fingerprint_generator_RAD2D,
)

# Similarity functions (from rdkit's DataStructs) included in the grid search,
# keyed by the label used in the result column names.
similarity_functions = dict(
    BraunBlanquet=DataStructs.BraunBlanquetSimilarity,
    Tanimoto=DataStructs.TanimotoSimilarity,
    Cosine=DataStructs.CosineSimilarity,
)

In [3]:
def baseline(target_name, train, validation, fingerprinters, similarity_functions):
    """Grid-search similarity baselines for one target and score them with ROC AUC.

    For every (fingerprinter, similarity function) pair, six predictions are
    produced (four softmax predictors using the max / mean / upper-quartile /
    upper-decile scorers, and two plain predictors using the AUROC / AUPR
    scorers). Each prediction is then scored with ``roc_auc_score`` three
    times: unrestricted, with ``max_fpr=0.3`` and with ``max_fpr=0.15``.

    Parameters
    ----------
    target_name : str
        Label appended to the top-level result keys.
    train, validation :
        Frames supporting ``reset_index(drop=True)``; both are passed through
        each fingerprinter and then handed to the predictors.
    fingerprinters : dict
        Maps a label to a callable that takes a frame and returns a frame.
    similarity_functions : dict
        Maps a label to a similarity callable forwarded to the predictors.

    Returns
    -------
    dict
        ``{"AUROC_<target>": {...}, "AUROC30_<target>": {...},
        "AUROC15_<target>": {...}}`` where every inner dict maps
        ``"<scorer>_<fingerprint>_<similarity>"`` to the ``roc_auc_score``
        value computed from the prediction's ``"bioactivity"`` and
        ``"prediction_score"`` entries.
    """
    # (result-key prefix, extra keyword arguments for roc_auc_score)
    auroc_variants = (
        ("AUROC", {}),
        ("AUROC30", {"max_fpr": 0.3}),
        ("AUROC15", {"max_fpr": 0.15}),
    )
    # (column label, predictor, scorer); the order here fixes both the call
    # order and the key order inside every result dict.
    strategies = (
        ("max", similarity_softmax_bioactivity_predictor, max_similarity_scorer),
        ("mean", similarity_softmax_bioactivity_predictor, mean_similarity_scorer),
        ("cuartile", similarity_softmax_bioactivity_predictor, upper_cuartile_scorer),
        ("decile", similarity_softmax_bioactivity_predictor, upper_decile_scorer),
        ("auroc", similarity_bioactivity_predictor, auroc_scorer),
        ("aupr", similarity_bioactivity_predictor, aupr_scorer),
    )

    results = {f"{prefix}_{target_name}": dict() for prefix, _ in auroc_variants}

    train = train.reset_index(drop=True)
    validation = validation.reset_index(drop=True)

    for title_fps, fps in fingerprinters.items():

        # NOTE(review): train/validation are rebound to the fingerprinter's
        # output, so each later fingerprinter receives the frame returned by
        # the previous one - confirm the generators are safe to chain.
        train = fps(train)
        validation = fps(validation)

        for title_similarity, similarity in similarity_functions.items():

            # Run every predictor once; the predictions are reused for all
            # three AUROC variants below.
            predictions = {
                label: predictor(train, validation, scorer, fps, similarity)
                for label, predictor, scorer in strategies
            }

            for prefix, roc_kwargs in auroc_variants:
                results[f"{prefix}_{target_name}"].update({
                    f"{label}_{title_fps}_{title_similarity}": roc_auc_score(
                        prediction["bioactivity"],
                        prediction["prediction_score"],
                        **roc_kwargs,
                    )
                    for label, prediction in predictions.items()
                })

    return results

In [4]:

# Processed ChEMBL v34 CSVs to evaluate. Only the uncommented entries are run;
# the remaining targets are kept here so they can be re-enabled.
targets = [
    "/home/sjinich/disco/TrypanoDEEPscreen/data/processed/CHEMBL262_chemblv34.csv",
    # "/home/sjinich/disco/TrypanoDEEPscreen/data/processed/CHEMBL2581_chemblv34.csv",
    # "/home/sjinich/disco/TrypanoDEEPscreen/data/processed/CHEMBL2850_chemblv34.csv",
    # "/home/sjinich/disco/TrypanoDEEPscreen/data/processed/CHEMBL4072_chemblv34.csv",
    # "/home/sjinich/disco/TrypanoDEEPscreen/data/processed/CHEMBL4657_chemblv34.csv",
    # "/home/sjinich/disco/TrypanoDEEPscreen/data/processed/CHEMBL5567_chemblv34.csv",
]

# One frame per target; they are concatenated once after the loop instead of
# re-copying the growing result on every iteration.
per_target_frames = []

for target in targets:

    df_all = pd.read_csv(target)

    train = df_all[df_all.data_split == "train"]
    test = df_all[df_all.data_split == "test"]
    # Not used by this cell; kept because other cells may read it.
    validation = df_all[df_all.data_split == "validation"]

    target_name_file = target.split("/")[-1]
    # partition() returns the whole file name when the suffix is absent, whereas
    # slicing with find() == -1 would silently drop the last character.
    target_name = target_name_file.partition("_chemblv34.csv")[0]

    # NOTE(review): the *test* split is passed as baseline's `validation`
    # argument - confirm that scoring the grid search on test is intended.
    result = baseline(target_name, train, test, fingerprinters, similarity_functions)

    per_target_frames.append(pd.DataFrame(result))

# Columns are the "<metric>_<target>" keys returned by baseline(); an empty
# target list yields an empty frame, as the incremental version did.
all_targets_results = pd.concat(per_target_frames, axis=1) if per_target_frames else pd.DataFrame()
    


Fingerprinting algorithms for your use.
jCompoundMapper provides popular fingerprinting algorithms for chemical graphs such as
depth-first search fingerprints, shortest-path fingerprints, extended connectivity fingerprints,
autocorrelation fingerprints (e.g. CATS2D), radial fingerprints (e.g. Molprint2D), geometrical
Molprint, atom pairs, and pharmacophore fingerprints.

###################################

Should you publish results based on the jCompoundMapper fingerprints, please cite:

jCompoundMapper: An open source Java library and command-line tool for chemical fingerprints.
Hinselmann, G., Rosenbaum, L., Jahn, A., Fechner N., and Zeel A.
Journal of Cheminformatics 2003  3 (3).
DOI: 10.1186/1758-2946-3-3

###################################


Fingerprinting algorithms for your use.
jCompoundMapper provides popular fingerprinting algorithms for chemical graphs such as
depth-first search fingerprints, shortest-path fingerprints, extended connectivity fingerprints,
autocorrelation 

In [14]:
# Display the results table.
# NOTE(review): the saved output of this cell already contains the "AUROC"
# column that is only added by the reshaping cells further down, so it was
# produced out of order and does not reflect a top-to-bottom run.
all_targets_results

Unnamed: 0,max_RDKIT_BraunBlanquet,mean_RDKIT_BraunBlanquet,cuartile_RDKIT_BraunBlanquet,decile_RDKIT_BraunBlanquet,auroc_RDKIT_BraunBlanquet,aupr_RDKIT_BraunBlanquet,max_RDKIT_Tanimoto,mean_RDKIT_Tanimoto,cuartile_RDKIT_Tanimoto,decile_RDKIT_Tanimoto,...,decile_RAD2D_Tanimoto,auroc_RAD2D_Tanimoto,aupr_RAD2D_Tanimoto,max_RAD2D_Cosine,mean_RAD2D_Cosine,cuartile_RAD2D_Cosine,decile_RAD2D_Cosine,auroc_RAD2D_Cosine,aupr_RAD2D_Cosine,AUROC
CHEMBL262,0.809021,0.681636,0.707906,0.728973,0.56503,0.641453,0.812571,0.652425,0.680064,0.708324,...,0.781965,0.56664,0.650555,0.828551,0.661137,0.752881,0.774437,0.572381,0.670955,AUROC30


In [15]:
# Flip the table so each "<metric>_<target>" key becomes a row and each
# "<scorer>_<fingerprint>_<similarity>" combination becomes a column.
# Not idempotent: running this cell a second time flips the table back.
all_targets_results = all_targets_results.T

In [6]:
# Copy the row labels into a regular "AUROC" column so they can be parsed
# before the index itself is rewritten.
all_targets_results["AUROC"] = pd.Series(
    all_targets_results.index,
    index=all_targets_results.index,
)

In [7]:
# Keep only the text before the first underscore, i.e. the metric prefix
# ("AUROC", "AUROC30" or "AUROC15") of each "<metric>_<target>" label.
all_targets_results["AUROC"] = all_targets_results["AUROC"].map(
    lambda label: label.partition("_")[0]
)

In [8]:
# Replace each "<metric>_<target>" row label with just the target name.
# Splitting once (maxsplit=1) keeps target names that themselves contain an
# underscore intact; a plain split("_")[1] would truncate them.
# Not idempotent: a second run fails because the labels no longer contain "_".
all_targets_results.index = all_targets_results.index.map(
    lambda label: label.split("_", 1)[1]
)

In [9]:
# Persist the reshaped grid-search table; the path is relative, so the file is
# written to the current working directory.
all_targets_results.to_csv(path_or_buf="baseline_gridsearch_all_targets.csv")

In [10]:
# Final table: one row per (target, AUROC variant), indexed by target name,
# with the variant label stored in the "AUROC" column.
all_targets_results

Unnamed: 0,max_RDKIT_BraunBlanquet,mean_RDKIT_BraunBlanquet,cuartile_RDKIT_BraunBlanquet,decile_RDKIT_BraunBlanquet,auroc_RDKIT_BraunBlanquet,aupr_RDKIT_BraunBlanquet,max_RDKIT_Tanimoto,mean_RDKIT_Tanimoto,cuartile_RDKIT_Tanimoto,decile_RDKIT_Tanimoto,...,decile_RAD2D_Tanimoto,auroc_RAD2D_Tanimoto,aupr_RAD2D_Tanimoto,max_RAD2D_Cosine,mean_RAD2D_Cosine,cuartile_RAD2D_Cosine,decile_RAD2D_Cosine,auroc_RAD2D_Cosine,aupr_RAD2D_Cosine,AUROC
CHEMBL262,0.848272,0.714617,0.736166,0.751517,0.638891,0.700297,0.84564,0.703292,0.685525,0.711818,...,0.80543,0.590174,0.692125,0.865087,0.689168,0.782456,0.798142,0.599407,0.711955,AUROC
CHEMBL262,0.809021,0.681636,0.707906,0.728973,0.56503,0.641453,0.812571,0.652425,0.680064,0.708324,...,0.781965,0.56664,0.650555,0.828551,0.661137,0.752881,0.774437,0.572381,0.670955,AUROC30
CHEMBL262,0.782244,0.647839,0.68327,0.704489,0.528228,0.585945,0.784452,0.620385,0.667419,0.703446,...,0.749198,0.550505,0.60281,0.796794,0.644342,0.714795,0.740631,0.559101,0.624525,AUROC15


In [13]:
# Grouped bar chart for target CHEMBL5567: the x axis is the AUROC-variant
# label and every score column is drawn as its own bar. The y columns are taken
# from the frame itself (minus the "AUROC" label column) rather than from the
# CHEMBL262 slice, so the cell only depends on the target it plots.
# The target must be among the evaluated `targets`, otherwise .loc raises KeyError.
px.bar(
    all_targets_results.loc["CHEMBL5567"],
    x="AUROC",
    y=[column for column in all_targets_results.columns if column != "AUROC"],
    barmode="group",
)

In [14]:
# Grouped bar chart for target CHEMBL4657: the x axis is the AUROC-variant
# label and every score column is drawn as its own bar. The y columns are taken
# from the frame itself (minus the "AUROC" label column) rather than from the
# CHEMBL262 slice, so the cell only depends on the target it plots.
# The target must be among the evaluated `targets`, otherwise .loc raises KeyError.
px.bar(
    all_targets_results.loc["CHEMBL4657"],
    x="AUROC",
    y=[column for column in all_targets_results.columns if column != "AUROC"],
    barmode="group",
)

In [15]:
# Grouped bar chart for target CHEMBL2581: the x axis is the AUROC-variant
# label and every score column is drawn as its own bar. The y columns are taken
# from the frame itself (minus the "AUROC" label column) rather than from the
# CHEMBL262 slice, so the cell only depends on the target it plots.
# The target must be among the evaluated `targets`, otherwise .loc raises KeyError.
px.bar(
    all_targets_results.loc["CHEMBL2581"],
    x="AUROC",
    y=[column for column in all_targets_results.columns if column != "AUROC"],
    barmode="group",
)

In [16]:
# Grouped bar chart for target CHEMBL2850: the x axis is the AUROC-variant
# label and every score column is drawn as its own bar. The y columns are taken
# from the frame itself (minus the "AUROC" label column) rather than from the
# CHEMBL262 slice, so the cell only depends on the target it plots.
# The target must be among the evaluated `targets`, otherwise .loc raises KeyError.
px.bar(
    all_targets_results.loc["CHEMBL2850"],
    x="AUROC",
    y=[column for column in all_targets_results.columns if column != "AUROC"],
    barmode="group",
)