In [1]:
#This notebook builds a heirarchial model from baseline randomforest 
#Morgan fingerprint 
#and cell painting

In [2]:
from rdkit import Chem, DataStructs
from scipy.stats import randint
from rdkit.Chem import AllChem
from scipy import stats
from rdkit.Chem import rdMolDescriptors as rdmd
from rdkit.Chem.Scaffolds import MurckoScaffold
import pandas as pd
from tqdm import tqdm
import time
import numpy as np
from scipy.spatial.distance import cdist
from sklearn.cluster import MiniBatchKMeans
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import matthews_corrcoef,confusion_matrix, roc_auc_score, roc_curve
import seaborn as sns

In [3]:
list_of_lists_df = pd.read_csv("Predictions_train_heldout_scaled_prob_all_assays_ensemble.csv")
list_of_lists_df

Unnamed: 0,assay,InChICode_standardised,fp_proba,fp_pred,fp_threshold,CP_proba,CP_pred,CP_threshold,true,Data,MFP_Correct,CP_Correct,fp_proba_scaled,CP_proba_scaled,simple_pred,Ensemble_Correct
0,588458,"InChI=1S/C19H19N3O5S3/c23-16-6-5-13(30(25,26)2...",0.072320,0,0.227732,0.291514,1,0.291203,0.0,Training,True,False,0.158783,0.500219,0.0,True
1,588458,InChI=1S/C9H8ClN3S/c1-6-11-12-9(14)13(6)8-4-2-...,0.241077,1,0.227732,0.334301,1,0.291203,0.0,Training,False,False,0.508641,0.530402,1.0,False
2,588458,InChI=1S/C20H20N2O4/c1-2-25-20(24)17-12-18-16(...,0.119513,0,0.227732,0.186209,0,0.291203,0.0,Training,True,True,0.262399,0.319723,0.0,True
3,588458,InChI=1S/C26H24N2O5S/c1-33-19-11-9-18(10-12-19...,0.045993,0,0.227732,0.472894,1,0.291203,1.0,Training,False,True,0.100980,0.628169,0.0,False
4,588458,InChI=1S/C19H21N3O3S/c1-13-10-16-11-15(4-9-19(...,0.269438,1,0.227732,0.168755,0,0.291203,0.0,Training,False,True,0.527003,0.289754,0.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50191,1117305,"InChI=1S/C7H8ClN3O4S2/c8-4-1-5-7(2-6(4)16(9,12...",0.703995,1,0.457779,0.556806,1,0.264151,0.0,HeldOut,False,False,0.727044,0.698855,1.0,False
50192,1117305,InChI=1S/C14H9I3O4/c15-9-6-8(1-2-12(9)18)21-14...,0.196017,0,0.457779,0.210436,0,0.264151,0.0,HeldOut,True,True,0.214095,0.398324,0.0,True
50193,1117305,InChI=1S/C18H19Cl2NO4/c1-5-25-18(23)14-10(3)21...,0.202912,0,0.457779,0.487726,1,0.264151,1.0,HeldOut,False,True,0.221627,0.651916,0.0,False
50194,1117305,InChI=1S/C16H11BrN2O/c17-9-5-6-14-11(7-9)12-8-...,0.149597,0,0.457779,0.238125,0,0.264151,0.0,HeldOut,True,True,0.163395,0.450736,0.0,True


In [4]:
from sklearn.linear_model import LogisticRegression
from io import StringIO
from itertools import product
import sys
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold

In [5]:
def RF_Fn(assay):

    detail_list=pd.DataFrame()
    detail_list = list_of_lists_df[list_of_lists_df["assay"]==assay].reset_index(drop=True)
    
    
    proba_batch=[]
    pred_batch=[]
    true_batch=[]
    StdInChI_batch=[]
    
    train= detail_list[detail_list["Data"]=="Training"]

    test= detail_list[detail_list["Data"]=="HeldOut"]
    

    print("Herirachal model for training data")    

    print("RF Model")
            
    X_train = train[["fp_proba_scaled", "CP_proba_scaled"]].to_numpy()
    y_train = train["true"].to_numpy()
    X_test = test[["fp_proba_scaled", "CP_proba_scaled"]].to_numpy()
    y_test = test["true"].to_numpy()


    inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
         
    param_dist_grid = {
        'n_estimators':[100, 300, 400, 500],
        'random_state': [42],
        'n_jobs': [1],
        'class_weight' : [None, 'balanced']
        }
        
    rf = RandomForestClassifier(n_jobs=-1)
    rsh = HalvingRandomSearchCV(estimator=rf, param_distributions=param_dist_grid, factor=2, 
                                random_state=42, n_jobs=20, verbose=0,  cv = inner_cv)

    rsh.fit(X_train, y_train)

    print("Tuned Mode: ", rsh.best_params_)
    clf = rsh.best_estimator_
        
    #n_estimators=100
    #clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=10, random_state=42, class_weight="balanced")           
    clf.fit(X_train, y_train)
    
    #Threshold Balancing
    cross_val_prob = cross_val_predict(rsh.best_estimator_, X_train, y_train, cv=inner_cv, method='predict_proba')[:, 1]
    # calculate roc curves
    fpr, tpr, thresholds = roc_curve(y_train, cross_val_prob)
    # get the best threshold
    J = tpr - fpr
    ix = argmax(J)
    best_thresh = thresholds[ix]
    print('Best Threshold=%f' % (best_thresh))


    proba = clf.predict_proba(X_test)[:,1]
    pred  = [ 1 if x>best_thresh  else 0 for x in proba ] 

           
    StdInChI = test["InChICode_standardised"].to_numpy()
    #pred = clf.predict(X_test) 
    #proba = clf.predict_proba(X_test)[:,1]
    true = test["true"].to_numpy()
            

    StdInChI_batch = np.concatenate([StdInChI_batch, StdInChI])
    proba_batch = np.concatenate([proba_batch, proba])
    pred_batch = np.concatenate([pred_batch, pred])
    true_batch = np.concatenate([true_batch, true])

    print(len(pred_batch))
    print(len(true_batch))
   
    return StdInChI_batch, true_batch, pred_batch, proba_batch

In [6]:
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report
from sklearn.metrics import balanced_accuracy_score

def check_heirarchial(assay, method, y_true, y_pred, y_prob):
    
    conf_matrix = confusion_matrix(y_true, y_pred)
    #print(conf_matrix)
    
    #print(classification_report(y_true, y_pred))
    
    ba= balanced_accuracy_score(y_true, y_pred)
    #print('balanced_accuracy_score ',ba)
    
    mcc=matthews_corrcoef(y_true, y_pred)
    #print('matthews_corrcoef ',mcc)
    # Sensitivity, hit rate, recall, or true positive rate
    
    f1= f1_score(y_true, y_pred, average='binary')
    precision = precision_score(y_true, y_pred, average='binary')
    
    #print('F1 Toxic', f1)
    #print('precision', precision)
    
    Specificity = conf_matrix[0,0]/(conf_matrix[0,0]+conf_matrix[0,1])
    # Specificity or true negative rate
    Sensitivity = conf_matrix[1,1]/(conf_matrix[1,0]+conf_matrix[1,1]) 
    
    #print( 'Sensitivity', Sensitivity)
    #print( 'Specificity', Specificity)
    AUC = roc_auc_score(y_true, y_prob)
    #print('AUC-ROC ',AUC)    
    # calculate roc curves
    best_model_fpr, best_model_tpr, _ = roc_curve(y_true, y_prob)   
    
    
    row=[assay, method, f1, precision, Sensitivity, Specificity, ba, mcc, AUC]
    metrics_list.append(row)
    
    return metrics_list

In [7]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_predict
from collections import Counter
from numpy import argmax
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV

In [8]:
import warnings
warnings.filterwarnings("ignore") #ignore, default
import pandas

#sns.set(rc={'figure.figsize':(2, 2)})
#plt.rcParams['figure.dpi'] = 80
#sns.set_style("white")
metrics_list=[]
all_preds_test_compounds=pd.DataFrame(columns=['assay', 'InChICode_standardised', 'fp_proba', 'fp_pred', 'fp_threshold', 'CP_proba',
       'CP_pred', 'CP_threshold', 'true', 'Data', 'MFP_Correct',
       'CP_Correct', 'fp_proba_scaled', 'CP_proba_scaled', 'simple_pred',
       'Ensemble_Correct', 'heirarchial_pred', 'heirarchial_Correct'])


#for assay in tqdm(assaylist[:7]):
for assay in tqdm(list_of_lists_df.assay.unique()[:]):
    
    print("Assay: ", assay)
    detail_list=pd.DataFrame()
    detail_list = list_of_lists_df[list_of_lists_df["assay"]==assay].reset_index(drop=True)
    detail_list_test = detail_list[detail_list["Data"]=="HeldOut"]

    print("Heirarchial_Model")
    merger_df=pd.DataFrame()
    
    StdInChI_batch, true_batch, pred_batch, proba_batch = RF_Fn(assay)
    check_heirarchial(assay, "Heirarchial Model", true_batch, pred_batch, proba_batch)
    
    #print(true_batch)
    
    print(assay)
    
    merger_df["InChICode_standardised"]=StdInChI_batch
    merger_df["heirarchial_pred"]= pred_batch
    merger_df["true"]=true_batch
    merger_df["heirarchial_Correct"]= merger_df["heirarchial_pred"] == merger_df["true"]
    merger_df["assay"]=assay
    
    
    #For each assay combine original predictions and merger_df predictions
    #all_preds = pd.DataFrame()
    all_preds_test_compounds= pd.concat([all_preds_test_compounds, pd.merge(detail_list_test, merger_df)])  
    
    
    

  0%|                                                    | 0/88 [00:00<?, ?it/s]

Assay:  588458
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 400, 'class_weight': 'balanced'}


  1%|▌                                           | 1/88 [00:06<08:52,  6.12s/it]

Best Threshold=0.565000
90
90
588458
Assay:  588334
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': 'balanced'}


  2%|█                                           | 2/88 [00:12<08:44,  6.10s/it]

Best Threshold=0.244000
106
106
588334
Assay:  2642
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 400, 'class_weight': 'balanced'}


  3%|█▌                                          | 3/88 [00:17<08:10,  5.78s/it]

Best Threshold=0.297500
153
153
2642
Assay:  2156
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 300, 'class_weight': 'balanced'}


  5%|██                                          | 4/88 [00:21<07:16,  5.20s/it]

Best Threshold=0.013333
114
114
2156
Assay:  2330
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 400, 'class_weight': None}


  6%|██▌                                         | 5/88 [00:26<07:04,  5.11s/it]

Best Threshold=0.402500
66
66
2330
Assay:  2216
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': 'balanced'}


  7%|███                                         | 6/88 [00:33<07:29,  5.48s/it]

Best Threshold=0.344000
119
119
2216
Assay:  743015
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 100, 'class_weight': None}


  8%|███▌                                        | 7/88 [00:35<06:09,  4.57s/it]

Best Threshold=0.680000
71
71
743015
Assay:  504444
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': 'balanced'}


  9%|████                                        | 8/88 [00:42<07:02,  5.28s/it]

Best Threshold=0.232000
216
216
504444
Assay:  894
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 100, 'class_weight': 'balanced'}


 10%|████▌                                       | 9/88 [00:45<06:01,  4.58s/it]

Best Threshold=0.290000
232
232
894
Assay:  720635
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 100, 'class_weight': 'balanced'}


 11%|████▉                                      | 10/88 [00:47<05:01,  3.86s/it]

Best Threshold=0.190000
51
51
720635
Assay:  1688
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 100, 'class_weight': 'balanced'}


 12%|█████▍                                     | 11/88 [00:50<04:26,  3.46s/it]

Best Threshold=0.220000
122
122
1688
Assay:  2599
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': 'balanced'}


 14%|█████▊                                     | 12/88 [00:56<05:33,  4.39s/it]

Best Threshold=0.400000
180
180
2599
Assay:  602340
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 100, 'class_weight': None}


 15%|██████▎                                    | 13/88 [00:59<04:48,  3.85s/it]

Best Threshold=0.070000
60
60
602340
Assay:  2796
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': 'balanced'}


 16%|██████▊                                    | 14/88 [01:07<06:07,  4.97s/it]

Best Threshold=0.220000
270
270
2796
Assay:  504652
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 400, 'class_weight': None}


 17%|███████▎                                   | 15/88 [01:13<06:27,  5.31s/it]

Best Threshold=0.512500
236
236
504652
Assay:  651658
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 100, 'class_weight': 'balanced'}


 18%|███████▊                                   | 16/88 [01:15<05:21,  4.47s/it]

Best Threshold=0.190000
77
77
651658
Assay:  720582
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 100, 'class_weight': 'balanced'}


 19%|████████▎                                  | 17/88 [01:18<04:35,  3.88s/it]

Best Threshold=0.270000
93
93
720582
Assay:  624256
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': 'balanced'}


 20%|████████▊                                  | 18/88 [01:24<05:16,  4.52s/it]

Best Threshold=0.132000
100
100
624256
Assay:  1531
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 400, 'class_weight': None}


 22%|█████████▎                                 | 19/88 [01:29<05:21,  4.65s/it]

Best Threshold=0.327500
92
92
1531
Assay:  588852
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': 'balanced'}


 23%|█████████▊                                 | 20/88 [01:35<05:43,  5.05s/it]

Best Threshold=0.260000
99
99
588852
Assay:  485270
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': 'balanced'}


 24%|██████████▎                                | 21/88 [01:41<06:03,  5.43s/it]

Best Threshold=0.158000
146
146
485270
Assay:  743012
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': 'balanced'}


 25%|██████████▊                                | 22/88 [01:47<06:04,  5.52s/it]

Best Threshold=0.558000
73
73
743012
Assay:  777
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': 'balanced'}


 26%|███████████▏                               | 23/88 [01:58<07:49,  7.23s/it]

Best Threshold=0.252000
684
684
777
Assay:  504582
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': None}


 27%|███████████▋                               | 24/88 [02:04<07:16,  6.82s/it]

Best Threshold=0.468000
83
83
504582
Assay:  504660
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 400, 'class_weight': 'balanced'}


 28%|████████████▏                              | 25/88 [02:09<06:38,  6.32s/it]

Best Threshold=0.300000
100
100
504660
Assay:  2553
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': 'balanced'}


 30%|████████████▋                              | 26/88 [02:15<06:23,  6.18s/it]

Best Threshold=0.264000
84
84
2553
Assay:  743014
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': 'balanced'}


 31%|█████████████▏                             | 27/88 [02:20<06:07,  6.02s/it]

Best Threshold=0.542000
68
68
743014
Assay:  1822
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': 'balanced'}


 32%|█████████████▋                             | 28/88 [02:26<06:01,  6.02s/it]

Best Threshold=0.326000
108
108
1822
Assay:  938
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 100, 'class_weight': 'balanced'}


 33%|██████████████▏                            | 29/88 [02:29<04:57,  5.04s/it]

Best Threshold=0.090000
112
112
938
Assay:  1529
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 400, 'class_weight': None}


 34%|██████████████▋                            | 30/88 [02:34<04:55,  5.09s/it]

Best Threshold=0.020000
117
117
1529
Assay:  651610
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': 'balanced'}


 35%|███████████████▏                           | 31/88 [02:40<05:05,  5.36s/it]

Best Threshold=0.224000
120
120
651610
Assay:  624466
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': None}


 36%|███████████████▋                           | 32/88 [02:47<05:14,  5.61s/it]

Best Threshold=0.230000
136
136
624466
Assay:  932
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 100, 'class_weight': 'balanced'}


 38%|████████████████▏                          | 33/88 [02:50<04:27,  4.87s/it]

Best Threshold=0.190000
336
336
932
Assay:  720648
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': 'balanced'}


 39%|████████████████▌                          | 34/88 [02:56<04:39,  5.17s/it]

Best Threshold=0.322000
95
95
720648
Assay:  2540
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 400, 'class_weight': 'balanced'}


 40%|█████████████████                          | 35/88 [03:01<04:32,  5.14s/it]

Best Threshold=0.265000
100
100
2540
Assay:  2098
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': None}


 41%|█████████████████▌                         | 36/88 [03:07<04:41,  5.41s/it]

Best Threshold=0.450000
94
94
2098
Assay:  Novartis1
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': 'balanced'}


 42%|██████████████████                         | 37/88 [03:11<04:21,  5.12s/it]

Best Threshold=0.500000
25
25
Novartis1
Assay:  Novartis2
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': 'balanced'}


 43%|██████████████████▌                        | 38/88 [03:16<04:06,  4.93s/it]

Best Threshold=1.840000
26
26
Novartis2
Assay:  2685
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 100, 'class_weight': 'balanced'}


 44%|███████████████████                        | 39/88 [03:18<03:18,  4.06s/it]

Best Threshold=0.090000
31
31
2685
Assay:  485294
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 100, 'class_weight': None}


 45%|███████████████████▌                       | 40/88 [03:19<02:38,  3.30s/it]

Best Threshold=0.690000
23
23
485294
Assay:  2517
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': 'balanced'}


 47%|████████████████████                       | 41/88 [03:25<03:07,  3.98s/it]

Best Threshold=0.166000
55
55
2517
Assay:  504333
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': 'balanced'}


 48%|████████████████████▌                      | 42/88 [03:32<03:43,  4.85s/it]

Best Threshold=0.902000
186
186
504333
Assay:  881
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': 'balanced'}


 49%|█████████████████████                      | 43/88 [03:37<03:48,  5.07s/it]

Best Threshold=0.208000
54
54
881
Assay:  504339
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': 'balanced'}


 50%|█████████████████████▌                     | 44/88 [03:44<04:03,  5.53s/it]

Best Threshold=0.592000
161
161
504339
Assay:  504466
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 400, 'class_weight': None}


 51%|█████████████████████▉                     | 45/88 [03:49<03:50,  5.36s/it]

Best Threshold=0.975000
59
59
504466
Assay:  504332
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': None}


 52%|██████████████████████▍                    | 46/88 [03:57<04:20,  6.21s/it]

Best Threshold=0.814000
327
327
504332
Assay:  504327
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 100, 'class_weight': 'balanced'}


 53%|██████████████████████▉                    | 47/88 [03:59<03:24,  4.99s/it]

Best Threshold=0.650000
31
31
504327
Assay:  488953
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 300, 'class_weight': 'balanced'}


 55%|███████████████████████▍                   | 48/88 [04:02<02:59,  4.48s/it]

Best Threshold=0.113333
25
25
488953
Assay:  1851_2
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': 'balanced'}


 56%|███████████████████████▉                   | 49/88 [04:09<03:19,  5.11s/it]

Best Threshold=0.056000
165
165
1851_2
Assay:  1851_4
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 100, 'class_weight': 'balanced'}


 57%|████████████████████████▍                  | 50/88 [04:12<02:44,  4.32s/it]

Best Threshold=0.350000
153
153
1851_4
Assay:  1851_1
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': None}


 58%|████████████████████████▉                  | 51/88 [04:18<03:03,  4.96s/it]

Best Threshold=0.368000
166
166
1851_1
Assay:  1851_3
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 300, 'class_weight': 'balanced'}


 59%|█████████████████████████▍                 | 52/88 [04:23<02:54,  4.86s/it]

Best Threshold=0.433333
155
155
1851_3
Assay:  1851_5
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 400, 'class_weight': 'balanced'}


 60%|█████████████████████████▉                 | 53/88 [04:28<02:57,  5.07s/it]

Best Threshold=0.455000
158
158
1851_5
Assay:  449750
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': 'balanced'}


 61%|██████████████████████████▍                | 54/88 [04:33<02:47,  4.94s/it]

Best Threshold=0.574000
32
32
449750
Assay:  504847
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': 'balanced'}


 62%|██████████████████████████▉                | 55/88 [04:39<02:51,  5.20s/it]

Best Threshold=0.852000
64
64
504847
Assay:  504834
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': 'balanced'}


 64%|███████████████████████████▎               | 56/88 [04:50<03:44,  7.02s/it]

Best Threshold=0.478000
551
551
504834
Assay:  540317
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 400, 'class_weight': 'balanced'}


 65%|███████████████████████████▊               | 57/88 [04:55<03:17,  6.36s/it]

Best Threshold=1.815000
63
63
540317
Assay:  588453
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 100, 'class_weight': None}


 66%|████████████████████████████▎              | 58/88 [04:57<02:36,  5.23s/it]

Best Threshold=0.070000
142
142
588453
Assay:  588590
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 300, 'class_weight': 'balanced'}


 67%|████████████████████████████▊              | 59/88 [05:02<02:25,  5.02s/it]

Best Threshold=0.013333
136
136
588590
Assay:  588795
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 300, 'class_weight': None}


 68%|█████████████████████████████▎             | 60/88 [05:05<02:04,  4.45s/it]

Best Threshold=0.316667
23
23
588795
Assay:  504845
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': 'balanced'}


 69%|█████████████████████████████▊             | 61/88 [05:09<02:01,  4.49s/it]

Best Threshold=0.636000
26
26
504845
Assay:  588856
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': 'balanced'}


 70%|██████████████████████████████▎            | 62/88 [05:15<02:07,  4.91s/it]

Best Threshold=0.226000
85
85
588856
Assay:  504832
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 400, 'class_weight': None}


 72%|██████████████████████████████▊            | 63/88 [05:25<02:36,  6.26s/it]

Best Threshold=0.395000
527
527
504832
Assay:  588855
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 100, 'class_weight': None}


 73%|███████████████████████████████▎           | 64/88 [05:27<02:01,  5.08s/it]

Best Threshold=0.490000
76
76
588855
Assay:  121
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 300, 'class_weight': None}


 74%|███████████████████████████████▊           | 65/88 [05:30<01:42,  4.44s/it]

Best Threshold=0.290000
20
20
121
Assay:  624032
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 100, 'class_weight': None}


 75%|████████████████████████████████▎          | 66/88 [05:32<01:19,  3.63s/it]

Best Threshold=0.610000
22
22
624032
Assay:  119
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': 'balanced'}


 76%|████████████████████████████████▋          | 67/88 [05:36<01:22,  3.92s/it]

Best Threshold=0.290000
21
21
119
Assay:  624296
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': 'balanced'}


 77%|█████████████████████████████████▏         | 68/88 [05:43<01:32,  4.62s/it]

Best Threshold=0.714000
142
142
624296
Assay:  651965
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 400, 'class_weight': 'balanced'}


 78%|█████████████████████████████████▋         | 69/88 [05:48<01:30,  4.76s/it]

Best Threshold=0.907500
91
91
651965
Assay:  624170
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': 'balanced'}


 80%|██████████████████████████████████▏        | 70/88 [05:52<01:24,  4.72s/it]

Best Threshold=0.018000
31
31
624170
Assay:  651820
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': 'balanced'}


 81%|██████████████████████████████████▋        | 71/88 [05:58<01:24,  5.00s/it]

Best Threshold=0.498000
66
66
651820
Assay:  624297
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': 'balanced'}


 82%|███████████████████████████████████▏       | 72/88 [06:05<01:27,  5.47s/it]

Best Threshold=0.364000
155
155
624297
Assay:  651635
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': 'balanced'}


 83%|███████████████████████████████████▋       | 73/88 [06:10<01:23,  5.56s/it]

Best Threshold=0.212000
71
71
651635
Assay:  624417
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 300, 'class_weight': 'balanced'}


 84%|████████████████████████████████████▏      | 74/88 [06:15<01:13,  5.25s/it]

Best Threshold=0.503333
170
170
624417
Assay:  624202
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': 'balanced'}


 85%|████████████████████████████████████▋      | 75/88 [06:21<01:10,  5.44s/it]

Best Threshold=0.756000
76
76
624202
Assay:  624287
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 300, 'class_weight': None}


 86%|█████████████████████████████████████▏     | 76/88 [06:24<00:56,  4.71s/it]

Best Threshold=0.370000
21
21
624287
Assay:  624288
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 400, 'class_weight': None}


 88%|█████████████████████████████████████▋     | 77/88 [06:28<00:49,  4.50s/it]

Best Threshold=0.095000
31
31
624288
Assay:  651644
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': 'balanced'}


 89%|██████████████████████████████████████     | 78/88 [06:32<00:45,  4.51s/it]

Best Threshold=0.108000
31
31
651644
Assay:  652104
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 300, 'class_weight': None}


 90%|██████████████████████████████████████▌    | 79/88 [06:36<00:39,  4.40s/it]

Best Threshold=0.846667
79
79
652104
Assay:  720579
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 100, 'class_weight': None}


 91%|███████████████████████████████████████    | 80/88 [06:38<00:28,  3.59s/it]

Best Threshold=0.670000
22
22
720579
Assay:  720533
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': None}


 92%|███████████████████████████████████████▌   | 81/88 [06:43<00:27,  3.97s/it]

Best Threshold=0.860000
36
36
720533
Assay:  720542
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 100, 'class_weight': None}


 93%|████████████████████████████████████████   | 82/88 [06:45<00:19,  3.29s/it]

Best Threshold=0.120000
21
21
720542
Assay:  720580
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': 'balanced'}


 94%|████████████████████████████████████████▌  | 83/88 [06:49<00:18,  3.64s/it]

Best Threshold=1.980000
25
25
720580
Assay:  720504
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 400, 'class_weight': None}


 95%|█████████████████████████████████████████  | 84/88 [06:53<00:14,  3.70s/it]

Best Threshold=0.507500
26
26
720504
Assay:  720532
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 100, 'class_weight': 'balanced'}


 97%|█████████████████████████████████████████▌ | 85/88 [06:56<00:10,  3.39s/it]

Best Threshold=0.810000
61
61
720532
Assay:  1159524
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': 'balanced'}


 98%|██████████████████████████████████████████ | 86/88 [07:02<00:08,  4.40s/it]

Best Threshold=0.186000
176
176
1159524
Assay:  1117304
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 400, 'class_weight': 'balanced'}


 99%|██████████████████████████████████████████▌| 87/88 [07:07<00:04,  4.56s/it]

Best Threshold=0.210000
78
78
1117304
Assay:  1117305
Heirarchial_Model
Herirachal model for training data
RF Model
Tuned Mode:  {'random_state': 42, 'n_jobs': 1, 'n_estimators': 500, 'class_weight': 'balanced'}


100%|███████████████████████████████████████████| 88/88 [07:13<00:00,  4.93s/it]

Best Threshold=0.584000
72
72
1117305





In [9]:
all_preds_test_compounds

Unnamed: 0,assay,InChICode_standardised,fp_proba,fp_pred,fp_threshold,CP_proba,CP_pred,CP_threshold,true,Data,MFP_Correct,CP_Correct,fp_proba_scaled,CP_proba_scaled,simple_pred,Ensemble_Correct,heirarchial_pred,heirarchial_Correct
0,588458,InChI=1S/C14H15BrN2O3/c1-17(2)6-5-16-13(18)11-...,0.335765,1,0.310223,0.478519,1,0.304308,0.0,HeldOut,False,False,0.518514,0.625207,1.0,False,0.0,True
1,588458,InChI=1S/C23H22N2O2/c26-23(16-22-21-9-5-4-6-17...,0.210945,0,0.310223,0.168993,0,0.304308,0.0,HeldOut,True,True,0.339990,0.277667,0.0,True,0.0,True
2,588458,InChI=1S/C22H17F2NO3/c23-18-7-1-15(2-8-18)13-2...,0.155810,0,0.310223,0.248408,0,0.304308,0.0,HeldOut,True,True,0.251126,0.408151,0.0,True,0.0,True
3,588458,InChI=1S/C24H24N6O4/c25-22-20(24(31)26-7-8-29-...,0.163149,0,0.310223,0.208577,0,0.304308,0.0,HeldOut,True,True,0.262955,0.342707,0.0,True,0.0,True
4,588458,InChI=1S/C15H23BrN2O4S/c1-10-8-18(11(2)9-19)23...,0.285539,0,0.310223,0.554642,1,0.304308,0.0,HeldOut,True,False,0.460216,0.679917,1.0,False,0.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,1117305,"InChI=1S/C7H8ClN3O4S2/c8-4-1-5-7(2-6(4)16(9,12...",0.703995,1,0.457779,0.556806,1,0.264151,0.0,HeldOut,False,False,0.727044,0.698855,1.0,False,0.0,True
68,1117305,InChI=1S/C14H9I3O4/c15-9-6-8(1-2-12(9)18)21-14...,0.196017,0,0.457779,0.210436,0,0.264151,0.0,HeldOut,True,True,0.214095,0.398324,0.0,True,0.0,True
69,1117305,InChI=1S/C18H19Cl2NO4/c1-5-25-18(23)14-10(3)21...,0.202912,0,0.457779,0.487726,1,0.264151,1.0,HeldOut,False,True,0.221627,0.651916,0.0,False,0.0,False
70,1117305,InChI=1S/C16H11BrN2O/c17-9-5-6-14-11(7-9)12-8-...,0.149597,0,0.457779,0.238125,0,0.264151,0.0,HeldOut,True,True,0.163395,0.450736,0.0,True,0.0,True


In [10]:
all_preds_test_compounds.to_csv("Predictions_train_heldout_scaled_prob_all_assays_heirarchial_model_test_compounds.csv", index=False)

In [11]:
metrics_list = pd.DataFrame(metrics_list,columns=["assay","method","f1", "precision", "Sensitivity", "Specificity", "ba", "mcc", "AUC"])
metrics_list

Unnamed: 0,assay,method,f1,precision,Sensitivity,Specificity,ba,mcc,AUC
0,588458,Heirarchial Model,0.312500,0.500000,0.227273,0.926471,0.576872,0.210241,0.664104
1,588334,Heirarchial Model,0.320988,0.236364,0.500000,0.475000,0.487500,-0.021528,0.533413
2,2642,Heirarchial Model,0.280000,0.225806,0.368421,0.582609,0.475515,-0.043098,0.470252
3,2156,Heirarchial Model,0.408759,0.256881,1.000000,0.058140,0.529070,0.122209,0.596761
4,2330,Heirarchial Model,0.750000,0.750000,0.750000,0.920000,0.835000,0.670000,0.894375
...,...,...,...,...,...,...,...,...,...
83,720504,Heirarchial Model,0.863636,0.760000,1.000000,0.142857,0.571429,0.329502,0.293233
84,720532,Heirarchial Model,0.507937,0.800000,0.372093,0.777778,0.574935,0.145606,0.516150
85,1159524,Heirarchial Model,0.375000,0.270000,0.613636,0.446970,0.530303,0.052981,0.530045
86,1117304,Heirarchial Model,0.525000,0.381818,0.840000,0.358491,0.599245,0.203143,0.579245


In [12]:
metrics_list.to_csv("heirarchial_model_metrics.csv", index=False)