In [1]:
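#This notebook builds a hierarchical baseline model on pubchem_assays using
#the scaled prediction probabilities of the Morgan fingerprint
#and Cell Painting models.
#NOTE: the second-level classifier fitted below is a LogisticRegression, not a random forest.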

In [2]:
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from scipy import stats
from rdkit.Chem import rdMolDescriptors as rdmd
from rdkit.Chem.Scaffolds import MurckoScaffold
import pandas as pd
from tqdm import tqdm
import time
import numpy as np
from scipy.spatial.distance import cdist
from sklearn.cluster import MiniBatchKMeans
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import matthews_corrcoef,confusion_matrix, roc_auc_score, roc_curve
import seaborn as sns

In [3]:
# Load the per-compound prediction table: one row per (assay, compound) with the
# fingerprint / Cell Painting probabilities and a "Data" column marking the split.
list_of_lists_df = pd.read_csv(
    filepath_or_buffer="Predictions_train_heldout_scaled_prob_all_assays_ensemble.csv"
)
list_of_lists_df

Unnamed: 0,assay,StdInChI,fp_proba,fp_pred,fp_threshold,CP_proba,CP_pred,CP_threshold,true,ts,pc,Data,MFP_Correct,CP_Correct,fp_proba_scaled,CP_proba_scaled,simple_pred,Ensemble_Correct
0,588458,"InChI=1S/C19H19N3O5S3/c23-16-6-5-13(30(25,26)2...",0.215985,0,0.278423,0.291141,0,0.306456,0.0,0.418605,0.430046,Training,True,True,0.387873,0.475014,0.0,True
1,588458,InChI=1S/C9H8ClN3S/c1-6-11-12-9(14)13(6)8-4-2-...,0.240910,0,0.278423,0.253367,0,0.306456,0.0,0.409091,0.680534,Training,True,True,0.432633,0.413382,0.0,True
2,588458,InChI=1S/C20H20N2O4/c1-2-25-20(24)17-12-18-16(...,0.232871,0,0.278423,0.154150,0,0.306456,0.0,0.447368,0.701574,Training,True,True,0.418196,0.251504,0.0,True
3,588458,InChI=1S/C26H24N2O5S/c1-33-19-11-9-18(10-12-19...,0.142133,0,0.278423,0.359792,1,0.306456,1.0,0.341463,0.453313,Training,False,True,0.255247,0.538452,0.0,False
4,588458,InChI=1S/C19H21N3O3S/c1-13-10-16-11-15(4-9-19(...,0.253998,0,0.278423,0.202499,0,0.306456,0.0,0.388889,0.646760,Training,True,True,0.456137,0.330388,0.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50467,1117305,InChI=1S/C12H9F3N2O2/c1-7-10(6-16-19-7)11(18)1...,0.340631,0,0.357222,0.357490,1,0.330242,0.0,0.296296,0.479437,HeldOut,True,False,0.476778,0.520342,0.0,True
50468,1117305,InChI=1S/C12H10N2/c1-8-12-10(6-7-13-8)9-4-2-3-...,0.242578,0,0.357222,0.244216,0,0.330242,0.0,0.363636,0.619949,HeldOut,True,True,0.339534,0.369752,0.0,True
50469,1117305,InChI=1S/C8H4Cl2N2O2/c9-3-1-5-6(2-4(3)10)12-8(...,0.403774,1,0.357222,0.352388,1,0.330242,1.0,0.333333,0.541732,HeldOut,True,True,0.536212,0.516533,1.0,True
50470,1117305,InChI=1S/C15H12O5/c16-9-3-1-8(2-4-9)13-7-12(19...,0.478660,1,0.357222,0.243638,0,0.330242,0.0,0.444444,0.506977,HeldOut,False,True,0.594464,0.368877,0.0,True


In [5]:
from sklearn.linear_model import LogisticRegression
from io import StringIO
from itertools import product
import sys
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold

In [15]:
def RF_Fn(assay):
    """Fit the second-level classifier for one assay and score its held-out rows.

    Reads the module-level ``list_of_lists_df``, keeps the rows whose ``assay``
    column equals ``assay`` and splits them on the ``Data`` column into
    ``"Training"`` and ``"HeldOut"``. The training rows are balanced by randomly
    undersampling the larger class (``random_state=1``), then a
    ``LogisticRegression(C=5, random_state=42)`` is fitted on the two features
    ``fp_proba_scaled`` and ``CP_proba_scaled`` and applied to the held-out rows.

    Despite the function name, no random forest is trained here.

    Parameters
    ----------
    assay
        Assay identifier, compared for equality with ``list_of_lists_df["assay"]``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(StdInChI, true, pred, proba)`` for the held-out rows, in the row order
        of the held-out subset. ``proba`` is column 1 of ``predict_proba``.

    Raises
    ------
    ValueError
        If the training rows of the assay lack actives (``true == 1``) or
        inactives (``true == 0``); scikit-learn would otherwise fail later with
        a less specific ValueError.
    """
    detail_list = list_of_lists_df[list_of_lists_df["assay"]==assay].reset_index(drop=True)

    train = detail_list[detail_list["Data"]=="Training"]

    #Sample Train data
    train_inactive = train[train["true"] == 0]
    inactives_n = len(train_inactive)
    print("Inactive Compounds ", inactives_n)

    train_active = train[train["true"] == 1]
    actives_n = len(train_active)
    print("Active Compounds ", actives_n)

    if inactives_n == 0 or actives_n == 0:
        raise ValueError(
            f"Assay {assay!r}: training data needs at least one active and one "
            f"inactive compound (actives={actives_n}, inactives={inactives_n})"
        )

    # Undersample the majority class down to the minority class size.
    # pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0.
    if inactives_n > actives_n:
        train_inactive = train_inactive.sample(actives_n, random_state=1)
        print("Inactive Compounds (after Undersamplimg) ", len(train_inactive) )
        train = pd.concat([train_active, train_inactive])
        print("Total Compounds (after Undersamplimg) ", len(train) )
    elif actives_n > inactives_n:
        train_active = train_active.sample(inactives_n, random_state=1)
        print("Active Compounds (after Undersamplimg) ", len(train_active) )
        train = pd.concat([train_inactive, train_active])
        print("Total Compounds (after Undersamplimg) ", len(train) )

    #test
    test = detail_list[detail_list["Data"]=="HeldOut"]

    print("Herirachal model for training data")

    print("LR Model")

    feature_columns = ["fp_proba_scaled", "CP_proba_scaled"]
    X_train = train[feature_columns].to_numpy()
    y_train = train["true"].to_numpy()
    X_test = test[feature_columns].to_numpy()

    classifier = LogisticRegression(C=5, random_state=42)
    classifier.fit(X_train, y_train)

    # Explicit dtypes keep the returned arrays identical to what the previous
    # "concatenate onto an empty list" accumulation produced (object / float64).
    StdInChI_batch = np.asarray(test["StdInChI"].to_numpy(), dtype=object)
    pred_batch = np.asarray(classifier.predict(X_test), dtype=float)
    proba_batch = np.asarray(classifier.predict_proba(X_test)[:,1], dtype=float)
    true_batch = np.asarray(test["true"].to_numpy(), dtype=float)

    print(len(pred_batch))
    print(len(true_batch))

    return StdInChI_batch, true_batch, pred_batch, proba_batch

In [16]:
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report
from sklearn.metrics import balanced_accuracy_score

def check_heirarchial(assay, method, y_true, y_pred, y_prob):
    """Score one assay's binary predictions and record them in ``metrics_list``.

    Computes F1, precision, sensitivity, specificity, balanced accuracy, MCC and
    ROC AUC, then appends
    ``[assay, method, f1, precision, Sensitivity, Specificity, ba, mcc, AUC]``
    to the module-level ``metrics_list``, which must already exist as a list
    when this function is called.

    Parameters
    ----------
    assay
        Assay identifier stored in the first column of the metrics row.
    method
        Label of the model being evaluated, stored in the second column.
    y_true
        True binary labels.
    y_pred
        Predicted binary labels.
    y_prob
        Scores passed to ``roc_auc_score``.

    Returns
    -------
    list
        The module-level ``metrics_list`` after the new row has been appended.
    """
    conf_matrix = confusion_matrix(y_true, y_pred)

    ba = balanced_accuracy_score(y_true, y_pred)
    mcc = matthews_corrcoef(y_true, y_pred)

    f1 = f1_score(y_true, y_pred, average='binary')
    precision = precision_score(y_true, y_pred, average='binary')

    # The indexing below treats row/column 0 as the negative class and 1 as the
    # positive class, with rows = true labels and columns = predicted labels
    # (scikit-learn's convention for sorted 0/1 labels).
    # Specificity or true negative rate
    Specificity = conf_matrix[0,0]/(conf_matrix[0,0]+conf_matrix[0,1])
    # Sensitivity, hit rate, recall, or true positive rate
    Sensitivity = conf_matrix[1,1]/(conf_matrix[1,0]+conf_matrix[1,1])

    AUC = roc_auc_score(y_true, y_prob)

    row = [assay, method, f1, precision, Sensitivity, Specificity, ba, mcc, AUC]
    metrics_list.append(row)

    return metrics_list

In [25]:
import warnings
warnings.filterwarnings("ignore") #ignore, default
import pandas

# Rows appended by check_heirarchial(); reset here so re-running this cell starts clean.
metrics_list=[]

# Column layout of the combined held-out table (original columns + the two new ones).
ALL_PREDS_COLUMNS = ['assay', 'StdInChI', 'fp_proba', 'fp_pred', 'fp_threshold', 'CP_proba',
       'CP_pred', 'CP_threshold', 'true', 'ts', 'pc', 'Data', 'MFP_Correct',
       'CP_Correct', 'fp_proba_scaled', 'CP_proba_scaled', 'simple_pred',
       'Ensemble_Correct', 'heirarchial_pred', 'heirarchial_Correct']

# Per-assay frames are collected and concatenated once after the loop instead of
# re-copying a growing DataFrame on every iteration.
per_assay_frames = []

for assay in tqdm(list_of_lists_df.assay.unique()[:]):

    print("Assay: ", assay)
    detail_list = list_of_lists_df[list_of_lists_df["assay"]==assay].reset_index(drop=True)
    detail_list_test = detail_list[detail_list["Data"]=="HeldOut"]

    print("Heirarchial_Model")

    StdInChI_batch, true_batch, pred_batch, proba_batch = RF_Fn(assay)
    check_heirarchial(assay, "Heirarchial Model", true_batch, pred_batch, proba_batch)

    print(assay)

    # RF_Fn scores the same "HeldOut" subset in the same row order, so the
    # predictions are attached positionally. A key-less merge on
    # (assay, StdInChI, true) would multiply rows if an InChI occurs more than
    # once within an assay's held-out set.
    assert len(pred_batch) == len(detail_list_test), "prediction/held-out row count mismatch"
    assay_preds = detail_list_test.reset_index(drop=True)
    assay_preds["heirarchial_pred"] = pred_batch
    assay_preds["heirarchial_Correct"] = pred_batch == true_batch
    per_assay_frames.append(assay_preds)

if per_assay_frames:
    combined_preds = pd.concat(per_assay_frames)
    # Keep the documented columns first, followed by any extra input columns.
    extra_columns = [col for col in combined_preds.columns if col not in ALL_PREDS_COLUMNS]
    all_preds_test_compounds = combined_preds.reindex(columns=ALL_PREDS_COLUMNS + extra_columns)
else:
    all_preds_test_compounds = pd.DataFrame(columns=ALL_PREDS_COLUMNS)
    
    
    

  4%|█▉                                          | 4/92 [00:00<00:02, 38.06it/s]

Assay:  588458
Heirarchial_Model
Inactive Compounds  268
Active Compounds  90
Inactive Compounds (after Undersamplimg)  90
Total Compounds (after Undersamplimg)  180
Herirachal model for training data
LR Model
90
90
588458
Assay:  588334
Heirarchial_Model
Inactive Compounds  316
Active Compounds  106
Inactive Compounds (after Undersamplimg)  106
Total Compounds (after Undersamplimg)  212
Herirachal model for training data
LR Model
106
106
588334
Assay:  2642
Heirarchial_Model
Inactive Compounds  458
Active Compounds  153
Inactive Compounds (after Undersamplimg)  153
Total Compounds (after Undersamplimg)  306
Herirachal model for training data
LR Model
153
153
2642
Assay:  2156
Heirarchial_Model
Inactive Compounds  340
Active Compounds  114
Inactive Compounds (after Undersamplimg)  114
Total Compounds (after Undersamplimg)  228
Herirachal model for training data
LR Model
114
114
2156
Assay:  2330
Heirarchial_Model
Inactive Compounds  196
Active Compounds  66
Inactive Compounds (after Un

 17%|███████▍                                   | 16/92 [00:00<00:01, 38.07it/s]

Assay:  894
Heirarchial_Model
Inactive Compounds  693
Active Compounds  231
Inactive Compounds (after Undersamplimg)  231
Total Compounds (after Undersamplimg)  462
Herirachal model for training data
LR Model
232
232
894
Assay:  720635
Heirarchial_Model
Inactive Compounds  140
Active Compounds  64
Inactive Compounds (after Undersamplimg)  64
Total Compounds (after Undersamplimg)  128
Herirachal model for training data
LR Model
51
51
720635
Assay:  1688
Heirarchial_Model
Inactive Compounds  364
Active Compounds  122
Inactive Compounds (after Undersamplimg)  122
Total Compounds (after Undersamplimg)  244
Herirachal model for training data
LR Model
122
122
1688
Assay:  2599
Heirarchial_Model
Inactive Compounds  540
Active Compounds  180
Inactive Compounds (after Undersamplimg)  180
Total Compounds (after Undersamplimg)  360
Herirachal model for training data
LR Model
180
180
2599
Assay:  602340
Heirarchial_Model
Inactive Compounds  180
Active Compounds  60
Inactive Compounds (after Unders

 22%|█████████▎                                 | 20/92 [00:00<00:01, 38.59it/s]

Assay:  720582
Heirarchial_Model
Inactive Compounds  278
Active Compounds  93
Inactive Compounds (after Undersamplimg)  93
Total Compounds (after Undersamplimg)  186
Herirachal model for training data
LR Model
93
93
720582
Assay:  624256
Heirarchial_Model
Inactive Compounds  300
Active Compounds  100
Inactive Compounds (after Undersamplimg)  100
Total Compounds (after Undersamplimg)  200
Herirachal model for training data
LR Model
100
100
624256
Assay:  1531
Heirarchial_Model
Inactive Compounds  273
Active Compounds  91
Inactive Compounds (after Undersamplimg)  91
Total Compounds (after Undersamplimg)  182
Herirachal model for training data
LR Model
92
92
1531
Assay:  588852
Heirarchial_Model
Inactive Compounds  295
Active Compounds  98
Inactive Compounds (after Undersamplimg)  98
Total Compounds (after Undersamplimg)  196
Herirachal model for training data
LR Model
99
99
588852
Assay:  485270
Heirarchial_Model
Inactive Compounds  436
Active Compounds  146
Inactive Compounds (after Und

 35%|██████████████▉                            | 32/92 [00:00<00:01, 38.78it/s]

Assay:  504660
Heirarchial_Model
Inactive Compounds  297
Active Compounds  99
Inactive Compounds (after Undersamplimg)  99
Total Compounds (after Undersamplimg)  198
Herirachal model for training data
LR Model
100
100
504660
Assay:  2553
Heirarchial_Model
Inactive Compounds  252
Active Compounds  84
Inactive Compounds (after Undersamplimg)  84
Total Compounds (after Undersamplimg)  168
Herirachal model for training data
LR Model
84
84
2553
Assay:  743014
Heirarchial_Model
Inactive Compounds  175
Active Compounds  96
Inactive Compounds (after Undersamplimg)  96
Total Compounds (after Undersamplimg)  192
Herirachal model for training data
LR Model
68
68
743014
Assay:  1822
Heirarchial_Model
Inactive Compounds  324
Active Compounds  108
Inactive Compounds (after Undersamplimg)  108
Total Compounds (after Undersamplimg)  216
Herirachal model for training data
LR Model
108
108
1822
Assay:  938
Heirarchial_Model
Inactive Compounds  336
Active Compounds  112
Inactive Compounds (after Undersam

 39%|████████████████▊                          | 36/92 [00:00<00:01, 38.78it/s]

Herirachal model for training data
LR Model
336
336
932
Assay:  720648
Heirarchial_Model
Inactive Compounds  283
Active Compounds  94
Inactive Compounds (after Undersamplimg)  94
Total Compounds (after Undersamplimg)  188
Herirachal model for training data
LR Model
95
95
720648
Assay:  2540
Heirarchial_Model
Inactive Compounds  300
Active Compounds  100
Inactive Compounds (after Undersamplimg)  100
Total Compounds (after Undersamplimg)  200
Herirachal model for training data
LR Model
100
100
2540
Assay:  2098
Heirarchial_Model
Inactive Compounds  280
Active Compounds  94
Inactive Compounds (after Undersamplimg)  94
Total Compounds (after Undersamplimg)  188
Herirachal model for training data
LR Model
94
94
2098
Assay:  Novartis1
Heirarchial_Model
Inactive Compounds  66
Active Compounds  30
Inactive Compounds (after Undersamplimg)  30
Total Compounds (after Undersamplimg)  60
Herirachal model for training data
LR Model
25
25
Novartis1
Assay:  Novartis2
Heirarchial_Model
Inactive Compoun

 49%|█████████████████████                      | 45/92 [00:01<00:01, 39.10it/s]

55
55
2517
Assay:  504333
Heirarchial_Model
Inactive Compounds  264
Active Compounds  478
Active Compounds (after Undersamplimg)  264
Total Compounds (after Undersamplimg)  528
Herirachal model for training data
LR Model
186
186
504333
Assay:  881
Heirarchial_Model
Inactive Compounds  160
Active Compounds  54
Inactive Compounds (after Undersamplimg)  54
Total Compounds (after Undersamplimg)  108
Herirachal model for training data
LR Model
54
54
881
Assay:  504339
Heirarchial_Model
Inactive Compounds  178
Active Compounds  466
Active Compounds (after Undersamplimg)  178
Total Compounds (after Undersamplimg)  356
Herirachal model for training data
LR Model
161
161
504339
Assay:  504466
Heirarchial_Model
Inactive Compounds  58
Active Compounds  175
Active Compounds (after Undersamplimg)  58
Total Compounds (after Undersamplimg)  116
Herirachal model for training data
LR Model
59
59
504466
Assay:  504332
Heirarchial_Model
Inactive Compounds  326
Active Compounds  979
Active Compounds (afte

 58%|████████████████████████▊                  | 53/92 [00:01<00:01, 38.67it/s]

1851_2
Assay:  1851_4
Heirarchial_Model
Inactive Compounds  270
Active Compounds  339
Active Compounds (after Undersamplimg)  270
Total Compounds (after Undersamplimg)  540
Herirachal model for training data
LR Model
153
153
1851_4
Assay:  1851_1
Heirarchial_Model
Inactive Compounds  396
Active Compounds  268
Inactive Compounds (after Undersamplimg)  268
Total Compounds (after Undersamplimg)  536
Herirachal model for training data
LR Model
166
166
1851_1
Assay:  1851_3
Heirarchial_Model
Inactive Compounds  397
Active Compounds  220
Inactive Compounds (after Undersamplimg)  220
Total Compounds (after Undersamplimg)  440
Herirachal model for training data
LR Model
155
155
1851_3
Assay:  1851_5
Heirarchial_Model
Inactive Compounds  428
Active Compounds  204
Inactive Compounds (after Undersamplimg)  204
Total Compounds (after Undersamplimg)  408
Herirachal model for training data
LR Model
158
158
1851_5
Assay:  449750
Heirarchial_Model
Inactive Compounds  47
Active Compounds  79
Active Com

 66%|████████████████████████████▌              | 61/92 [00:01<00:00, 38.34it/s]

551
551
504834
Assay:  540317
Heirarchial_Model
Inactive Compounds  187
Active Compounds  62
Inactive Compounds (after Undersamplimg)  62
Total Compounds (after Undersamplimg)  124
Herirachal model for training data
LR Model
63
63
540317
Assay:  588453
Heirarchial_Model
Inactive Compounds  416
Active Compounds  148
Inactive Compounds (after Undersamplimg)  148
Total Compounds (after Undersamplimg)  296
Herirachal model for training data
LR Model
142
142
588453
Assay:  588590
Heirarchial_Model
Inactive Compounds  408
Active Compounds  136
Inactive Compounds (after Undersamplimg)  136
Total Compounds (after Undersamplimg)  272
Herirachal model for training data
LR Model
136
136
588590
Assay:  588795
Heirarchial_Model
Inactive Compounds  44
Active Compounds  46
Active Compounds (after Undersamplimg)  44
Total Compounds (after Undersamplimg)  88
Herirachal model for training data
LR Model
23
23
588795
Assay:  504845
Heirarchial_Model
Inactive Compounds  65
Active Compounds  35
Inactive Com

 75%|████████████████████████████████▎          | 69/92 [00:01<00:00, 38.43it/s]

Inactive Compounds  1440
Active Compounds  666
Inactive Compounds (after Undersamplimg)  666
Total Compounds (after Undersamplimg)  1332
Herirachal model for training data
LR Model
527
527
504832
Assay:  588855
Heirarchial_Model
Inactive Compounds  170
Active Compounds  134
Inactive Compounds (after Undersamplimg)  134
Total Compounds (after Undersamplimg)  268
Herirachal model for training data
LR Model
76
76
588855
Assay:  121
Heirarchial_Model
Inactive Compounds  60
Active Compounds  20
Inactive Compounds (after Undersamplimg)  20
Total Compounds (after Undersamplimg)  40
Herirachal model for training data
LR Model
20
20
121
Assay:  624032
Heirarchial_Model
Inactive Compounds  44
Active Compounds  41
Inactive Compounds (after Undersamplimg)  41
Total Compounds (after Undersamplimg)  82
Herirachal model for training data
LR Model
22
22
624032
Assay:  119
Heirarchial_Model
Inactive Compounds  62
Active Compounds  21
Inactive Compounds (after Undersamplimg)  21
Total Compounds (after U

 84%|███████████████████████████████████▉       | 77/92 [00:02<00:00, 38.36it/s]

Heirarchial_Model
Inactive Compounds  65
Active Compounds  197
Active Compounds (after Undersamplimg)  65
Total Compounds (after Undersamplimg)  130
Herirachal model for training data
LR Model
66
66
651820
Assay:  624297
Heirarchial_Model
Inactive Compounds  386
Active Compounds  230
Inactive Compounds (after Undersamplimg)  230
Total Compounds (after Undersamplimg)  460
Herirachal model for training data
LR Model
155
155
624297
Assay:  651635
Heirarchial_Model
Inactive Compounds  164
Active Compounds  119
Inactive Compounds (after Undersamplimg)  119
Total Compounds (after Undersamplimg)  238
Herirachal model for training data
LR Model
71
71
651635
Assay:  624417
Heirarchial_Model
Inactive Compounds  495
Active Compounds  181
Inactive Compounds (after Undersamplimg)  181
Total Compounds (after Undersamplimg)  362
Herirachal model for training data
LR Model
170
170
624417
Assay:  624202
Heirarchial_Model
Inactive Compounds  101
Active Compounds  202
Active Compounds (after Undersamplim

 92%|███████████████████████████████████████▋   | 85/92 [00:02<00:00, 38.73it/s]

Assay:  686978
Heirarchial_Model
Inactive Compounds  9
Active Compounds  26
Active Compounds (after Undersamplimg)  9
Total Compounds (after Undersamplimg)  18
Herirachal model for training data
LR Model
9
9
686978
Assay:  686979
Heirarchial_Model
Inactive Compounds  14
Active Compounds  43
Active Compounds (after Undersamplimg)  14
Total Compounds (after Undersamplimg)  28
Herirachal model for training data
LR Model
15
15
686979
Assay:  652104
Heirarchial_Model
Inactive Compounds  124
Active Compounds  191
Active Compounds (after Undersamplimg)  124
Total Compounds (after Undersamplimg)  248
Herirachal model for training data
LR Model
79
79
652104
Assay:  720579
Heirarchial_Model
Inactive Compounds  22
Active Compounds  62
Active Compounds (after Undersamplimg)  22
Total Compounds (after Undersamplimg)  44
Herirachal model for training data
LR Model
22
22
720579
Assay:  720533
Heirarchial_Model
Inactive Compounds  39
Active Compounds  105
Active Compounds (after Undersamplimg)  39
Tot

100%|███████████████████████████████████████████| 92/92 [00:02<00:00, 38.54it/s]

Assay:  720532
Heirarchial_Model
Inactive Compounds  73
Active Compounds  167
Active Compounds (after Undersamplimg)  73
Total Compounds (after Undersamplimg)  146
Herirachal model for training data
LR Model
61
61
720532
Assay:  1159524
Heirarchial_Model
Inactive Compounds  528
Active Compounds  176
Inactive Compounds (after Undersamplimg)  176
Total Compounds (after Undersamplimg)  352
Herirachal model for training data
LR Model
176
176
1159524
Assay:  1117304
Heirarchial_Model
Inactive Compounds  207
Active Compounds  101
Inactive Compounds (after Undersamplimg)  101
Total Compounds (after Undersamplimg)  202
Herirachal model for training data
LR Model
78
78
1117304
Assay:  1117305
Heirarchial_Model
Inactive Compounds  197
Active Compounds  90
Inactive Compounds (after Undersamplimg)  90
Total Compounds (after Undersamplimg)  180
Herirachal model for training data
LR Model
72
72
1117305





In [26]:
# Display the combined held-out prediction table, including the
# heirarchial_pred / heirarchial_Correct columns.
all_preds_test_compounds

Unnamed: 0,assay,StdInChI,fp_proba,fp_pred,fp_threshold,CP_proba,CP_pred,CP_threshold,true,ts,pc,Data,MFP_Correct,CP_Correct,fp_proba_scaled,CP_proba_scaled,simple_pred,Ensemble_Correct,heirarchial_pred,heirarchial_Correct
0,588458,InChI=1S/C14H15BrN2O3/c1-17(2)6-5-16-13(18)11-...,0.314086,1,0.288942,0.456960,1,0.318929,0.0,0.379310,0.302177,HeldOut,False,False,0.517681,0.601334,1.0,False,1.0,False
1,588458,InChI=1S/C23H22N2O2/c26-23(16-22-21-9-5-4-6-17...,0.236041,0,0.288942,0.204912,0,0.318929,0.0,0.481481,0.658485,HeldOut,True,True,0.408457,0.321251,0.0,True,0.0,True
2,588458,InChI=1S/C22H17F2NO3/c23-18-7-1-15(2-8-18)13-2...,0.196085,0,0.288942,0.327494,1,0.318929,0.0,0.384615,0.741548,HeldOut,True,False,0.339316,0.506288,0.0,True,0.0,True
3,588458,InChI=1S/C24H24N6O4/c25-22-20(24(31)26-7-8-29-...,0.168825,0,0.288942,0.202695,0,0.318929,0.0,0.416667,0.623549,HeldOut,True,True,0.292143,0.317774,0.0,True,0.0,True
4,588458,InChI=1S/C15H23BrN2O4S/c1-10-8-18(11(2)9-19)23...,0.251624,0,0.288942,0.476059,1,0.318929,0.0,0.454545,0.766103,HeldOut,True,False,0.435424,0.615355,1.0,False,1.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,1117305,InChI=1S/C12H9F3N2O2/c1-7-10(6-16-19-7)11(18)1...,0.340631,0,0.357222,0.357490,1,0.330242,0.0,0.296296,0.479437,HeldOut,True,False,0.476778,0.520342,0.0,True,0.0,True
68,1117305,InChI=1S/C12H10N2/c1-8-12-10(6-7-13-8)9-4-2-3-...,0.242578,0,0.357222,0.244216,0,0.330242,0.0,0.363636,0.619949,HeldOut,True,True,0.339534,0.369752,0.0,True,1.0,False
69,1117305,InChI=1S/C8H4Cl2N2O2/c9-3-1-5-6(2-4(3)10)12-8(...,0.403774,1,0.357222,0.352388,1,0.330242,1.0,0.333333,0.541732,HeldOut,True,True,0.536212,0.516533,1.0,True,0.0,False
70,1117305,InChI=1S/C15H12O5/c16-9-3-1-8(2-4-9)13-7-12(19...,0.478660,1,0.357222,0.243638,0,0.330242,0.0,0.444444,0.506977,HeldOut,False,True,0.594464,0.368877,0.0,True,0.0,True


In [27]:
# Write the held-out predictions (with the second-level model's columns) to disk,
# without the DataFrame index.
all_preds_test_compounds.to_csv(
    path_or_buf="Predictions_train_heldout_scaled_prob_all_assays_heirarchial_model_test_compounds.csv",
    index=False,
)

In [28]:
# Turn the per-assay metric rows collected by check_heirarchial() into a table.
# Column order must match the order of the values in each appended row.
metrics_list = pd.DataFrame(
    data=metrics_list,
    columns=[
        "assay",
        "method",
        "f1",
        "precision",
        "Sensitivity",
        "Specificity",
        "ba",
        "mcc",
        "AUC",
    ],
)
metrics_list

Unnamed: 0,assay,method,f1,precision,Sensitivity,Specificity,ba,mcc,AUC
0,588458,Heirarchial Model,0.468750,0.357143,0.681818,0.602941,0.642380,0.245301,0.694519
1,588334,Heirarchial Model,0.527778,0.413043,0.730769,0.662500,0.696635,0.341403,0.746635
2,2642,Heirarchial Model,0.429752,0.313253,0.684211,0.504348,0.594279,0.163530,0.657895
3,2156,Heirarchial Model,0.454545,0.394737,0.535714,0.732558,0.634136,0.244966,0.649917
4,2330,Heirarchial Model,0.666667,0.714286,0.625000,0.920000,0.772500,0.571315,0.825000
...,...,...,...,...,...,...,...,...,...
87,720504,Heirarchial Model,0.844444,0.730769,1.000000,0.000000,0.500000,0.000000,0.398496
88,720532,Heirarchial Model,0.514286,0.666667,0.418605,0.500000,0.459302,-0.074739,0.452196
89,1159524,Heirarchial Model,0.291262,0.254237,0.340909,0.666667,0.503788,0.006949,0.528065
90,1117304,Heirarchial Model,0.454545,0.365854,0.600000,0.509434,0.554717,0.102274,0.535094


In [30]:
# Write the per-assay metrics table to disk, without the DataFrame index.
metrics_list.to_csv(
    path_or_buf="heirarchial_model_metrics.csv",
    index=False,
)