In [7]:
#!/usr/bin/env python
# coding: utf-8


import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from mordred import Calculator, descriptors
calc = Calculator(descriptors, ignore_3D=True)

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error, roc_auc_score, matthews_corrcoef, average_precision_score, confusion_matrix
from imblearn.metrics import geometric_mean_score
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
from sklearn.metrics import roc_curve
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV
from scipy.stats import randint, uniform
from sklearn.model_selection import StratifiedKFold
import math

from pandarallel import pandarallel
pandarallel.initialize()

import sys
sys.path.append('/home/ss2686/03_DICTrank')
import argparse
from scripts.evaluation_functions import evaluate_classifier, optimize_threshold_j_statistic


# Path where your data is stored
data_path = '../data/processed_binarised__splits/'

results = {}
held_out_results = []

def get_Mordred_columns_to_use():
    
        datasets = {}
        directory='../data/processed/'
        # Load datasets from given directory
        for foldername in os.listdir(directory):

            if not foldername.startswith('.'):  # Ignore folders starting with a dot

                print(foldername)
                file_path = os.path.join(directory, foldername, f"{foldername}_processed.csv.gz")

                if os.path.exists(file_path):
                    datasets[foldername] = pd.read_csv(file_path, compression='gzip')
                else:
                    print(f"No matching file found for folder: {foldername}")
        
        smiles_list = []
        
        for featuresets in ["sider", "DICTrank"]: 
    
            smiles_list.extend(datasets[featuresets].Standardized_SMILES.to_list())
            print(len(smiles_list))
        smiles_list = list(set(smiles_list))
        
        print(len(smiles_list))
        data = pd.DataFrame(smiles_list, columns=["Standardized_SMILES"])
        
        Ser_Mol_train = data['Standardized_SMILES'].apply(Chem.MolFromSmiles)
        Mordred_table_data = calc.pandas(Ser_Mol_train)
        Mordred_table_data = Mordred_table_data.astype('float')
        Mordred_table_data = Mordred_table_data.dropna(axis='columns')
        data_columns = Mordred_table_data.columns
        
        return(data_columns)
    
data_columns = get_Mordred_columns_to_use()




INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
GeneExpressionSMILES
sider
MOA
CellPaintingSMILES
Cmax
DrugBank
DICTrank
1378
2613
1579


  1%|▏                                         | 8/1579 [00:01<03:31,  7.42it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  2%|▊                                        | 32/1579 [00:04<06:25,  4.01it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  3%|█                                        | 42/1579 [00:05<03:20,  7.68it/s]Process ForkPoolWorker-319:
Process ForkPoolWorker-310:
Process ForkPoolWorker-324:
Process ForkPoolWorker-305:
Process ForkPoolWorker-298:
Process ForkPoolWorker-314:
Process ForkPoolWorker-323:



KeyboardInterrupt: 

In [8]:
data_columns

Index(['ABC', 'ABCGG', 'nAcid', 'nBase', 'SpAbs_A', 'SpMax_A', 'SpDiam_A',
       'SpAD_A', 'SpMAD_A', 'LogEE_A',
       ...
       'SRW09', 'SRW10', 'TSRW10', 'MW', 'AMW', 'WPath', 'WPol', 'Zagreb1',
       'Zagreb2', 'mZagreb2'],
      dtype='object', length=1038)

In [75]:
import pickle

#Dataset only DICTrank
for dataset in ["DICTrank"]:
    
    # Exclude hidden files or directories like .ipynb_checkpoints
    if dataset.startswith('.'):
        continue
    print(dataset)

    # Get all the file names for this dataset
    all_files = os.listdir(os.path.join(data_path, dataset))

    # Extract activity names by removing the _train.csv.gz or _test.csv.gz from file names
    activity_names = list(set([f.replace("_train.csv.gz", "").replace("_test.csv.gz", "")  for f in all_files if not f.startswith(".ipynb_checkpoints")]))

    for activity in tqdm(activity_names, desc="Processing activities"):
        
        train_path = os.path.join(data_path, dataset, f"{activity}_train.csv.gz")
        test_path = os.path.join(data_path, dataset, f"{activity}_test.csv.gz")

        train_df = pd.read_csv(train_path, compression='gzip')
        test_df = pd.read_csv(test_path, compression='gzip')
        
        Ser_Mol_train = train_df['Standardized_SMILES'].apply(Chem.MolFromSmiles)
        Mordred_table_train = calc.pandas(Ser_Mol_train)
        Mordred_table_train = Mordred_table_train.astype('float')
               
        Ser_Mol_test = test_df['Standardized_SMILES'].apply(Chem.MolFromSmiles)
        Mordred_table_test = calc.pandas(Ser_Mol_test)
        Mordred_table_test = Mordred_table_test.astype('float')

        # Retain only those columns in the test dataset
        Mordred_table_train = Mordred_table_train[data_columns]
        Mordred_table_test = Mordred_table_test[data_columns]

        X_train = np.array(Mordred_table_train)
        X_test = np.array(Mordred_table_test)
        y_train = train_df[activity]
        y_test = test_df[activity]

      
        # Classification
        model = RandomForestClassifier(n_jobs=40)
            
        # Hyperparameter Optimization
        param_dist_classification = {'max_depth': randint(10, 20),
                          'max_features': randint(40, 50),
                          'min_samples_leaf': randint(5, 15),
                          'min_samples_split': randint(5, 15),
                          'n_estimators':[200, 300, 400, 500, 600],
                          'bootstrap': [True, False],
                          'oob_score': [False],
                          'random_state': [42],
                          'criterion': ['gini', 'entropy'],
                          'n_jobs': [40],
                          'class_weight' : [None, 'balanced']
                         }
        inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)   
            
        classification_search = HalvingRandomSearchCV(
                model,
                param_dist_classification,
                factor=3,
                cv=inner_cv,
                random_state=42,
                verbose=1,
                n_jobs=40)
            
        classification_search.fit(X_train, y_train)
        best_model = classification_search.best_estimator_
            
        # Random Over-sampling 
        sampler = RandomOverSampler(sampling_strategy='auto', random_state=42)
            
        pipeline = Pipeline(steps=[('sampler', sampler), ('model', best_model)])
        pipeline.fit(X_train, y_train)
        pickle.dump(pipeline, open("Physicochemical_model.sav",  'wb'))
        
        # Predict using threshold-optimized model
        probs_test = pipeline.predict_proba(X_test)[:, 1]
            
        oof_predictions = np.zeros(X_train.shape[0])
        oof_probs = np.zeros(X_train.shape[0])

        cv_scores = []

        for train_idx, valid_idx in inner_cv.split(X_train, y_train):
            X_train_fold, y_train_fold = X_train[train_idx], y_train[train_idx]
            X_valid_fold, y_valid_fold = X_train[valid_idx], y_train[valid_idx]

            # Random Over-sampling
            X_resampled, y_resampled = sampler.fit_resample(X_train_fold, y_train_fold)

            # Train the model on the resampled data
            best_model.fit(X_resampled, y_resampled)

            # Store out-of-fold predictions
            oof_predictions[valid_idx] = best_model.predict(X_valid_fold)
            oof_probs[valid_idx] = best_model.predict_proba(X_valid_fold)[:, 1]

            # AUC for this fold
            fold_auc = roc_auc_score(y_valid_fold, oof_probs[valid_idx])
            cv_scores.append(fold_auc)

        # Optimize the threshold using out-of-fold predictions
        best_threshold = optimize_threshold_j_statistic(y_train, oof_probs)
        predictions_test = (probs_test >= best_threshold).astype(int)

        results[activity] = {
                'CV_AUC_mean': np.mean(cv_scores),
                'CV_AUC_std': np.std(cv_scores),
                **evaluate_classifier(y_test, predictions_test, probs_test)
            }
        
        held_out_data = {
            'Dataset': dataset,
            "Actviity": activity,
            'SMILES': test_df['Standardized_SMILES'],
            'True_Value': y_test,
            'Prediction': predictions_test,
            'Probability': probs_test,
            'Best_Threshold': best_threshold
        }
        
        held_out_results.append(pd.DataFrame(held_out_data))         
        # Save results at each step
            
            
    # Save results at each step
    pd.DataFrame(results).T.to_csv('./physicochemical_model_results.csv')
              

# Save results
results_df = pd.DataFrame(results).T.reset_index(drop=False)
results_df = results_df.rename(columns={'index': 'endpoint'})
results_df.to_csv('./physicochemical_model_results.csv', index=False)

# Concatenate and save held-out test set results
pd.concat(held_out_results).to_csv('./physicochemical_model_held_out_test_results.csv', index=False)
            

DICTrank


Processing activities:   0%|                              | 0/1 [00:00<?, ?it/s]
  0%|                                                   | 0/930 [00:00<?, ?it/s][A
  0%|                                           | 1/930 [00:00<09:58,  1.55it/s][A
                                                                                [A
Processing activities:   0%|                              | 0/1 [00:03<?, ?it/s]
  3%|█▎                                        | 28/930 [00:02<01:25, 10.56it/s][A
                                                                                [A
Processing activities:   0%|                              | 0/1 [00:03<?, ?it/s]
                                                                                [A
Processing activities:   0%|                              | 0/1 [00:03<?, ?it/s]
  7%|██▊                                       | 62/930 [00:02<00:58, 14.92it/s][A
                                                                                [A
Proc

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)



 15%|█████▉                                  | 139/930 [00:02<00:07, 105.05it/s][A
                                                                                [A
Processing activities:   0%|                              | 0/1 [00:05<?, ?it/s]
                                                                                [A
Processing activities:   0%|                              | 0/1 [00:05<?, ?it/s]
 21%|████████▋                                | 196/930 [00:04<00:11, 66.52it/s][A
 21%|████████▊                                | 199/930 [00:04<00:19, 37.71it/s][A
 27%|██████████▉                              | 247/930 [00:04<00:11, 59.61it/s][A

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)



 31%|████████████▋                            | 287/930 [00:05<00:07, 81.79it/s][A
 34%|█████████████▉                           | 317/930 [00:07<00:17, 35.75it/s][A
                                                                                [A
Processing activities:   0%|                              | 0/1 [00:10<?, ?it/s]
 42%|█████████████████                        | 388/930 [00:09<00:11, 47.25it/s][A
                                                                                [A
Processing activities:   0%|                              | 0/1 [00:10<?, ?it/s]
                                                                                [A
Processing activities:   0%|                              | 0/1 [00:10<?, ?it/s]
 43%|█████████████████▊                       | 403/930 [00:09<00:14, 35.92it/s][A
                                                                                [A
Processing activities:   0%|                              | 0/1 [00:10<?, ?it/s]
 51

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


                                                                                
Processing activities:   0%|                              | 0/1 [00:14<?, ?it/s]
 52%|█████████████████████▏                   | 480/930 [00:13<00:08, 52.23it/s][A
                                                                                [A
Processing activities:   0%|                              | 0/1 [00:14<?, ?it/s]
                                                                                [A
Processing activities:   0%|                              | 0/1 [00:14<?, ?it/s]
 52%|█████████████████████▍                   | 487/930 [00:13<00:21, 20.98it/s][A
                                                                                [A
Processing activities:   0%|                              | 0/1 [00:15<?, ?it/s]
                                                                                [A
Processing activities:   0%|                              | 0/1 [00:15<?, ?it/s]
          

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


                                                                                
Processing activities:   0%|                              | 0/1 [00:15<?, ?it/s]
                                                                                [A
Processing activities:   0%|                              | 0/1 [00:15<?, ?it/s]
 58%|███████████████████████▊                 | 540/930 [00:14<00:16, 23.80it/s][A
                                                                                [A
Processing activities:   0%|                              | 0/1 [00:15<?, ?it/s]
                                                                                [A
Processing activities:   0%|                              | 0/1 [00:15<?, ?it/s]
                                                                                [A
Processing activities:   0%|                              | 0/1 [00:15<?, ?it/s]
 62%|█████████████████████████▎               | 573/930 [00:14<00:09, 36.71it/s][A
 62%|█████

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


                                                                                
Processing activities:   0%|                              | 0/1 [00:16<?, ?it/s]
                                                                                [A
Processing activities:   0%|                              | 0/1 [00:16<?, ?it/s]
                                                                                [A
Processing activities:   0%|                              | 0/1 [00:16<?, ?it/s]
 63%|█████████████████████████▉               | 588/930 [00:15<00:07, 48.81it/s][A
                                                                                [A
Processing activities:   0%|                              | 0/1 [00:16<?, ?it/s]
 65%|██████████████████████████▍              | 600/930 [00:15<00:09, 33.69it/s][A

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


                                                                                
Processing activities:   0%|                              | 0/1 [00:17<?, ?it/s]
 65%|██████████████████████████▌              | 603/930 [00:16<00:09, 33.69it/s][A

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


                                                                                
Processing activities:   0%|                              | 0/1 [00:20<?, ?it/s]
 65%|██████████████████████████▊              | 609/930 [00:19<00:09, 33.69it/s][A

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


                                                                                
Processing activities:   0%|                              | 0/1 [00:31<?, ?it/s]
                                                                                [A
Processing activities:   0%|                              | 0/1 [00:31<?, ?it/s]
 66%|██████████████████████████▉              | 611/930 [00:30<00:09, 33.69it/s][A
 67%|███████████████████████████▎             | 619/930 [00:30<00:54,  5.73it/s][A
 74%|██████████████████████████████▏          | 685/930 [00:30<00:21, 11.34it/s][A

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)



                                                                                [A
Processing activities:   0%|                              | 0/1 [00:32<?, ?it/s]
 79%|████████████████████████████████▌        | 739/930 [00:31<00:11, 17.10it/s][A

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


                                                                                
Processing activities:   0%|                              | 0/1 [00:33<?, ?it/s]
 81%|█████████████████████████████████▎       | 755/930 [00:32<00:10, 17.10it/s][A
                                                                                [A
Processing activities:   0%|                              | 0/1 [00:33<?, ?it/s]
 83%|██████████████████████████████████▏      | 775/930 [00:32<00:09, 16.74it/s][A

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)



                                                                                [A
Processing activities:   0%|                              | 0/1 [00:34<?, ?it/s]
 86%|███████████████████████████████████▏     | 797/930 [00:33<00:06, 19.62it/s][A

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)



                                                                                [A
Processing activities:   0%|                              | 0/1 [00:46<?, ?it/s]
 87%|███████████████████████████████████▊     | 811/930 [00:45<00:06, 19.62it/s][A
                                                                                [A
Processing activities:   0%|                              | 0/1 [00:46<?, ?it/s]
                                                                                [A
Processing activities:   0%|                              | 0/1 [00:46<?, ?it/s]
 91%|█████████████████████████████████████    | 842/930 [00:45<00:14,  6.09it/s][A

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)



                                                                                [A
Processing activities:   0%|                              | 0/1 [00:47<?, ?it/s]
                                                                                [A
Processing activities:   0%|                              | 0/1 [00:47<?, ?it/s]
                                                                                [A
Processing activities:   0%|                              | 0/1 [00:47<?, ?it/s]
                                                                                [A
Processing activities:   0%|                              | 0/1 [00:47<?, ?it/s]
                                                                                [A
Processing activities:   0%|                              | 0/1 [00:47<?, ?it/s]
100%|█████████████████████████████████████████| 930/930 [00:46<00:00, 19.89it/s][A


  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  s += (eig.vec[i, eig.max] * eig.vec[j, eig.max]) ** -0.5
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)



  0%|                                                    | 0/90 [00:00<?, ?it/s][A
  1%|▍                                           | 1/90 [00:00<00:38,  2.34it/s][A
  4%|█▉                                          | 4/90 [00:00<00:19,  4.39it/s][A
                                                                                [A
Processing activities:   0%|                              | 0/1 [00:50<?, ?it/s]
100%|███████████████████████████████████████████| 90/90 [00:01<00:00, 67.80it/s][A


  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 20
max_resources_: 930
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 46
n_resources: 20
Fitting 5 folds for each of 46 candidates, totalling 230 fits
----------
iter: 1
n_candidates: 16
n_resources: 60
Fitting 5 folds for each of 16 candidates, totalling 80 fits
----------
iter: 2
n_candidates: 6
n_resources: 180
Fitting 5 folds for each of 6 candidates, totalling 30 fits
----------
iter: 3
n_candidates: 2
n_resources: 540
Fitting 5 folds for each of 2 candidates, totalling 10 fits


Processing activities: 100%|██████████████████████| 1/1 [01:07<00:00, 67.76s/it]


In [76]:
held_out_results

[     Dataset  Actviity                                             SMILES  \
 0   DICTrank  DICTrank     O=c1n(CCC[NH+]2CCN(c3cccc(Cl)c3)CC2)nc2ccccn12   
 1   DICTrank  DICTrank                CC(C(=O)[O-])c1ccc(-c2ccccc2)c(F)c1   
 2   DICTrank  DICTrank                 C[NH+](C)CCC=C1c2ccccc2CCc2ccccc21   
 3   DICTrank  DICTrank  CC(=O)[NH+]1CCN(c2ccc(OCC3COC(Cn4ccnc4)(c4ccc(...   
 4   DICTrank  DICTrank                  Cc1nccn1CC1CCc2c(c3ccccc3n2C)C1=O   
 ..       ...       ...                                                ...   
 85  DICTrank  DICTrank                         CCC1(c2ccccc2)C(=O)NCNC1=O   
 86  DICTrank  DICTrank   CCOC(=O)[NH+]1CCC(=C2c3ccc(Cl)cc3CCc3cccnc32)CC1   
 87  DICTrank  DICTrank                 CCCSc1ccc2[n-]c(=NC(=O)OC)[n-]c2c1   
 88  DICTrank  DICTrank          CCC[NH+](CCC)S(=O)(=O)c1ccc(C(=O)[O-])cc1   
 89  DICTrank  DICTrank                 CC[NH+](CC)C(=S)SSC(=S)[NH+](CC)CC   
 
     True_Value  Prediction  Probability  Best_Threshold  
 0 

In [77]:
results_df

Unnamed: 0,endpoint,CV_AUC_mean,CV_AUC_std,Held_out_AUC,Held_out_AUCPR,Held_out_BA,Held_out_F1,Held_out_FN,Held_out_FP,Held_out_MCC,Held_out_Sensitivity,Held_out_Specificity,Held_out_TN,Held_out_TP
0,DICTrank,0.698334,0.064899,0.840615,0.932518,0.690769,0.842105,9.0,12.0,0.397648,0.861538,0.52,13.0,56.0


In [78]:
loaded_rf = pickle.load(open("Physicochemical_model.sav", 'rb'))

In [79]:
params = loaded_rf.get_params()


In [80]:
params

{'memory': None,
 'steps': [('sampler', RandomOverSampler(random_state=42)),
  ('model',
   RandomForestClassifier(bootstrap=False, class_weight='balanced', max_depth=18,
                          max_features=46, min_samples_leaf=5, min_samples_split=5,
                          n_estimators=200, n_jobs=40, random_state=42))],
 'verbose': False,
 'sampler': RandomOverSampler(random_state=42),
 'model': RandomForestClassifier(bootstrap=False, class_weight='balanced', max_depth=18,
                        max_features=46, min_samples_leaf=5, min_samples_split=5,
                        n_estimators=200, n_jobs=40, random_state=42),
 'sampler__random_state': 42,
 'sampler__sampling_strategy': 'auto',
 'sampler__shrinkage': None,
 'model__bootstrap': False,
 'model__ccp_alpha': 0.0,
 'model__class_weight': 'balanced',
 'model__criterion': 'gini',
 'model__max_depth': 18,
 'model__max_features': 46,
 'model__max_leaf_nodes': None,
 'model__max_samples': None,
 'model__min_impurity_decrease

In [81]:
classifier= RandomForestClassifier(bootstrap=False, class_weight='balanced', max_depth=18,
                          max_features=46, min_samples_leaf=5, min_samples_split=5,
                          n_estimators=200, n_jobs=40, random_state=42)

In [82]:
X_train

array([[ 19.77512879,  14.803335  ,   0.        , ..., 140.        ,
        172.        ,   5.        ],
       [ 23.80059255,  17.74158608,   0.        , ..., 154.        ,
        173.        ,   6.88888889],
       [  6.16336313,   6.51589884,   0.        , ...,  36.        ,
         36.        ,   2.11111111],
       ...,
       [  7.95651408,   7.58116421,   0.        , ...,  50.        ,
         55.        ,   2.61111111],
       [  7.38715648,   7.65713981,   0.        , ...,  50.        ,
         58.        ,   2.25      ],
       [  7.80668416,   7.34357946,   2.        , ...,  52.        ,
         61.        ,   2.22222222]])

In [83]:
y_train.shape

(930,)

In [84]:
X_test

array([[ 20.5631158 ,  14.95163401,   0.        , ..., 138.        ,
        162.        ,   5.69444444],
       [ 13.71082758,  11.69985029,   1.        , ...,  90.        ,
        104.        ,   4.02777778],
       [ 16.32047511,  13.12450461,   0.        , ..., 108.        ,
        126.        ,   4.69444444],
       ...,
       [ 13.50397853,  11.40209184,   2.        , ...,  88.        ,
        100.        ,   4.19444444],
       [ 13.80798646,  12.71610245,   1.        , ...,  90.        ,
        103.        ,   4.44444444],
       [ 10.74450109,  11.044544  ,   0.        , ...,  66.        ,
         72.        ,   4.13888889]])

In [85]:
y_test.shape

(90,)

In [86]:
X_all= np.vstack((X_train, X_test))
y_all= np.concatenate((y_train.to_numpy(), y_test.to_numpy()))

In [87]:
classifier.fit(X_all, y_all)

In [96]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from numpy import argmax

inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)   
#Threshold Balancing
cross_val_prob_cp = cross_val_predict(classifier, X_all, y_all, cv=inner_cv, method='predict_proba', n_jobs=-1)[:, 1]
# calculate roc curves
fpr, tpr, thresholds = roc_curve(y_all, cross_val_prob_cp)
# get the best threshold
J = tpr - fpr
ix = argmax(J)
best_thresh_cp = thresholds[ix]
print('Best Threshold=%f' % (best_thresh_cp))

Best Threshold=0.641338


In [97]:
df = pd.read_csv('./DICTrank_ambiguous_compounds.csv.gz')
df

Unnamed: 0.1,Unnamed: 0,Trade Name,Generic/Proper Name(s),Active Ingredient(s),SMILES,DICT _ Concern,Standardized_SMILES,Standardized_InChI
0,0,pancuronium bromide,pancuronium bromide,pancuronium bromide,[Br-].[Br-].[H][C@@]1(C[C@@]2([H])[C@]3([H])CC...,ambiguous,CC(=O)OC1CC2CCC3C(CCC4(C)C3CC([N+]3(C)CCCCC3)C...,InChI=1S/C35H60N2O4/c1-24(38)40-32-21-26-13-14...
1,1,vecuronium bromide,vecuronium bromide,vecuronium bromide,[Br-].CC(=O)O[C@H]1[C@H](C[C@H]2[C@@H]3CC[C@H]...,ambiguous,CC(=O)OC1CC2CCC3C(CCC4(C)C3CC([N+]3(C)CCCCC3)C...,InChI=1S/C34H57N2O4/c1-23(37)39-31-20-25-12-13...
2,2,doxercalciferol,doxercalciferol,doxercalciferol,[H][C@@]1(CC[C@@]2([H])\C(CCC[C@]12C)=C\C=C1\C...,ambiguous,C=C1C(=CC=C2CCCC3(C)C2CCC3C(C)C=CC(C)C(C)C)CC(...,InChI=1S/C28H44O2/c1-18(2)19(3)9-10-20(4)25-13...
3,3,"ergocalciferol capsules,","ergocalciferol capsules,",ergocalciferol,[H][C@@]1(CC[C@@]2([H])\C(CCC[C@]12C)=C\C=C1\C...,ambiguous,C=C1CCC(O)CC1=CC=C1CCCC2(C)C1CCC2C(C)C=CC(C)C(C)C,InChI=1S/C28H44O/c1-19(2)20(3)9-10-22(5)26-15-...
4,4,doxycycline,doxycycline,doxycycline,[H][C@@]12[C@@H](C)C3=C(C(O)=CC=C3)C(=O)C1=C(O...,ambiguous,CC1c2cccc([O-])c2C(=O)C2C(=O)C3(O)C(=O)C(C(N)=...,InChI=1S/C22H24N2O8/c1-7-8-5-4-6-9(25)11(8)16(...
...,...,...,...,...,...,...,...,...
80,80,chlorhexidine gluconate,chlorhexidine gluconate,chlorhexidine gluconate,OC[C@@H](O)[C@@H](O)[C@H](O)[C@@H](O)C(O)=O.OC...,ambiguous,NC(Nc1ccc(Cl)cc1)=[NH+]C(N)=[NH+]CCCCCC[NH2+]C...,InChI=1S/C22H30Cl2N10/c23-15-5-9-17(10-6-15)31...
81,81,dextrose,dextrose,dextrose,OC[C@@H](O)[C@@H](O)[C@H](O)[C@@H](O)C=O,ambiguous,O=C(C(O)CO)C(O)C(O)CO,InChI=1S/C6H12O6/c7-1-3(9)5(11)6(12)4(10)2-8/h...
82,82,didanosine,didanosine,didanosine,OC[C@@H]1CC[C@@H](O1)N1C=NC2=C1N=CNC2=O,ambiguous,O=c1[n-]cnc2c1ncn2C1CCC(CO)O1,InChI=1S/C10H12N4O3/c15-3-6-1-2-7(17-6)14-5-13...
83,83,oxazepam,oxazepam,oxazepam,OC1N=C(C2=CC=CC=C2)C2=CC(Cl)=CC=C2NC1=O,ambiguous,O=C1Nc2ccc(Cl)cc2C(c2ccccc2)NC1=O,InChI=1S/C15H11ClN2O2/c16-10-6-7-12-11(8-10)13...


In [89]:
Ser_Mol = df['Standardized_SMILES'].apply(Chem.MolFromSmiles)
Mordred_table = calc.pandas(Ser_Mol)
Mordred_table = Mordred_table.astype('float')
               
     

# Retain only those columns in the test dataset
Mordred_table = Mordred_table[data_columns]


X = np.array(Mordred_table)

100%|███████████████████████████████████████████| 85/85 [00:03<00:00, 23.12it/s]


  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


In [None]:
probs_test = pipeline.predict_proba(X)[:, 1]
probs_test

array([0.60183532, 0.68570635, 0.78581349, 0.80121627, 0.55528373,
       0.65849603, 0.85357341, 0.53119048, 0.38056259, 0.90015278,
       0.87293452, 0.30986859, 0.54224603, 0.78040476, 0.64391667,
       0.49850794, 0.90287698, 0.6148631 , 0.61646627, 0.57776786,
       0.84180754, 0.87831349, 0.87293452, 0.72064484, 0.72336508,
       0.67365278, 0.63985119, 0.69475   , 0.72061508, 0.81724405,
       0.68836706, 0.53051984, 0.6466627 , 0.80232143, 0.78765476,
       0.74249405, 0.82393056, 0.34971183, 0.54500397, 0.835     ,
       0.69593056, 0.57805556, 0.89027579, 0.74029365, 0.8317877 ,
       0.70241865, 0.71560913, 0.7906131 , 0.75715675, 0.62194048,
       0.6197381 , 0.72748611, 0.92641865, 0.68814881, 0.91735516,
       0.79649206, 0.41717063, 0.86302778, 0.57741468, 0.5636369 ,
       0.62208532, 0.8104623 , 0.94216667, 0.66081349, 0.85518452,
       0.53583532, 0.7405781 , 0.73031548, 0.66081349, 0.6599301 ,
       0.78893849, 0.66220905, 0.83384722, 0.69201984, 0.70543

In [99]:
df["Probability"] = probs_test
df["Prediction"] = (probs_test >= best_threshold).astype(int)

In [100]:
df

Unnamed: 0.1,Unnamed: 0,Trade Name,Generic/Proper Name(s),Active Ingredient(s),SMILES,DICT _ Concern,Standardized_SMILES,Standardized_InChI,Probability,Prediction
0,0,pancuronium bromide,pancuronium bromide,pancuronium bromide,[Br-].[Br-].[H][C@@]1(C[C@@]2([H])[C@]3([H])CC...,ambiguous,CC(=O)OC1CC2CCC3C(CCC4(C)C3CC([N+]3(C)CCCCC3)C...,InChI=1S/C35H60N2O4/c1-24(38)40-32-21-26-13-14...,0.601835,0
1,1,vecuronium bromide,vecuronium bromide,vecuronium bromide,[Br-].CC(=O)O[C@H]1[C@H](C[C@H]2[C@@H]3CC[C@H]...,ambiguous,CC(=O)OC1CC2CCC3C(CCC4(C)C3CC([N+]3(C)CCCCC3)C...,InChI=1S/C34H57N2O4/c1-23(37)39-31-20-25-12-13...,0.685706,1
2,2,doxercalciferol,doxercalciferol,doxercalciferol,[H][C@@]1(CC[C@@]2([H])\C(CCC[C@]12C)=C\C=C1\C...,ambiguous,C=C1C(=CC=C2CCCC3(C)C2CCC3C(C)C=CC(C)C(C)C)CC(...,InChI=1S/C28H44O2/c1-18(2)19(3)9-10-20(4)25-13...,0.785813,1
3,3,"ergocalciferol capsules,","ergocalciferol capsules,",ergocalciferol,[H][C@@]1(CC[C@@]2([H])\C(CCC[C@]12C)=C\C=C1\C...,ambiguous,C=C1CCC(O)CC1=CC=C1CCCC2(C)C1CCC2C(C)C=CC(C)C(C)C,InChI=1S/C28H44O/c1-19(2)20(3)9-10-22(5)26-15-...,0.801216,1
4,4,doxycycline,doxycycline,doxycycline,[H][C@@]12[C@@H](C)C3=C(C(O)=CC=C3)C(=O)C1=C(O...,ambiguous,CC1c2cccc([O-])c2C(=O)C2C(=O)C3(O)C(=O)C(C(N)=...,InChI=1S/C22H24N2O8/c1-7-8-5-4-6-9(25)11(8)16(...,0.555284,0
...,...,...,...,...,...,...,...,...,...,...
80,80,chlorhexidine gluconate,chlorhexidine gluconate,chlorhexidine gluconate,OC[C@@H](O)[C@@H](O)[C@H](O)[C@@H](O)C(O)=O.OC...,ambiguous,NC(Nc1ccc(Cl)cc1)=[NH+]C(N)=[NH+]CCCCCC[NH2+]C...,InChI=1S/C22H30Cl2N10/c23-15-5-9-17(10-6-15)31...,0.762843,1
81,81,dextrose,dextrose,dextrose,OC[C@@H](O)[C@@H](O)[C@H](O)[C@@H](O)C=O,ambiguous,O=C(C(O)CO)C(O)C(O)CO,InChI=1S/C6H12O6/c7-1-3(9)5(11)6(12)4(10)2-8/h...,0.448847,0
82,82,didanosine,didanosine,didanosine,OC[C@@H]1CC[C@@H](O1)N1C=NC2=C1N=CNC2=O,ambiguous,O=c1[n-]cnc2c1ncn2C1CCC(CO)O1,InChI=1S/C10H12N4O3/c15-3-6-1-2-7(17-6)14-5-13...,0.687075,1
83,83,oxazepam,oxazepam,oxazepam,OC1N=C(C2=CC=CC=C2)C2=CC(Cl)=CC=C2NC1=O,ambiguous,O=C1Nc2ccc(Cl)cc2C(c2ccccc2)NC1=O,InChI=1S/C15H11ClN2O2/c16-10-6-7-12-11(8-10)13...,0.727454,1


In [103]:
df.to_csv('./DICTrank_ambiguous_compounds_predictions.csv', index=False)