### **Predicting Drug-induced Liver Injury Part 2 - ML Models**

**Dataset** : Toxicological Sciences, Volume 136, Issue 1, November 2013, Pages 242–249, https://doi.org/10.1093/toxsci/kft189

In [1]:
!pip install rdkit-pypi 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!pip install padelpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!pip install pycm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pycm
  Using cached pycm-3.8-py2.py3-none-any.whl (66 kB)
Collecting art>=1.8
  Using cached art-5.9-py2.py3-none-any.whl (597 kB)
Installing collected packages: art, pycm
Successfully installed art-5.9 pycm-3.8


In [4]:
!pip install -q "tqdm>=4.36.1"

### **Import Libraries**

In [5]:
import os

# Base Libraries
import pandas as pd
import numpy as np
import tqdm


# rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit import rdBase

from rdkit.Chem import Descriptors as des
from rdkit.Chem.Descriptors import qed
from rdkit.ML.Descriptors import MoleculeDescriptors
from padelpy import padeldescriptor

# sklearn Data Transformation
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import power_transform

# sklearn Models
from sklearn.model_selection import train_test_split, ShuffleSplit, KFold, cross_val_score, GridSearchCV 
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

# Metrics
from pycm import*
from sklearn import metrics

### **Read Data**

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
os.chdir('/content/drive/My Drive/')

In [8]:
train_=pd.read_csv('train.csv', sep='\t')
train_.head(3)

Unnamed: 0,isosmiles,STD_SMILES,Murcko_smi,New_ID,label,ClusterNumber,CompoundName,Drug label-based annotation
0,CCCCCOC(=O)NC1=NC(=O)N(C=C1F)[C@H]2[C@@H]([C@@...,CCCCCOC(=O)Nc1nc(=O)n([C@@H]2O[C@H](C)[C@@H](O...,O=c1ncccn1[C@H]1CCCO1,60953_1,1,11,capecitabine,Most DILI-concern
1,C1CN(CCC1C(=O)N)CCCN2C3=CC=CC=C3SC4=C2C=C(C=C4)Cl,NC(=O)C1CCN(CCCN2c3ccccc3Sc3ccc(Cl)cc32)CC1,c1ccc2c(c1)Sc1ccccc1N2CCCN1CCCCC1,6761_1,1,42,pipamazine,Most DILI-concern
2,CC1=CC(=C(C=C1)NC2=C(C=CC=C2Cl)F)CC(=O)O,Cc1ccc(Nc2c(F)cccc2Cl)c(CC(=O)O)c1,c1ccc(Nc2ccccc2)cc1,151166_1,1,2,lumiracoxib,Most DILI-concern


In [9]:
test_=pd.read_csv('test.csv', sep='\t')
test_.head(3)

Unnamed: 0,isosmiles,STD_SMILES,Murcko_smi,New_ID,label,ClusterNumber,CompoundName,Drug label-based annotation
0,COC1=C(C=C(C=C1)CC2=NC=CC3=CC(=C(C=C32)OC)OC)OC,COc1ccc(Cc2nccc3cc(OC)c(OC)cc23)cc1OC,c1ccc(Cc2nccc3ccccc23)cc1,4680_1,1,2,papaverine,Most DILI-concern
1,CO/N=C(\N)/C1=CC=C(C=C1)C2=CC=C(O2)C3=CC=C(C=C...,CO/N=C(\N)c1ccc(-c2ccc(-c3ccc(/C(N)=N/OC)cc3)o...,c1ccc(-c2ccc(-c3ccccc3)o2)cc1,5480200_1,1,49,pafuramidine,Most DILI-concern
2,C=CCOC1=C(C=C(C=C1)CC(=O)O)Cl,C=CCOc1ccc(CC(=O)O)cc1Cl,c1ccccc1,30951_1,1,1,alclofenac,Most DILI-concern


In [10]:
train=train_[['STD_SMILES', 'label']]
print(train.shape)
test=test_[['STD_SMILES', 'label']]
print(test.shape)

(244, 2)
(27, 2)


### **Explore two methods for DILI prediction**

### **1. Morgan Fingerprints**
### **2.** **Physiochemical Properties**
    
    

### **Method 1: Morgan FingerPrints**

In [11]:
train_mols=[Chem.MolFromSmiles (smi) for smi in train['STD_SMILES'] if smi is not None]
test_mols=[Chem.MolFromSmiles (smi) for smi in test['STD_SMILES'] if smi is not None]

In [12]:
train_fp=[AllChem.GetMorganFingerprintAsBitVect(mol,2,nBits=1024) for mol in train_mols]
test_fp=[AllChem.GetMorganFingerprintAsBitVect(mol,2,nBits=1024) for mol in test_mols]

In [13]:
X_train=np.asarray(train_fp, dtype=np.int32)
X_test=np.asarray(test_fp, dtype=np.int32)

In [14]:
y_train=train[['label']]
y_test=test[['label']]

### ***Build Classifier Models***

### ***1. RandomForest Classifier***

In [15]:
rf=RandomForestClassifier(random_state=42)

#### KFold CV

In [16]:
kfold = KFold(n_splits=5, random_state=42, shuffle=True)
rf_cv = cross_val_score(rf, X_train, y_train.values.ravel(), cv=kfold, scoring='accuracy')
print(rf_cv)
print('RF Cross Validation Accuracy:', round(rf_cv.mean(),3))

[0.63265306 0.69387755 0.75510204 0.6122449  0.58333333]
RF Cross Validation Accuracy: 0.655


#### GridSearch CV

In [17]:
rf_param_grid={'n_estimators': [500,700,900,1000],
             'criterion':['entropy', 'gini'],
             'max_features':['sqrt','log2'],
             'max_depth':[50,60,70,80],
             'min_samples_split':[10,20,30],
             'min_samples_leaf':[1,5,7],
             'bootstrap':[False]}

In [18]:
rf_gridCV=GridSearchCV(estimator=rf, param_grid=rf_param_grid, scoring='accuracy', cv=5, n_jobs=-1, verbose=2)

In [19]:
rf_gridCV.fit(X_train,y_train)

Fitting 5 folds for each of 576 candidates, totalling 2880 fits


  self.best_estimator_.fit(X, y, **fit_params)


In [20]:
print(rf_gridCV.best_params_)
print(rf_gridCV.best_score_)

{'bootstrap': False, 'criterion': 'entropy', 'max_depth': 60, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 20, 'n_estimators': 500}
0.6969387755102041


In [21]:
rf_best_grid=rf_gridCV.best_estimator_
rf_best_grid_score=rf_best_grid.score(X_test,y_test)
rf_best_grid_score

0.6296296296296297

### ***2. SVM Classifier*** 

In [22]:
svc=SVC(random_state=42)

#### KFold CV

In [23]:
svc_cv = cross_val_score(svc, X_train, y_train.values.ravel(), cv=kfold, scoring='accuracy')
print(svc_cv)
print('SVC Cross Validation Accuracy:', round(svc_cv.mean(),3))

[0.59183673 0.71428571 0.7755102  0.6122449  0.5625    ]
SVC Cross Validation Accuracy: 0.651


#### GridSearch CV

In [24]:
svc_param_grid = {'kernel':['linear', 'rbf', 'poly'],
                  'C':[0.001, 0.01, 0.1, 1, 10, 100],
                  'gamma':[0.0001, 0.001, 0.001, 0.01, 0.1, 1, 10], 
                  'degree':[2,3,4]}

In [25]:
svc_gridCV=GridSearchCV(estimator=svc, param_grid=svc_param_grid, scoring='accuracy', cv=5, n_jobs=-1, verbose=2)

In [26]:
svc_gridCV.fit(X_train,y_train)

Fitting 5 folds for each of 378 candidates, totalling 1890 fits


  y = column_or_1d(y, warn=True)


In [27]:
print(svc_gridCV.best_params_)
print(svc_gridCV.best_score_)

{'C': 0.01, 'degree': 4, 'gamma': 0.1, 'kernel': 'poly'}
0.7255102040816326


In [28]:
svc_best_grid=svc_gridCV.best_estimator_
svc_best_grid_score=svc_best_grid.score(X_test,y_test)
svc_best_grid_score

0.5925925925925926

### ***3. KNN Classifier***

In [29]:
knn=KNeighborsClassifier()

#### KFoldCV

In [30]:
knn_cv = cross_val_score(knn, X_train, y_train.values.ravel(), cv=kfold, scoring='accuracy')
print(knn_cv)
print('KNN Cross Validation Accuracy:', round(knn_cv.mean(),3))

[0.53061224 0.71428571 0.71428571 0.71428571 0.52083333]
KNN Cross Validation Accuracy: 0.639


#### GridSearch CV

In [31]:
knn_param_grid = { 'n_neighbors' : [5,7,9,11,13,15],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']}

In [32]:
knn_gridCV=GridSearchCV(estimator=knn, param_grid=knn_param_grid, scoring='accuracy', cv=5, n_jobs=-1, verbose=2)

In [33]:
knn_gridCV.fit(X_train,y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


  return self._fit(X, y)


In [34]:
print(knn_gridCV.best_params_)
print(knn_gridCV.best_score_)

{'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'distance'}
0.6927721088435375


In [35]:
knn_best_grid=knn_gridCV.best_estimator_
knn_best_grid_score=knn_best_grid.score(X_test,y_test)
knn_best_grid_score

0.5925925925925926

### ***4. Naive Bayes Classifier***

In [36]:
nb=GaussianNB()

#### KFoldCV

In [37]:
nb_cv = cross_val_score(nb, X_train, y_train.values.ravel(), cv=kfold, scoring='accuracy')
print(nb_cv)
print('KNN Cross Validation Accuracy:', round(nb_cv.mean(),3))

[0.57142857 0.51020408 0.63265306 0.6122449  0.5       ]
KNN Cross Validation Accuracy: 0.565


#### GridSearch CV

In [38]:
nb_param_grid = { 'var_smoothing' : np.logspace(0,-9,num=100)}

In [39]:
nb_gridCV=GridSearchCV(estimator=nb, param_grid=nb_param_grid, scoring='accuracy', cv=5, n_jobs=-1, verbose=2)

In [40]:
nb_gridCV.fit(X_train,y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


  y = column_or_1d(y, warn=True)


In [41]:
print(nb_gridCV.best_params_)
print(nb_gridCV.best_score_)

{'var_smoothing': 0.12328467394420659}
0.6067176870748299


In [42]:
nb_best_grid=nb_gridCV.best_estimator_
nb_best_grid_score=nb_best_grid.score(X_test,y_test)
nb_best_grid_score

0.2962962962962963

### ***5. Ensemble Voting Classifier***

In [43]:
estimators=[('RF', rf_best_grid), ('SVC', svc_best_grid), ('KNN', knn_best_grid), ('GNB', nb_best_grid)]
eclf=VotingClassifier(estimators=estimators, voting='hard')
eclf.fit(X_train,y_train.values.ravel())
y_pred=eclf.predict(X_test)

In [44]:
print('Ensemble Classifier Train data score:', round(eclf.score(X_train,y_train),3))
print('Ensemble Classifier Test data score:', round(eclf.score(X_test,y_test), 3))
print('Ensemble Classifier AUC score:', round(metrics.roc_auc_score(y_test,y_pred), 3))

Ensemble Classifier Train data score: 1.0
Ensemble Classifier Test data score: 0.593
Ensemble Classifier AUC score: 0.557


### ***Scores Comparison***

In [45]:
result_fp=pd.DataFrame ({'Random Forest':[0.697,0.630], 'SVC':[0.726, 0.593], 'KNN':[0.693, 0.593],
                     'Naive Bayes':[0.607, 0.296], 'Ensemble Voting':[0.992, 0.630]}, index=['Train Data', 'Test Data'])

In [46]:
result_fp

Unnamed: 0,Random Forest,SVC,KNN,Naive Bayes,Ensemble Voting
Train Data,0.697,0.726,0.693,0.607,0.992
Test Data,0.63,0.593,0.593,0.296,0.63


### **Method 2: Physiochemical Properties**

In [47]:
train[['STD_SMILES', 'label']].to_csv('train.smi', sep='\t', header=None, index=None )
test[['STD_SMILES', 'label']].to_csv('test.smi', sep='\t', header=None, index=None )

### ***Get PaDEL Descriptors***

In [None]:
padeldescriptor(mol_dir='train.smi', d_file='train-desc.csv', d_2d=True, d_3d=True, maxruntime=10000, waitingjobs=10)

In [None]:
padeldescriptor(mol_dir='test.smi', d_file='test-desc.csv', d_2d=True, d_3d=True, maxruntime=10000, waitingjobs=10)

### ***Train data preprocessing***

In [50]:
train_desc=pd.read_csv('train-desc.csv')
print(train_desc.shape)
train_desc.head(3)

(244, 1876)


Unnamed: 0,Name,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,1,0.0,-2.0755,4.3077,64.9718,50.405239,6,6,48,25,...,,,,,,,,,,
1,1,0.0,-0.2358,0.055602,57.8518,62.145032,12,12,51,27,...,,,,,,,,,,
2,1,1.0,1.2406,1.539088,27.0587,40.509309,12,12,33,20,...,,,,,,,,,,


In [None]:
train_desc.dropna(axis=1, how='all', inplace=True)
train_desc=train_desc.replace([np.inf, -np.inf], np.nan)
train_desc=train_desc.fillna(train_desc.mean()).round(3)
train_desc=train_desc.replace(np.inf, np.nan)
train_desc['gmin']=train_desc['gmin'].fillna(train_desc['gmin'].mean())

In [52]:
scaler=MinMaxScaler()

In [53]:
Xtrain=power_transform(train_desc.iloc[0:,1:], method='yeo-johnson')
Xtrain=scaler.fit_transform(Xtrain)
Xtrain

array([[0.        , 0.47237632, 0.62143762, ..., 0.49516838, 0.4156782 ,
        0.49949907],
       [0.        , 0.54608835, 0.03084382, ..., 0.53645037, 0.52030599,
        0.5488032 ],
       [0.98581726, 0.65094663, 0.41651154, ..., 0.41472872, 0.55528021,
        0.42256086],
       ...,
       [0.        , 0.49847239, 0.44432469, ..., 0.41472872, 0.48832133,
        0.34478274],
       [0.        , 0.58357415, 0.07678778, ..., 0.3925051 , 0.64464571,
        0.44413231],
       [0.        , 0.81180598, 0.72170619, ..., 0.47129417, 0.58786806,
        0.5102441 ]])

In [54]:
ytrain=train_desc[['Name']]
print(Xtrain.shape)
print(ytrain.shape)

(244, 1444)
(244, 1)


### ***Test Data Preprocessing***

In [55]:
test_desc=pd.read_csv('test-desc.csv')
print(test_desc.shape)
test_desc.head(3)

(27, 1876)


Unnamed: 0,Name,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,1,0,0.114,0.012996,34.0178,53.510653,16,17,46,25,...,,,,,,,,,,
1,1,0,-1.2109,1.466279,33.6479,55.34186,17,19,47,27,...,,,,,,,,,,
2,1,1,1.1043,1.219478,33.8626,31.280723,6,6,26,15,...,,,,,,,,,,


In [56]:
test_desc.dropna(axis=1, how='all', inplace=True)
test_desc=test_desc.round(3)
test_desc.dropna(how='any', inplace=True)

In [57]:
Xtest=power_transform(test_desc.iloc[0:,1:], method='yeo-johnson')
Xtest=scaler.fit_transform(Xtest)
Xtest

array([[0.        , 0.5650054 , 0.0094331 , ..., 0.74809095, 0.48845012,
        0.66557812],
       [0.        , 0.39489114, 0.50663176, ..., 0.70432078, 0.61057944,
        0.72723734],
       [1.        , 0.69745208, 0.46097294, ..., 0.17168411, 0.40625753,
        0.18644029],
       ...,
       [0.        , 0.52879033, 0.0193748 , ..., 0.70432078, 0.63378708,
        0.63458708],
       [0.        , 0.32620567, 0.68859394, ..., 0.42769146, 0.51049298,
        0.49366965],
       [0.        , 0.41825376, 0.42536756, ..., 0.89709254, 0.29212722,
        0.69646033]])

In [58]:
ytest=test_desc[['Name']]
print(Xtest.shape)
print(ytest.shape)

(26, 1444)
(26, 1)


### ***Build Classifier Models***

In [59]:
classifier=[]
rf=RandomForestClassifier(random_state=42)
classifier.append(rf)
svc=SVC(random_state=42)
classifier.append(svc)
knn=KNeighborsClassifier()
classifier.append(knn)
nb=GaussianNB()
classifier.append(nb)

In [60]:
name=['Random Forest', 'SVC', 'KNN', 'Naive Bayes']

In [61]:
parameters=[]
rf_param_grid_={'n_estimators': [500,700,900,1000],
             'criterion':['entropy', 'gini'],
             'max_features':['sqrt','log2'],
             'max_depth':[50,60,70,80],
             'min_samples_split':[10,20,30],
             'min_samples_leaf':[1,5,7],
             'bootstrap':[False]}
parameters.append(rf_param_grid_)
svc_param_grid_ = {'kernel':['linear', 'rbf', 'poly'],
                  'C':[0.001, 0.01, 0.1, 1, 10, 100],
                  'gamma':[0.0001, 0.001, 0.001, 0.01, 0.1, 1, 10], 
                  'degree':[2,3,4]}
parameters.append(svc_param_grid_)
knn_param_grid_ = { 'n_neighbors' : [5,7,9,11,13,15],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']}
parameters.append(knn_param_grid_)
nb_param_grid_ = { 'var_smoothing' : np.logspace(0,-9,num=100)}
parameters.append(nb_param_grid_)

In [62]:
from tqdm import tqdm

def gridSearchCV(model, name, params, count):
    best_models=[]
    for i in tqdm(range(0,count)):
        model_grid=GridSearchCV(estimator=model[i], param_grid=params[i], scoring='accuracy', cv=5, n_jobs=-1, verbose=2)
        model_grid.fit(Xtrain,ytrain.values.ravel())
        best_models.append(model_grid.best_estimator_)
        print(name[i],'Accuracy:')
        print('Train Data:', model_grid.best_score_)
        print('Test Data:', model_grid.best_estimator_.score(Xtest,ytest))
    return best_models   


best_model_list=gridSearchCV(classifier, name, parameters, 4)

  0%|          | 0/4 [00:00<?, ?it/s]

Fitting 5 folds for each of 576 candidates, totalling 2880 fits


 25%|██▌       | 1/4 [1:41:32<5:04:38, 6092.79s/it]

Random Forest Accuracy:
Train Data: 0.7335884353741496
Test Data: 0.6923076923076923
Fitting 5 folds for each of 378 candidates, totalling 1890 fits


 50%|█████     | 2/4 [1:42:37<1:24:53, 2546.92s/it]

SVC Accuracy:
Train Data: 0.6761904761904761
Test Data: 0.7307692307692307
Fitting 5 folds for each of 36 candidates, totalling 180 fits


Exception ignored on calling ctypes callback function: <function ThreadpoolController._find_libraries_with_dl_iterate_phdr.<locals>.match_library_callback at 0x7f5244302040>
Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/threadpoolctl.py", line 584, in match_library_callback
    self._make_controller_from_path(filepath)
  File "/usr/local/lib/python3.9/dist-packages/threadpoolctl.py", line 683, in _make_controller_from_path
    filepath = _realpath(filepath)
  File "/usr/local/lib/python3.9/dist-packages/threadpoolctl.py", line 127, in _realpath
    return os.path.realpath(filepath)
  File "/usr/lib/python3.9/posixpath.py", line 393, in realpath
    return abspath(path)
  File "/usr/lib/python3.9/posixpath.py", line 380, in abspath
    cwd = os.getcwd()
OSError: [Errno 107] Transport endpoint is not connected
 75%|███████▌  | 3/4 [1:42:41<23:05, 1385.78s/it]  

KNN Accuracy:
Train Data: 0.7087585034013606
Test Data: 0.6538461538461539
Fitting 5 folds for each of 100 candidates, totalling 500 fits


100%|██████████| 4/4 [1:42:44<00:00, 1541.18s/it]

Naive Bayes Accuracy:
Train Data: 0.6188775510204081
Test Data: 0.6538461538461539





### ***Ensemble Voting Classifier***

In [69]:
best_model_list

[RandomForestClassifier(bootstrap=False, max_depth=50, min_samples_leaf=7,
                        min_samples_split=30, n_estimators=1000,
                        random_state=42),
 SVC(C=10, degree=2, gamma=0.01, random_state=42),
 KNeighborsClassifier(n_neighbors=7),
 GaussianNB(var_smoothing=0.08111308307896872)]

In [71]:
estimators_=[('RF', best_model_list[0]), ('SVC', best_model_list[1]), ('KNN', best_model_list[2]), ('GNB', best_model_list[3])]
eclf_=VotingClassifier(estimators=estimators_, voting='hard')
eclf_.fit(Xtrain,ytrain.values.ravel())
ypred=eclf_.predict(Xtest)

In [72]:
print('Ensemble Classifier Train data score:', round(eclf_.score(Xtrain,ytrain),3))
print('Ensemble Classifier Test data score:', round(eclf_.score(Xtest,ytest), 3))
print('Ensemble Classifier AUC score:', round(metrics.roc_auc_score(ytest,ypred), 3))

Ensemble Classifier Train data score: 0.951
Ensemble Classifier Test data score: 0.692
Ensemble Classifier AUC score: 0.675


### ***Scores Comparison***

In [73]:
result_desc=pd.DataFrame ({'Random Forest':[0.734,0.692], 'SVC':[0.676, 0.730], 'KNN':[0.709, 0.654],
                     'Naive Bayes':[0.619, 0.654], 'Ensemble Voting':[0.951, 0.692]}, index=['Train Data', 'Test Data'])

In [74]:
result_desc

Unnamed: 0,Random Forest,SVC,KNN,Naive Bayes,Ensemble Voting
Train Data,0.734,0.676,0.709,0.619,0.951
Test Data,0.692,0.73,0.654,0.654,0.692
