In [1]:
import tensorflow as tf

physical_devices = tf.config.list_physical_devices('GPU')
try:
  tf.config.experimental.set_memory_growth(physical_devices[0], True)
except:
  # Invalid device or cannot modify virtual devices once initialized.
  pass 


In [2]:
!pip install rdkit-pypi 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rdkit-pypi
  Downloading rdkit_pypi-2022.9.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.3/29.3 MB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit-pypi
Successfully installed rdkit-pypi-2022.9.4


In [3]:
!pip install molvs

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting molvs
  Downloading MolVS-0.1.1.tar.gz (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.9/61.9 KB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: molvs
  Building wheel for molvs (setup.py) ... [?25l[?25hdone
  Created wheel for molvs: filename=MolVS-0.1.1-py3-none-any.whl size=32385 sha256=1acc2e4001c1a1512c906c2169aaa4fa3e9e545baf7ee12d42f75865034d33a2
  Stored in directory: /root/.cache/pip/wheels/f5/0b/44/a9ee577a6de1da52be0d5acbc0940957446cc208b808c2f6c4
Successfully built molvs
Installing collected packages: molvs
Successfully installed molvs-0.1.1


In [4]:
!pip install pycm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pycm
  Downloading pycm-3.7-py2.py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.0/63.0 KB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting art>=1.8
  Downloading art-5.8-py2.py3-none-any.whl (595 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m595.7/595.7 KB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: art, pycm
Successfully installed art-5.8 pycm-3.7


### Import Libraries and data file

In [5]:
import os
import numpy as np
import pandas as pd

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit import rdBase
from molvs import standardize_smiles

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import KFold, StratifiedKFold, ShuffleSplit
from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
os.chdir('/content/drive/My Drive/')

In [8]:
x=pd.read_csv('erk2.csv')
x.head(2)

Unnamed: 0,Molecule ChEMBL ID,Molecule Name,Molecule Max Phase,Molecular Weight,#RO5 Violations,AlogP,Compound Key,Smiles,Standard Type,Standard Relation,...,Target Name,Target Organism,Target Type,Document ChEMBL ID,Source ID,Source Description,Document Journal,Document Year,Cell ChEMBL ID,Properties
0,CHEMBL440356,,0.0,243.06,0,1.09,2,O=C1CCNC(=O)c2[nH]c(Br)cc21,IC50,'=',...,MAP kinase ERK2,Homo sapiens,SINGLE PROTEIN,CHEMBL1135814,1.0,Scientific Literature,J. Med. Chem.,2002.0,,
1,CHEMBL260417,,0.0,373.21,0,3.44,SB-725317,O=C(Nc1n[nH]c2nc(-c3ccc(O)cc3)c(Br)cc12)C1CC1,Inhibition,'=',...,MAP kinase ERK2,Homo sapiens,SINGLE PROTEIN,CHEMBL1961873,16.0,GSK Published Kinase Inhibitor Set,,,,


### Data Preprocessing

In [9]:
x1=x[['Molecule ChEMBL ID','Smiles', 'Standard Value', 'Standard Units']]
x1=x1[x1['Standard Units'].str.contains('nM', na=False)]
x1.head(3)

Unnamed: 0,Molecule ChEMBL ID,Smiles,Standard Value,Standard Units
0,CHEMBL440356,O=C1CCNC(=O)c2[nH]c(Br)cc21,539.0,nM
2,CHEMBL213451,CCNc1nnc2ccc(-c3ocnc3-c3ccc(F)cc3)cn12,10000.0,nM
4,CHEMBL214198,CC(C)c1nnc2ccc(-c3c[nH]nc3-c3cc(F)ccc3F)cn12,10000.0,nM


In [10]:
x1.drop_duplicates(keep='first', inplace=True)
x1.dropna(inplace=True)
x1.shape

(18756, 4)

In [11]:
x1['New Std_value']=x1.groupby('Molecule ChEMBL ID')['Standard Value'].transform('mean')
x1.drop_duplicates('Molecule ChEMBL ID', keep='first', inplace=True)
x1=x1.drop(['Standard Value', 'Standard Units'], axis=1)
x1=x1.sort_values('New Std_value', ascending=True).reset_index(drop=True)
print(x1.shape)
x1.head(3)

(17739, 3)


Unnamed: 0,Molecule ChEMBL ID,Smiles,New Std_value
0,CHEMBL4868141,Nc1ncc(-c2ccc(NS(=O)(=O)C3CC3)cc2OC2CCCCC2)cc1...,-29600.0
1,CHEMBL4115001,Nc1ncc([C@@H]2CC[C@@H](O)[C@H](O)C2)nc1-c1ccc(...,0.00431
2,CHEMBL4111166,NC[C@@H](NC(=O)c1ccc(-c2nc([C@@H]3CC[C@@H](O)[...,0.005


In [12]:
x1['Label']=x1['New Std_value'].apply(lambda x: 1 if x<=10000 else 0)
x1.head(3)

Unnamed: 0,Molecule ChEMBL ID,Smiles,New Std_value,Label
0,CHEMBL4868141,Nc1ncc(-c2ccc(NS(=O)(=O)C3CC3)cc2OC2CCCCC2)cc1...,-29600.0,1
1,CHEMBL4115001,Nc1ncc([C@@H]2CC[C@@H](O)[C@H](O)C2)nc1-c1ccc(...,0.00431,1
2,CHEMBL4111166,NC[C@@H](NC(=O)c1ccc(-c2nc([C@@H]3CC[C@@H](O)[...,0.005,1


In [13]:
std_smiles=[standardize_smiles(smi) for smi in x1['Smiles'] if smi is not None]
std_smiles_df=pd.DataFrame(std_smiles, columns=['Std_Smiles'])

In [14]:
x2=pd.concat([x1[['Molecule ChEMBL ID','New Std_value', 'Label']],std_smiles_df], axis=1)
x2=x2[['Molecule ChEMBL ID','Std_Smiles','New Std_value','Label']]
x2.head(3)

Unnamed: 0,Molecule ChEMBL ID,Std_Smiles,New Std_value,Label
0,CHEMBL4868141,Nc1ncc(-c2ccc(NS(=O)(=O)C3CC3)cc2OC2CCCCC2)cc1...,-29600.0,1
1,CHEMBL4115001,Nc1ncc([C@@H]2CC[C@@H](O)[C@H](O)C2)nc1-c1ccc(...,0.00431,1
2,CHEMBL4111166,NC[C@@H](NC(=O)c1ccc(-c2nc([C@@H]3CC[C@@H](O)[...,0.005,1


### Generate Morgan Fingerprints

In [15]:
x2[['Std_Smiles','Label']].to_csv('erk2_binary.smi', sep='\t', header=None, index=None)

In [16]:
supplier=Chem.SmilesMolSupplier('erk2_binary.smi', delimiter='\t', titleLine=None)

In [17]:
morgan_fp=[AllChem.GetMorganFingerprintAsBitVect(mol,2,nBits=1024) for mol in supplier]

In [18]:
labels=[mol.GetProp('_Name') for mol in supplier]

In [19]:
fp_array=np.asarray(morgan_fp, dtype=np.int32)
labels_array=np.asarray(labels, dtype=np.int32).reshape(-1,1)

In [20]:
combined=np.concatenate([fp_array,labels_array], axis=1)
np.savetxt('erk2_fps_labelled.csv', combined, delimiter=',')

In [21]:
x3=pd.read_csv('erk2_fps_labelled.csv', sep=',', header=None)
x3.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1015,1016,1017,1018,1019,1020,1021,1022,1023,1024
0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0


### Train-Test split

In [22]:
X=x3.iloc[:,0:1024]
y=x3.iloc[:,1024]

In [23]:
print('Features dimension:', X.shape)
print('Labels dimension:', y.shape)

Features dimension: (17739, 1024)
Labels dimension: (17739,)


In [24]:
X_train,X_test,y_train,y_test=train_test_split(X, y , test_size=0.2, random_state=42, shuffle=True)

### Build Model : Random Forest Classifier

In [25]:
rf=RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train.ravel())

RandomForestClassifier(random_state=42)

In [26]:
rf_pred=rf.predict(X_test)

In [27]:
!pip install pycm
from pycm import *

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [28]:
pycm_=ConfusionMatrix(y_test.ravel(),rf_pred)
print(pycm_)

Predict    0.0        1.0        
Actual
0.0        2041       81         

1.0        675        751        





Overall Statistics : 

95% CI                                                            (0.77345,0.8004)
ACC Macro                                                         0.78692
ARI                                                               0.32181
AUNP                                                              0.74424
AUNU                                                              0.74424
Bangdiwala B                                                      0.68055
Bennett S                                                         0.57384
CBA                                                               0.63906
CSI                                                               0.5713
Chi-Squared                                                       1133.65955
Chi-Squared DF                                                    1
Conditional Entropy                        

Accuracy = 0.786

AUC score = 0.744

### KFold Cross Validation

In [29]:
kfold = KFold(n_splits=5, random_state=42, shuffle=True)
rf_cv = cross_val_score(rf, X_train, y_train, cv=kfold, scoring='accuracy')
rf_cv

array([0.7879535 , 0.76814658, 0.77554616, 0.77589852, 0.77730796])

In [30]:
print('Cross Validation Accuracy:', round(rf_cv.mean(),3))

Cross Validation Accuracy: 0.777


### Stratified KFold Cross Validation

In [31]:
skf=StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

In [32]:
acc=[]

for train,test in skf.split(X_train,y_train):
    Xtrain=X_train.iloc[train]
    Xtest=X_train.iloc[test]
    ytrain=y_train.iloc[train]
    ytest=y_train.iloc[test]
    rf_skf=rf.fit(Xtrain,ytrain)
    y_pred=rf_skf.predict(Xtest)
    result=accuracy_score(ytest,y_pred)
    acc.append(result)

print(acc)

[0.7742162733356815, 0.7716701902748414, 0.7720225510923185, 0.7783650458069062, 0.7836504580690627]


In [33]:
print('Stratified KFold Accuracy:', round(np.asarray(acc).mean(),3))

Stratified KFold Accuracy: 0.776


### Using Shufflesplit

In [34]:
cv=ShuffleSplit(n_splits=5, test_size=0.20, random_state=42)

In [35]:
rf_cv_shuffle=cross_val_score(rf, X_train, y_train, cv=cv, n_jobs=-1, scoring='accuracy')
rf_cv_shuffle

array([0.78724903, 0.77034167, 0.77879535, 0.77210285, 0.78266995])

In [36]:
print('ShuffleSplit Accuracy:', round(np.asarray(rf_cv_shuffle).mean(),3))

ShuffleSplit Accuracy: 0.778


### RandomizedSearch CV

In [37]:
random_grid={'n_estimators': [int(x) for x in np.linspace(50,1000,10)],
             'criterion':['gini','entropy'],
             'max_features':['sqrt','log2'],
             'max_depth':[int(x) for x in np.linspace(5,100,10)],
             'min_samples_split':[2,5,10,20,50],
             'min_samples_leaf':[1,5,10,20],
             'bootstrap':[True,False]}

In [38]:
rf_randomCV=RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=50, scoring='accuracy', cv=5, n_jobs=-1, verbose=2)

In [39]:
rf_randomCV.fit(X_train,y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits




RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
                   n_iter=50, n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'criterion': ['gini', 'entropy'],
                                        'max_depth': [5, 15, 26, 36, 47, 57, 68,
                                                      78, 89, 100],
                                        'max_features': ['sqrt', 'log2'],
                                        'min_samples_leaf': [1, 5, 10, 20],
                                        'min_samples_split': [2, 5, 10, 20, 50],
                                        'n_estimators': [50, 155, 261, 366, 472,
                                                         577, 683, 788, 894,
                                                         1000]},
                   scoring='accuracy', verbose=2)

In [40]:
rf_randomCV.best_params_

{'n_estimators': 894,
 'min_samples_split': 20,
 'min_samples_leaf': 1,
 'max_features': 'log2',
 'max_depth': 68,
 'criterion': 'entropy',
 'bootstrap': False}

In [57]:
rf_randomCV.best_score_

0.7816219817546848

In [49]:
best_random=rf_randomCV.best_estimator_
best_score=best_random.score(X_test,y_test)
best_score

0.7883314543404735

### GridSearch CV

In [50]:
param_grid={'n_estimators': [700,800,900,100],
             'criterion':['entropy'],
             'max_features':['log2'],
             'max_depth':[50,60,70,80],
             'min_samples_split':[10,20,30],
             'min_samples_leaf':[1,5,7],
             'bootstrap':[False]}

In [51]:
rf_gridCV=GridSearchCV(estimator=rf, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=-1, verbose=2)

In [52]:
rf_gridCV.fit(X_train,y_train)

Fitting 5 folds for each of 144 candidates, totalling 720 fits




GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42), n_jobs=-1,
             param_grid={'bootstrap': [False], 'criterion': ['entropy'],
                         'max_depth': [50, 60, 70, 80],
                         'max_features': ['log2'],
                         'min_samples_leaf': [1, 5, 7],
                         'min_samples_split': [10, 20, 30],
                         'n_estimators': [700, 800, 900, 100]},
             scoring='accuracy', verbose=2)

In [56]:
rf_gridCV.best_params_

{'bootstrap': False,
 'criterion': 'entropy',
 'max_depth': 70,
 'max_features': 'log2',
 'min_samples_leaf': 1,
 'min_samples_split': 20,
 'n_estimators': 900}

In [58]:
rf_gridCV.best_score_

0.7814811615421067

In [59]:
best_grid=rf_gridCV.best_estimator_
best_grid_score=best_grid.score(X_test,y_test)
best_grid_score

0.7891770011273957

In [55]:
import pickle

with open('rf_model_erk2_binary.pkl', 'wb') as f:
  pickle.dump(best_grid, f)

### Score Comparsion





In [61]:
result=pd.DataFrame({'Model':['RandomForest', 'KFoldCV', 'StratifiedKFold', 'RandomizedSearchCV', 'GridSearchCV'],
                     'Accuracy':[0.786, 0.777, 0.776, 0.782, 0.781]})
result

Unnamed: 0,Model,Accuracy
0,RandomForest,0.786
1,KFoldCV,0.777
2,StratifiedKFold,0.776
3,RandomizedSearchCV,0.782
4,GridSearchCV,0.781
