In [1]:
%matplotlib inline
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [2]:
def _load_split(csv_path):
    """Read one data split from CSV and featurize its molecules.

    The ``Smiles`` column is parsed with RDKit and every molecule is turned
    into a Morgan bit-vector fingerprint of radius 2; the ``Activity`` column
    is returned as a plain Python list of labels.

    Parameters
    ----------
    csv_path : str
        Path of a CSV file with ``Smiles`` and ``Activity`` columns.

    Returns
    -------
    tuple
        ``(frame, mols, fingerprints, labels)``.

    Raises
    ------
    ValueError
        If any SMILES string cannot be parsed. ``Chem.MolFromSmiles`` returns
        ``None`` in that case; failing here reports the offending rows
        instead of an opaque error from the fingerprint call.
    """
    frame = pd.read_csv(csv_path)
    mols = [Chem.MolFromSmiles(smi) for smi in frame['Smiles']]
    bad_rows = [pos for pos, mol in enumerate(mols) if mol is None]
    if bad_rows:
        raise ValueError(
            f'{csv_path}: could not parse SMILES at row positions {bad_rows}'
        )
    fingerprints = [AllChem.GetMorganFingerprintAsBitVect(mol, 2) for mol in mols]
    labels = frame['Activity'].values.tolist()
    return frame, mols, fingerprints, labels


df_train, X_t, Xtrain, Ytrain = _load_split('train.csv')

# The validation frame gets its own name; it used to be read into df_train,
# which silently replaced the training frame.
df_val, X_v, Xtest, Ytest = _load_split('val.csv')

In [6]:
from sklearn.model_selection import GridSearchCV
# Scan the number of trees (100, 200, ..., 1000) with every other forest
# setting left at its default, scored by 5-fold cross-validated ROC AUC.
# NOTE(review): the forest is built without random_state, so the selected
# value may differ between runs.
param_test1 = dict(n_estimators=range(100, 1050, 100))
gsearch1 = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=param_test1,
    scoring='roc_auc',
    cv=5,
)
gsearch1.fit(Xtrain, Ytrain)
(gsearch1.best_params_, gsearch1.best_score_)

({'n_estimators': 700}, 0.9280989931375057)

In [9]:
# With n_estimators held at 700, scan max_depth over 2, 4, ..., 30 using
# 5-fold cross-validated ROC AUC.
param_test1 = dict(max_depth=range(2, 31, 2))
gsearch1 = GridSearchCV(
    estimator=RandomForestClassifier(n_estimators=700),
    param_grid=param_test1,
    scoring='roc_auc',
    cv=5,
)
gsearch1.fit(Xtrain, Ytrain)
(gsearch1.best_params_, gsearch1.best_score_)

({'max_depth': 26}, 0.9269853781006814)

In [10]:
# With n_estimators=700 and max_depth=26 held fixed, scan max_features over
# 5, 7, ..., 21 using 5-fold cross-validated ROC AUC.
param_test1 = dict(max_features=range(5, 23, 2))
gsearch1 = GridSearchCV(
    estimator=RandomForestClassifier(n_estimators=700, max_depth=26),
    param_grid=param_test1,
    scoring='roc_auc',
    cv=5,
)
gsearch1.fit(Xtrain, Ytrain)
(gsearch1.best_params_, gsearch1.best_score_)

({'max_features': 17}, 0.9269690275447626)

In [11]:
# With the tree count, depth and feature count held fixed, scan
# min_samples_leaf over the odd values 1, 3, ..., 99 using 5-fold
# cross-validated ROC AUC.
param_test1 = dict(min_samples_leaf=range(1, 101, 2))
gsearch1 = GridSearchCV(
    estimator=RandomForestClassifier(
        n_estimators=700,
        max_depth=26,
        max_features=17,
    ),
    param_grid=param_test1,
    scoring='roc_auc',
    cv=5,
)
gsearch1.fit(Xtrain, Ytrain)
(gsearch1.best_params_, gsearch1.best_score_)

({'min_samples_leaf': 1}, 0.9268272634856543)

In [12]:
# With the previously fixed forest settings, scan min_samples_split over
# 2, 4, ..., 20 using 5-fold cross-validated ROC AUC.
param_test1 = dict(min_samples_split=range(2, 22, 2))
gsearch1 = GridSearchCV(
    estimator=RandomForestClassifier(
        n_estimators=700,
        max_depth=26,
        max_features=17,
        min_samples_leaf=1,
    ),
    param_grid=param_test1,
    scoring='roc_auc',
    cv=5,
)
gsearch1.fit(Xtrain, Ytrain)
(gsearch1.best_params_, gsearch1.best_score_)

({'min_samples_split': 2}, 0.9263474694552658)

In [15]:
# With all numeric forest settings fixed, compare the three split criteria
# using 5-fold cross-validated ROC AUC.
param_test1 = dict(criterion=['gini', 'entropy', 'log_loss'])
gsearch1 = GridSearchCV(
    estimator=RandomForestClassifier(
        n_estimators=700,
        max_depth=26,
        max_features=17,
        min_samples_leaf=1,
        min_samples_split=2,
    ),
    param_grid=param_test1,
    scoring='roc_auc',
    cv=5,
)
gsearch1.fit(Xtrain, Ytrain)
(gsearch1.best_params_, gsearch1.best_score_)

({'criterion': 'gini'}, 0.9276529073184141)

In [42]:
# Final model: fit on the full training fingerprints with the settings used
# in the grid-search cells (criterion stays at its default).
model = RandomForestClassifier(
    n_estimators=700,
    max_depth=26,
    max_features=17,
    min_samples_leaf=1,
    min_samples_split=2,
    # Fixed seed so the fitted forest, and every metric and probability
    # derived from it, is the same on each Restart & Run All. Without it the
    # bootstrap samples and feature draws differ per run. The value itself is
    # arbitrary.
    random_state=42,
)
# fit() returns the estimator, so rfc and model refer to the same object.
rfc = model.fit(Xtrain, Ytrain)

In [43]:
from sklearn import metrics
# Score the held-out fingerprints with the fitted forest. Column 1 of
# predict_proba is the probability of the second entry in rfc.classes_
# (presumably Activity == 1 -- confirm against the label encoding); it is the
# score later passed to the AUC computation. predict gives the hard labels.
y_test_predprob = rfc.predict_proba(Xtest)[:, 1]
Ypre = rfc.predict(Xtest)

def met(Ytest, Ypre, y_test_predprob):
    """Print five classification metrics, one per line, to three decimals.

    Parameters
    ----------
    Ytest : true labels.
    Ypre : predicted hard labels, used for ACC, PRE, TPR and MCC.
    y_test_predprob : predicted scores, used for AUC only.

    Returns
    -------
    None
        The metrics are only printed, in the order ACC, PRE, TPR, MCC, AUC.
    """
    # Metrics that compare true labels with hard predictions, paired with the
    # exact line template each one is reported with.
    label_reports = (
        ("ACC: %.3f", metrics.accuracy_score),
        ("PRE: %.3f", metrics.precision_score),
        ("TPR: %.3f", metrics.recall_score),
        ("MCC: %.3f", metrics.matthews_corrcoef),
    )
    for template, scorer in label_reports:
        print(template % scorer(Ytest, Ypre))

    # AUC is the only metric computed from scores instead of hard labels.
    print("AUC: %.3f" % metrics.roc_auc_score(Ytest, y_test_predprob))


# Print ACC / PRE / TPR / MCC / AUC for the predictions on the val.csv split.
met(Ytest, Ypre,y_test_predprob)

ACC: 0.869
PRE: 0.866
TPR: 0.907
MCC: 0.734
AUC: 0.938


In [46]:
# Query compounds to score, written as (label, SMILES) pairs so that each
# name sits next to its structure.
_compounds = [
    ('12o', 'NC1=NC(C(NC2=CC=CC=N2)=O)=CN3C1=NC4=CC=CC=C34'),
    ('ABA-1266', 'FC(F)(F)c(c1)ccc2c1nc(n23)c(N)nc(c3)Cc4cccc(c4)C(=O)NCCO'),
    ('ABN-2100', 'CNC(=O)CNc1cccc(c1)Cc(c2)nc(N)c(n23)nc4c3ccc(c4)C'),
    ('AAL-9931', 'OCCNC(=O)c(c1)nc(N)c(n12)nc3c2ccc(c3)C(F)(F)F'),
    ('AAR-10704', 'CCc(cc1)cc(c12)nc3n2cc(nc3N)C(=O)NOc4ccccc4O'),
    ('ABAN-835', 'FC(F)(F)c(c1)ccc2c1nc(n23)c(N)nc(c3)Cc4cccc(c4)NC(=O)NCc5cccc(Cl)c5'),
]

# Parallel lists, in matching order, as the rest of the notebook expects.
smiles = [pair[1] for pair in _compounds]
labels = [pair[0] for pair in _compounds]

def predict(model, smiles, labels):
    """Print the predicted class and class probabilities for each molecule.

    Every SMILES string is parsed with RDKit, converted to a radius-2 Morgan
    bit-vector fingerprint and passed to ``model.predict`` and
    ``model.predict_proba``. One line of the form
    ``"<label> Activity: <class> <probabilities>"`` is printed per molecule.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict`` and ``predict_proba``.
    smiles : iterable of str
        SMILES strings of the molecules to score.
    labels : iterable
        Display names, one per SMILES string, in the same order.

    Raises
    ------
    ValueError
        If ``smiles`` and ``labels`` differ in length, or if any SMILES
        string cannot be parsed.
    """
    smiles = list(smiles)
    labels = list(labels)
    # zip() below would silently drop the surplus entries of the longer input.
    if len(smiles) != len(labels):
        raise ValueError(
            f'smiles and labels must have the same length '
            f'(got {len(smiles)} and {len(labels)})'
        )

    X_pred = [Chem.MolFromSmiles(smi) for smi in smiles]
    # MolFromSmiles reports a parse failure by returning None; name the
    # offending compounds here instead of failing inside the fingerprint call.
    unparsed = [
        f'{label} ({smi})'
        for label, smi, mol in zip(labels, smiles, X_pred)
        if mol is None
    ]
    if unparsed:
        raise ValueError('Could not parse SMILES for: ' + ', '.join(unparsed))

    X_pred_ecfp = [AllChem.GetMorganFingerprintAsBitVect(mol, 2) for mol in X_pred]
    y_pred = model.predict(X_pred_ecfp)
    y_proba = model.predict_proba(X_pred_ecfp)
    for label, pred, proba in zip(labels, y_pred, y_proba):
        print(f'{label} Activity: {pred} {proba}')

# Random forest prediction: print the predicted class and class probabilities
# for each labelled query compound using the fitted rfc model.
predict(rfc, smiles, labels)

12o Activity: 1 [0.4659765 0.5340235]
ABA-1266 Activity: 0 [0.51738474 0.48261526]
ABN-2100 Activity: 0 [0.6298483 0.3701517]
AAL-9931 Activity: 1 [0.48490745 0.51509255]
AAR-10704 Activity: 0 [0.58546504 0.41453496]
ABAN-835 Activity: 0 [0.50291824 0.49708176]
