In [1]:
import rdkit
import math
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from lightgbm import LGBMRegressor

from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor


In [2]:
from xgboost import XGBRegressor

In [3]:
df=pd.read_csv('HERG_Concatenate.csv')
df.head()

Unnamed: 0,MOLECULE,OUTCOME,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,BrC1=CC(COCC2(C3=CC=CC=C3)CCNCC2)=CN=C1,4.823909,4.176091,15000.0,0
1,BrC1=CC2=C(N=C1)N=C(N1CCN3CCC1CC3)O2,5.638272,3.361728,2300.0,1
2,C/C=C/C1=NC(COCC2(C3=CC=C(F)C=C3)CCN(C)CC2)=CC...,4.886057,4.113943,12999.99999,0
3,C=CC(=O)N1CCC[C@@H](N2N=C(C3=CC(C(=O)NCC4=CC=C...,5.744727,3.255273,1800.0,1
4,C=CC(=O)N1CCC[C@@H](N2N=C(C3=CC(C(=O)NC4=CC(C)...,6.522879,2.477121,300.0,1


In [4]:
smiles=df['MOLECULE']


In [5]:
def ECFP4(smiles):
    target=[]
    for smile in smiles:
        c=Chem.MolFromSmiles(smile)
        fp=AllChem.GetMorganFingerprintAsBitVect(c, 2, 1024)
        fp=fp.ToBitString()
        target.append(fp)
#     print(len(target))
    df7=pd.DataFrame()
    df7['target']=target
    for i in range(len(target[0])):
        df7['B'+str(i)]=df7['target'].str[i]
    df7=df7.drop('target', axis=1)
    for cols in df7.columns:
        df7[cols]=np.int64(df7[cols])
    return df7

In [6]:
desc=ECFP4(smiles)
len(desc)

  if sys.path[0] == '':


3663

In [7]:
X=np.asarray(desc)
y=df['OUTCOME']

In [8]:
X1=pd.DataFrame(X)
X1['outcome']=y
X1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1015,1016,1017,1018,1019,1020,1021,1022,1023,outcome
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4.823909
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,5.638272
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4.886057
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,5.744727
4,0,1,0,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,6.522879


In [9]:
X1['Smiles']=smiles
X1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1016,1017,1018,1019,1020,1021,1022,1023,outcome,Smiles
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,4.823909,BrC1=CC(COCC2(C3=CC=CC=C3)CCNCC2)=CN=C1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,5.638272,BrC1=CC2=C(N=C1)N=C(N1CCN3CCC1CC3)O2
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4.886057,C/C=C/C1=NC(COCC2(C3=CC=C(F)C=C3)CCN(C)CC2)=CC...
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,5.744727,C=CC(=O)N1CCC[C@@H](N2N=C(C3=CC(C(=O)NCC4=CC=C...
4,0,1,0,0,1,0,0,1,0,0,...,0,0,0,1,0,0,0,0,6.522879,C=CC(=O)N1CCC[C@@H](N2N=C(C3=CC(C(=O)NC4=CC(C)...


In [10]:
len(X1)

3663

In [11]:
X1=X1.drop_duplicates(subset='Smiles', keep=False)


In [12]:
len(X1)

3399

In [13]:
y1=np.asarray(X1['outcome'])
X1=np.asarray(X1.drop(['outcome', 'Smiles'], axis=1))

In [14]:
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.20, random_state=101)

In [15]:
models=[LGBMRegressor, XGBRegressor, RandomForestRegressor, ExtraTreesRegressor]
for model in models:
    p=model()
    print("For model ", p)
    p.fit(X1_train, y1_train)
    pred=p.predict(X1_test)
    r2=r2_score(y1_test, pred)
    print("R2 score is ", r2)
    rmse=math.sqrt(mean_squared_error(y1_test, pred))
    print("RMSE is ", rmse)

For model  LGBMRegressor()
R2 score is  0.7699667062541558
RMSE is  0.45835613163400685
For model  XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, gamma=None,
             gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
             max_leaves=None, min_child_weight=None, missing=nan,
             monotone_constraints=None, n_estimators=100, n_jobs=None,
             num_parallel_tree=None, predictor=None, random_state=None,
             reg_alpha=None, reg_lambda=None, ...)
R2 score is  0.8133965402105763
RMSE is  0.41282647601141975
For model  RandomForestRegressor()
R2 score is  0.816639577222497
RMSE is  0.40922343578289777
For model  Extr

In [16]:
from sklearn.model_selection import KFold

In [17]:
cv=KFold(n_splits=10, shuffle= True)

In [18]:
rfr=RandomForestRegressor()

In [20]:
avg=0
err=0
count=0
for train_index, test_index in cv.split(X1):
    X_train, X_test = X1[train_index], X1[test_index]
    y_train, y_test = y1[train_index], y1[test_index]
    rfr.fit(X_train, y_train)
    pred=rfr.predict(X_test)
    r2=r2_score(y_test, pred)
    print("For counter ", count, "----")
    print(r2)
    rmse=math.sqrt(mean_squared_error(y_test, pred))
    print(rmse)
    avg=avg+r2
    count=count+1
    err=err+rmse
#     print(err)
print("Final r2 score is ", (avg/10))
print("Final error is ", (err/10))

For counter  0 ----
0.8969007534465563
0.29792137306306954
For counter  1 ----
0.9010084231878064
0.2660856166151126
For counter  2 ----
0.92104504571651
0.2869977485160898
For counter  3 ----
0.831614683436367
0.39994694997559666
For counter  4 ----
0.8947542063906694
0.29537009717104623
For counter  5 ----
0.8828807033839263
0.308848251657868
For counter  6 ----
0.8394726873025461
0.35980876124147815
For counter  7 ----
0.8854018633758077
0.3072175699657589
For counter  8 ----
0.8541202663998457
0.3507729813544136
For counter  9 ----
0.8673808803128951
0.3373868876838765
Final r2 score is  0.8774579512952931
Final error is  0.321035623724431


In [21]:
y_test

array([4.82390874, 5.15490196, 4.88941029, 4.82390874, 5.84163751,
       5.20760831, 5.40000019, 4.52287874, 5.40893539, 5.62893214,
       5.49999968, 6.39794001, 5.48148606, 5.63827216, 5.6777807 ,
       7.29998894, 5.7447275 , 6.19999817, 4.87942607, 5.6777807 ,
       5.13076828, 4.61978876, 4.76955108, 6.39794001, 5.25963731,
       4.76955108, 4.60205999, 4.82390874, 4.79588002, 6.33724217,
       3.85387196, 5.76955108, 4.56863624, 5.89999842, 5.89999842,
       4.88605665, 5.20000024, 5.58502665, 5.58972904, 4.88605665,
       5.50863831, 4.63827216, 4.76955108, 4.39794001, 5.1079054 ,
       5.92445304, 4.98296666, 4.82973829, 8.85387196, 4.55909092,
       8.95860732, 4.7447275 , 4.85387196, 5.19382003, 3.75945075,
       5.6777807 , 6.49485002, 4.49620932, 5.69745263, 5.        ,
       5.76955108, 5.12199553, 4.39685563, 8.07058107, 4.65757732,
       6.59999765, 7.85667287, 5.40000019, 5.31875876, 5.20760831,
       4.61385789, 6.63078414, 4.41907502, 4.79588002, 3.76447

In [22]:
#Testing available drugs for toxicity
test_smiles=['C1=NC2=C(N1)C(=S)N=CN2', 'CC(C)C(=O)NC1=CC(=C(C=C1)[N+](=O)[O-])C(F)(F)F', 'CCC(=O)N(C1CCN(CC1)CCC2=CC=CC=C2)C3=CC=CC=C3', 'C(CCl)NC(=O)N(CCCl)N=O', 'CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C(=O)N[C@@H](CCC(=O)O)C(=O)O', 'C1=CC=C(C(=C1)C(C2=CC=C(C=C2)Cl)C(Cl)Cl)Cl', 'C1=C(C(=O)NC(=O)N1)F', 'COCCOC1=C(C=C2C(=C1)C(=NC=N2)NC3=CC=CC(=C3)C#C)OCCOC']
len(test_smiles)

8

In [23]:
test_data=np.asarray(ECFP4(test_smiles))
len(test_data)

  if sys.path[0] == '':


8

In [24]:
pred1=rfr.predict(test_data)

In [25]:
pred1

array([5.38816307, 4.88355121, 5.37310708, 5.16081328, 5.01243835,
       5.48097653, 5.31095803, 6.17485557])