In [1]:
import pandas as pd
from random import seed
seed(52)

df_test = pd.read_csv('bbbp_test.csv')
df_test = df_test.drop(columns='selfies', axis=1)
df_test.head()

Unnamed: 0,smiles,target
0,CCOC(=O)[C@H](CCc1ccccc1)N[C@@H](C)C(=O)N1CCC[...,0
1,O=C1Nc2ccc(Cl)cc2[C@@](C#CC2CC2)(C(F)(F)F)O1,0
2,CN1C[C@H](C(=O)N[C@]2(C)O[C@@]3(O)[C@@H]4CCCN4...,0
3,CN1CCC[C@H]1c1cccnc1,1
4,CN1CC=C(c2ccccc2)CC1,1


In [2]:
df_train = pd.read_csv('bbbp_train.csv')
df_train = df_train.drop(columns='selfies', axis=1)
df_train.head()

Unnamed: 0,smiles,target
0,CC(C)(C)OC(=O)CCCc1ccc(N(CCCl)CCCl)cc1,1
1,C[C@H](N)Cc1ccccc1,1
2,CS(=O)(=O)c1ccc([C@@H](O)[C@@H](CO)NC(=O)C(Cl)...,1
3,Cc1cccc(C)c1OCC(C)N,1
4,CCc1ccccc1,1


In [3]:
from rdkit.Chem import QED, rdMolDescriptors, MolToSmiles, MolFromSmiles, Draw, \
                       Descriptors, Lipinski, Crippen, GraphDescriptors
from rdkit.Chem.FilterCatalog import FilterCatalog, FilterCatalogParams
from rdkit.Contrib.SA_Score import sascorer
from rdkit import rdBase
import numpy as np
import warnings

warnings.filterwarnings("ignore")
rdBase.DisableLog('rdApp.error')

def smile(df, func):
    return df['smiles'].apply(lambda x: func(MolFromSmiles(x)))

df_train['exactmw'] = smile(df_train, Descriptors.ExactMolWt)
df_train['FractionCSP3'] = smile(df_train, Lipinski.FractionCSP3)
df_train['NumRings'] = smile(df_train, rdMolDescriptors.CalcNumRings)
df_train['NumAromaticRings'] = smile(df_train, Lipinski.NumAromaticRings)
df_train['NumAliphaticRings'] = smile(df_train, Lipinski.NumAliphaticRings)
df_train['NumHeterocycles'] = smile(df_train, rdMolDescriptors.CalcNumHeterocycles)
df_train['NumAromaticHeterocycles'] = smile(df_train, rdMolDescriptors.CalcNumAromaticHeterocycles)
df_train['NumSaturatedHeterocycles'] = smile(df_train, rdMolDescriptors.CalcNumSaturatedHeterocycles)
df_train['NumAliphaticHeterocycles'] = smile(df_train, rdMolDescriptors.CalcNumAliphaticHeterocycles)
df_train['NumSpiroAtoms'] = smile(df_train, rdMolDescriptors.CalcNumSpiroAtoms)
df_train['NumBridgeheadAtoms'] = smile(df_train, rdMolDescriptors.CalcNumBridgeheadAtoms)
df_train['NumAtomStereoCenters'] = smile(df_train, rdMolDescriptors.CalcNumAtomStereoCenters)
df_train['NumUnspecifiedAtomStereoCenters'] = smile(df_train, rdMolDescriptors.CalcNumUnspecifiedAtomStereoCenters)
df_train['CrippenClogP'] = smile(df_train, Crippen.MolLogP)
df_train['hallKierAlpha'] = smile(df_train, GraphDescriptors.HallKierAlpha)

df_train.head()



Unnamed: 0,smiles,target,exactmw,FractionCSP3,NumRings,NumAromaticRings,NumAliphaticRings,NumHeterocycles,NumAromaticHeterocycles,NumSaturatedHeterocycles,NumAliphaticHeterocycles,NumSpiroAtoms,NumBridgeheadAtoms,NumAtomStereoCenters,NumUnspecifiedAtomStereoCenters,CrippenClogP,hallKierAlpha
0,CC(C)(C)OC(=O)CCCc1ccc(N(CCCl)CCCl)cc1,1,359.141884,0.611111,1,1,0,0,0,0,0,0,0,0,0,4.635,-0.93
1,C[C@H](N)Cc1ccccc1,1,135.104799,0.333333,1,1,0,0,0,0,0,0,0,1,0,1.5763,-0.82
2,CS(=O)(=O)c1ccc([C@@H](O)[C@@H](CO)NC(=O)C(Cl)...,1,355.004799,0.416667,1,1,0,0,0,0,0,0,0,2,0,0.4043,-0.86
3,Cc1cccc(C)c1OCC(C)N,1,179.131014,0.454545,1,1,0,0,0,0,0,0,0,1,1,2.02944,-1.02
4,CCc1ccccc1,1,106.07825,0.25,1,1,0,0,0,0,0,0,0,0,0,2.249,-0.78


In [4]:
from rdkit.Chem import QED, rdMolDescriptors, MolToSmiles, MolFromSmiles, Draw, \
                       Descriptors, Lipinski, Crippen, GraphDescriptors
from rdkit.Chem.FilterCatalog import FilterCatalog, FilterCatalogParams
from rdkit.Contrib.SA_Score import sascorer
from rdkit import rdBase
import numpy as np
import warnings

warnings.filterwarnings("ignore")
rdBase.DisableLog('rdApp.error')

def smile(df, func):
    return df['smiles'].apply(lambda x: func(MolFromSmiles(x)))

df_test['exactmw'] = smile(df_test, Descriptors.ExactMolWt)
df_test['FractionCSP3'] = smile(df_test, Lipinski.FractionCSP3)
df_test['NumRings'] = smile(df_test, rdMolDescriptors.CalcNumRings)
df_test['NumAromaticRings'] = smile(df_test, Lipinski.NumAromaticRings)
df_test['NumAliphaticRings'] = smile(df_test, Lipinski.NumAliphaticRings)
df_test['NumHeterocycles'] = smile(df_test, rdMolDescriptors.CalcNumHeterocycles)
df_test['NumAromaticHeterocycles'] = smile(df_test, rdMolDescriptors.CalcNumAromaticHeterocycles)
df_test['NumSaturatedHeterocycles'] = smile(df_test, rdMolDescriptors.CalcNumSaturatedHeterocycles)
df_test['NumAliphaticHeterocycles'] = smile(df_test, rdMolDescriptors.CalcNumAliphaticHeterocycles)
df_test['NumSpiroAtoms'] = smile(df_test, rdMolDescriptors.CalcNumSpiroAtoms)
df_test['NumBridgeheadAtoms'] = smile(df_test, rdMolDescriptors.CalcNumBridgeheadAtoms)
df_test['NumAtomStereoCenters'] = smile(df_test, rdMolDescriptors.CalcNumAtomStereoCenters)
df_test['NumUnspecifiedAtomStereoCenters'] = smile(df_test, rdMolDescriptors.CalcNumUnspecifiedAtomStereoCenters)
df_test['CrippenClogP'] = smile(df_test, Crippen.MolLogP)
df_test['hallKierAlpha'] = smile(df_test, GraphDescriptors.HallKierAlpha)

df_test.head()



Unnamed: 0,smiles,target,exactmw,FractionCSP3,NumRings,NumAromaticRings,NumAliphaticRings,NumHeterocycles,NumAromaticHeterocycles,NumSaturatedHeterocycles,NumAliphaticHeterocycles,NumSpiroAtoms,NumBridgeheadAtoms,NumAtomStereoCenters,NumUnspecifiedAtomStereoCenters,CrippenClogP,hallKierAlpha
0,CCOC(=O)[C@H](CCc1ccccc1)N[C@@H](C)C(=O)N1CCC[...,0,376.199822,0.55,2,1,1,1,0,1,1,0,0,3,0,1.6046,-2.41
1,O=C1Nc2ccc(Cl)cc2[C@@](C#CC2CC2)(C(F)(F)F)O1,0,315.027391,0.357143,3,1,2,1,0,0,1,0,0,1,0,4.0731,-1.87
2,CN1C[C@H](C(=O)N[C@]2(C)O[C@@]3(O)[C@@H]4CCCN4...,0,583.279469,0.484848,8,3,5,5,1,4,4,0,0,7,0,2.0811,-3.73
3,CN1CCC[C@H]1c1cccnc1,1,162.115698,0.5,2,1,1,2,1,1,1,0,0,1,0,1.8483,-0.89
4,CN1CC=C(c2ccccc2)CC1,1,173.120449,0.333333,2,1,1,1,0,0,1,0,0,0,0,2.4055,-1.08


In [5]:
X_train = df_train.drop(columns=['smiles', 'target'])
Y_train = df_train['target']

X_test = df_test.drop(columns=['smiles', 'target'])
Y_test = df_test['target']

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

model_RFR = RandomForestClassifier(n_estimators=1000, max_depth=500, random_state=42)
model_RFR.fit(X_train, Y_train)

model_GBC = GradientBoostingClassifier(n_estimators=1000, max_depth=500, random_state=42)
model_GBC.fit(X_train, Y_train)

Y_predicted1 = model_RFR.predict(X_test)
Y_predicted2 = model_GBC.predict(X_test)

In [7]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

print("Random Forest Classifier")
print("Accuracy:", accuracy_score(Y_test, Y_predicted1))
print("Precision:", precision_score(Y_test, Y_predicted1))
print("Recall:", recall_score(Y_test, Y_predicted1))
print("F1-score:", f1_score(Y_test, Y_predicted1))
print("ROC-AUC:", roc_auc_score(Y_test, Y_predicted1))
print("Confusion matrix:\n", confusion_matrix(Y_test, Y_predicted1))

print("\nGradient Boosting Classifier")
print("Accuracy:", accuracy_score(Y_test, Y_predicted2))
print("Precision:", precision_score(Y_test, Y_predicted2))
print("Recall:", recall_score(Y_test, Y_predicted2))
print("F1-score:", f1_score(Y_test, Y_predicted2))
print("ROC-AUC:", roc_auc_score(Y_test, Y_predicted2))
print("Confusion matrix:\n", confusion_matrix(Y_test, Y_predicted2))

Random Forest Classifier
Accuracy: 0.5784313725490197
Precision: 0.5614035087719298
Recall: 0.897196261682243
F1-score: 0.6906474820143885
ROC-AUC: 0.5620001926967916
Confusion matrix:
 [[22 75]
 [11 96]]

Gradient Boosting Classifier
Accuracy: 0.5686274509803921
Precision: 0.5568862275449101
Recall: 0.8691588785046729
F1-score: 0.6788321167883211
ROC-AUC: 0.5531361402832643
Confusion matrix:
 [[23 74]
 [14 93]]


In [8]:
from joblib import dump

dump(model_RFR, 'RFR_model_bbbp.joblib')

['RFR_model_bbbp.joblib']