In [1]:
import pandas as pd
import os
import sys
sys.path.append("../utilities/")
from utility import FeatureGenerator

### Data Processing

In [2]:
# Read BBB_TrainingSet1.csv into a DataFrame and preview its first rows.
train_df = pd.read_csv(
    "../datasets/blood_brain_barrier_solubility/BBB_TrainingSet1.csv"
)
train_df.head(n=5)

Unnamed: 0,Name,Structure,Class
0,12dichlorotetrafluoroethane (cryofluorane),[F]C([F])([Cl])C([F])([F])[Cl],1
1,2methylthiophencyclidine (gacyclidine),C[C@@]1([H])CCCC[C@@]1(N2CCCCC2)C=3[S]C=CC=3,1
2,7amphetaminoethyltheophylline (fenetylline),C[C@@]([H])(C/C1=C/C=CC=C1)NCCN2C=NC=3N(C)C(=O...,1
3,acaprazine,CC(=O)NCCCN1CCN(CC1)C2=CC([Cl])=C/C=C2/[Cl],1
4,acebrochol,CC(C)([H])CCC[C@@](C)([H])[C@@]3([H])CC[C@@]4(...,1


In [3]:
# Read BBB_TestSet1.csv into a DataFrame and preview its first rows.
test_df = pd.read_csv(
    "../datasets/blood_brain_barrier_solubility/BBB_TestSet1.csv"
)
test_df.head(n=5)

Unnamed: 0,Name,Structure,Class
0,acepromazine,CN(C)CCCN2C1=CC=CC=C1[S]C3=CC=C(C=C23)C(C)=O,1
1,aceprometazine,C[C@@]([H])(CN2C1=CC=CC=C1[S]C3=CC=C(C=C23)C(C...,1
2,acetophenazine,CC(=O)C4=CC=C3[S]C1=CC=CC=C1N(CCCN2CCN(CCO)CC2...,1
3,adinazolam,CN(C)C/C3=N/N=C4C/N=C(/C1=C/C=CC=C1)C2=CC([Cl]...,1
4,albutoin,CC(C)([H])C[C@@]1([H])NC(=[S])N(CC=C)C1=O,1


In [4]:
# Read BBB_ExternalSet.csv into a DataFrame and preview its first rows.
external_df = pd.read_csv(
    "../datasets/blood_brain_barrier_solubility/BBB_ExternalSet.csv"
)
external_df.head(n=5)

Unnamed: 0,Name,Structure,Class
0,2,NC(N)=NC=1[S]/C=C(/C)N=1,1
1,14,O=N(=O)C=1C=CNC=1NCC[S]CC=2C=CC=CN=2,1
2,15,CC=1C(C)=CC(=NC=1C)C(C)(C)[S]C(C)(C)CNC=2NC=CC...,1
3,16,NC(N)=N/C1=N/C(=C[S]1)C2=CC=CC=C2,1
4,24,O=C(C)NCCCOC=2C=C/C=C(/CN1CCCCC1)C=2,1


In [5]:
# Row counts of the three splits, in order: train, test, external.
tuple(map(len, (train_df, test_df, external_df)))

(1093, 500, 246)

In [6]:
def get_features(smiles):
    """Compute the TPATF feature vector for one molecule.

    Parameters
    ----------
    smiles
        Structure string handed straight to ``FeatureGenerator`` (the
        notebook passes values of the ``Structure`` column here).

    Returns
    -------
    object or None
        Whatever ``FeatureGenerator(smiles).toTPATF()`` returns, or ``None``
        when featurization raises, so callers can detect and drop bad rows.
    """
    try:
        return FeatureGenerator(smiles).toTPATF()
    except Exception:
        # Best-effort: a structure that cannot be featurized maps to None.
        # Catching Exception rather than using a bare ``except`` lets
        # KeyboardInterrupt and SystemExit propagate, so a long ``.apply()``
        # run over a whole DataFrame can still be interrupted.
        return None

In [7]:
# Featurize each training structure; get_features yields None on failure.
train_df["tpatf"] = train_df["Structure"].apply(get_features)

In [8]:
# Featurize each test structure; get_features yields None on failure.
test_df["tpatf"] = test_df["Structure"].apply(get_features)

In [9]:
# Featurize each external-set structure; get_features yields None on failure.
external_df["tpatf"] = external_df["Structure"].apply(get_features)

RDKit ERROR: [15:36:18] Explicit valence for atom # 1 C, 5, is greater than permitted
[15:36:18] Explicit valence for atom # 1 C, 5, is greater than permitted


### Model training

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.metrics import classification_report

In [11]:
def _features_and_labels(df):
    """Build the (X, y) arrays for one split, skipping rows without features.

    ``get_features`` stores ``None`` in ``tpatf`` for structures it could not
    featurize. Passing such a row to ``np.array(..., dtype=np.float32)`` would
    raise, so those rows are filtered out of both the features and the labels,
    which keeps X and y aligned.
    """
    usable = df[df.tpatf.notna()]
    x = np.array(list(usable.tpatf.values), dtype=np.float32)
    return x, usable.Class.values


train_x, train_y = _features_and_labels(train_df)
test_x, test_y = _features_and_labels(test_df)
print(train_x.shape, train_y.shape)
print(test_x.shape, test_y.shape)

(1093, 2692) (1093,)
(500, 2692) (500,)


In [12]:
# Tune only the number of trees (100..1000 in steps of 100) with 5-fold CV.
# NOTE(review): no random_state is set on the forest, so the selected
# n_estimators and the reported scores can change between runs.
clf = RandomForestClassifier(class_weight="balanced")
param_grid = {"n_estimators": list(range(100, 1001, 100))}
grid_clf = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=True,
)
grid_clf.fit(train_x, train_y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


GridSearchCV(cv=5, estimator=RandomForestClassifier(class_weight='balanced'),
             n_jobs=-1,
             param_grid={'n_estimators': [100, 200, 300, 400, 500, 600, 700,
                                          800, 900, 1000]},
             verbose=True)

In [13]:
# Keep the estimator that GridSearchCV selected as best over the parameter grid.
model = grid_clf.best_estimator_
# Bare last expression so the notebook displays the chosen hyper-parameters.
model

RandomForestClassifier(class_weight='balanced', n_estimators=600)

Training performance

In [14]:
# Per-class metrics on the training data itself (an optimistic estimate,
# since the model has already seen these rows).
y_pred = model.predict(train_x)
print(classification_report(train_y, y_pred))

              precision    recall  f1-score   support

          -1       1.00      1.00      1.00       261
           1       1.00      1.00      1.00       832

    accuracy                           1.00      1093
   macro avg       1.00      1.00      1.00      1093
weighted avg       1.00      1.00      1.00      1093



Test performance

In [15]:
# Per-class metrics on the held-out test split.
y_pred = model.predict(test_x)
print(classification_report(test_y, y_pred))

              precision    recall  f1-score   support

          -1       0.95      0.78      0.85        49
           1       0.98      1.00      0.99       451

    accuracy                           0.97       500
   macro avg       0.96      0.89      0.92       500
weighted avg       0.97      0.97      0.97       500

