In [3]:
import pandas as pd
import sys
sys.path.append("../utilities/")
from utility import FeatureGenerator
import h5py
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.metrics import classification_report
from sklearn.externals import joblib
import glob
import os

  from ._conv import register_converters as _register_converters


In [4]:
def get_class(label):
    if label == "non-inhibitor":
        return 0
    else:
        return 1

In [5]:
# Extract features
def get_features(SMILES):
    try:
        feat_gen = FeatureGenerator(SMILES)
        return feat_gen.toTPATF()
    except:
        return float('NaN')
    

In [6]:
def doit(_id):
    print("Generating models for " + _id)
    data_file = _id + ".h5"
    model_file = _id + ".mdl"
    
    if not os.path.isfile(data_file):
        train_df = pd.read_excel("TrainSet_" + _id + ".xls")
        valid_df = pd.read_excel("ValidationSet_" + _id + ".xls")

        train_df['class'] = train_df.Labels.apply(get_class)
        valid_df['class'] = valid_df.Labels.apply(get_class)

        train_df['features'] = train_df.SMILES.apply(get_features)
        valid_df['features'] = valid_df.SMILES.apply(get_features)

        train_df = train_df.dropna()
        valid_df = valid_df.dropna()

        train_x, train_y = np.stack(train_df['features'].values).astype(np.float32), train_df['class'].values
        valid_x, valid_y = np.stack(valid_df['features'].values).astype(np.float32), valid_df['class'].values
        print(train_x.shape, train_y.shape)
        print(valid_x.shape, valid_y.shape)

        h5f = h5py.File(data_file, "w")
        h5f.create_dataset("train_x", data=train_x)
        h5f.create_dataset("train_y", data=train_y)
        h5f.create_dataset("valid_x", data=valid_x)
        h5f.create_dataset("valid_y", data=valid_y)
        h5f.close()
    else:
        print("Existing data file found. Not data is being generated")
    
    if not os.path.isfile(model_file):
        clf = RandomForestClassifier(class_weight="balanced")
        param_grid = {"n_estimators": [i for i in range(100, 1001, 100)]}
        grid_clf = GridSearchCV(estimator=clf, cv=5, param_grid=param_grid, verbose=True, n_jobs=-1)
        grid_clf.fit(train_x, train_y)

        model = grid_clf.best_estimator_
        print(model)
        
        print("REPORT ON " + _id)
        print("TRAINING PERFORMANCE")
        y_pred = model.predict(train_x)
        print(classification_report(y_true=train_y, y_pred=y_pred))
        
        print("TEST PERFORMANCE")
        y_pred = model.predict(valid_x)
        print(classification_report(y_true=valid_y, y_pred=y_pred))
    
        joblib.dump(model, model_file)
    else:
        print("Existing model foudn for " + _id + ". No model will be trained")

In [7]:
training_files = glob.glob("Train*")
training_files

['TrainSet_CYP2C9.xls',
 'TrainSet_CYP2C19.xls',
 'TrainSet_CYP2D6.xls',
 'TrainSet_CYP3A4.xls',
 'TrainSet_CYP1A2.xls']

In [8]:
_ids = [[i for i in os.path.splitext(f)][0].split('_')[1] for f in training_files]
print(_ids)

['CYP2C9', 'CYP2C19', 'CYP2D6', 'CYP3A4', 'CYP1A2']


In [9]:
for _id in _ids:
    doit(_id)

Generating models for CYP2C9
(12127, 2692) (12127,)
(2579, 2692) (2579,)
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  41 out of  50 | elapsed:  2.1min remaining:   27.6s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  2.3min finished


RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=300, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
REPORT ON CYP2C9
TRAINING PERFORMANCE
             precision    recall  f1-score   support

          0       1.00      0.99      1.00      7759
          1       0.99      1.00      0.99      4368

avg / total       0.99      0.99      0.99     12127

TEST PERFORMANCE
             precision    recall  f1-score   support

          0       0.89      0.93      0.91      1970
          1       0.75      0.64      0.69       609

avg / total       0.86      0.86      0.86      2579

Generating models for CYP2C19
Existing data file found. Not data is being generated
Exi

[Parallel(n_jobs=-1)]: Done  41 out of  50 | elapsed:  2.0min remaining:   26.2s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  2.2min finished


RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=1000, n_jobs=1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)
REPORT ON CYP2D6
TRAINING PERFORMANCE
             precision    recall  f1-score   support

          0       1.00      0.99      1.00      9362
          1       0.97      1.00      0.98      2516

avg / total       0.99      0.99      0.99     11878

TEST PERFORMANCE
             precision    recall  f1-score   support

          0       0.89      0.97      0.93      2316
          1       0.79      0.47      0.59       544

avg / total       0.87      0.88      0.86      2860

Generating models for CYP3A4
(11533, 2692) (11533,)
(7025, 2692) (7025,)
Fitting 5 fol

[Parallel(n_jobs=-1)]: Done  41 out of  50 | elapsed:  2.1min remaining:   27.6s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  2.5min finished


RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=1000, n_jobs=1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)
REPORT ON CYP3A4
TRAINING PERFORMANCE
             precision    recall  f1-score   support

          0       1.00      0.99      1.00      6898
          1       0.99      1.00      0.99      4635

avg / total       0.99      0.99      0.99     11533

TEST PERFORMANCE
             precision    recall  f1-score   support

          0       0.84      0.84      0.84      4955
          1       0.62      0.63      0.62      2070

avg / total       0.78      0.77      0.77      7025

Generating models for CYP1A2
Existing data file found. Not data is being generated
Exi