# Training with ClusterCV

In [2]:
import sys

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from tqdm import tqdm

import clustercv

In [9]:
# Read data file
data = pd.read_csv("cmpd_activity_tcrd_5.4.1.csv", low_memory=False)
# Filter data
targets = clustercv.filter_data(data, act_type='IC50', min_samples=100)
# Model the first entry of `targets`
n = targets[0]

target_data = clustercv.get_target_data(data, n)

# Get features
features = clustercv.batchECFP(target_data.smiles)

# Cluster features and generate folds.
# `n_folds` is also read by `cross_val` to size its CV loop, so it is passed
# through here instead of repeating the literal: both stay in sync when tuned.
n_folds = 10
folds, max_score = clustercv.ClusterCV(features, n_folds=n_folds)

Number of clusters: 43


In [10]:
def cross_val(n_estimators=10, criterion='mse', max_depth=None, min_samples_split=2, min_samples_leaf=1,
              max_features='auto', bootstrap=True, n_jobs=None, random_state=None):
    """Cross-validate a RandomForestRegressor over the precomputed folds.

    All keyword arguments are forwarded unchanged to ``RandomForestRegressor``.

    Reads these notebook-level globals (they must be defined before calling):
    ``features`` (indexed as ``features[rows, :]``), ``folds`` (one fold id
    per sample, compared against ``0 .. n_folds - 1``), ``n_folds`` and
    ``target_data`` (its ``act_value`` column is the regression target).

    Prints the mean and two standard deviations of R2 and MSE across folds.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``(r2scores, scores)``: the per-fold R2 values and the per-fold mean
        squared errors, each of length ``n_folds``, so callers can compare
        settings programmatically instead of reading the printed output.

    NOTE(review): the defaults ``criterion='mse'`` and ``max_features='auto'``
    are spellings from older scikit-learn releases; confirm they are still
    accepted by the installed version.
    """
    # Settings for RF
    rf = RandomForestRegressor(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth,
                               min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
                               max_features=max_features, bootstrap=bootstrap, n_jobs=n_jobs,
                               random_state=random_state)
    # Store results
    scores = np.zeros(n_folds)
    r2scores = np.zeros(n_folds)
    # Loop-invariant: normalise the fold assignment to an array once so the
    # element-wise comparisons below behave the same for any array-like input.
    fold_ids = np.asarray(folds)
    # CV loop
    for k in range(n_folds):
        # Positional indices of the held-out fold and of everything else
        test_index = np.where(fold_ids == k)[0]
        train_index = np.where(fold_ids != k)[0]
        # Get train and test samples
        X_test = features[test_index, :]
        X_train = features[train_index, :]
        y_test = target_data.act_value.iloc[test_index]
        y_train = target_data.act_value.iloc[train_index]
        # Run Model (the same estimator object is refit on every fold)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_test)
        scores[k] = mean_squared_error(y_test, y_pred)
        r2scores[k] = r2_score(y_test, y_pred)

    print("R2: %0.2f (+/- %0.2f)" % (r2scores.mean(), r2scores.std() * 2))
    print("MSE: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    print()
    return r2scores, scores

In [14]:
# Candidate hyper-parameter values for the grid-search cells
n_estimators = [10, *range(100, 501, 100)]  # 10, then 100..500 in steps of 100
max_features = ["auto", "sqrt"]
max_depth = [2, 12, 23, 34, 45, 56, 66, 77, 88, 99, 110] + [None]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
criterion = ["mse", "mae"]

In [15]:
# Sweep the number of trees with everything else at its default.
# The loop variable is deliberately not called `n`: that name already holds
# the selected target from the data-loading cell and must not be overwritten.
for n_est in n_estimators:
    print("n_estimators =", n_est)
    cross_val(n_estimators=n_est, random_state=42)

n_estimators = 10
R2: -0.42 (+/- 1.83)
MSE: 0.31 (+/- 0.38)

n_estimators = 100
R2: -0.29 (+/- 1.43)
MSE: 0.29 (+/- 0.32)

n_estimators = 200
R2: -0.26 (+/- 1.36)
MSE: 0.28 (+/- 0.32)

n_estimators = 300
R2: -0.26 (+/- 1.40)
MSE: 0.28 (+/- 0.32)

n_estimators = 400
R2: -0.27 (+/- 1.39)
MSE: 0.28 (+/- 0.32)

n_estimators = 500
R2: -0.28 (+/- 1.39)
MSE: 0.29 (+/- 0.32)



In [17]:
# Sweep every (n_estimators, max_features) combination.
# Loop variables avoid the name `n`, which holds the selected target from the
# data-loading cell; reusing it here would silently overwrite that value.
for n_est in n_estimators:
    for max_feat in max_features:
        print("n_estimators =", n_est, "max_features =", max_feat)
        cross_val(n_estimators=n_est, max_features=max_feat, random_state=42)

n_estimators = 10 max_features = auto
R2: -0.42 (+/- 1.83)
MSE: 0.31 (+/- 0.38)

n_estimators = 10 max_features = sqrt
R2: -0.42 (+/- 1.11)
MSE: 0.32 (+/- 0.29)

n_estimators = 100 max_features = auto
R2: -0.29 (+/- 1.43)
MSE: 0.29 (+/- 0.32)

n_estimators = 100 max_features = sqrt
R2: -0.26 (+/- 1.01)
MSE: 0.29 (+/- 0.31)

n_estimators = 200 max_features = auto
R2: -0.26 (+/- 1.36)
MSE: 0.28 (+/- 0.32)

n_estimators = 200 max_features = sqrt
R2: -0.26 (+/- 1.09)
MSE: 0.28 (+/- 0.28)

n_estimators = 300 max_features = auto
R2: -0.26 (+/- 1.40)
MSE: 0.28 (+/- 0.32)

n_estimators = 300 max_features = sqrt
R2: -0.24 (+/- 1.04)
MSE: 0.28 (+/- 0.29)

n_estimators = 400 max_features = auto
R2: -0.27 (+/- 1.39)
MSE: 0.28 (+/- 0.32)

n_estimators = 400 max_features = sqrt
R2: -0.24 (+/- 1.05)
MSE: 0.28 (+/- 0.29)

n_estimators = 500 max_features = auto
R2: -0.28 (+/- 1.39)
MSE: 0.29 (+/- 0.32)

n_estimators = 500 max_features = sqrt
R2: -0.24 (+/- 1.05)
MSE: 0.28 (+/- 0.28)

