# Training with ClusterCV

In [4]:
import sys

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from tqdm import tqdm

import clustercv

In [10]:
# Per-target random-forest evaluation using folds produced by clustercv.ClusterCV.
# For every target returned by the filter step, features are computed from the
# target's SMILES column, fold assignments are generated, and one fit/predict
# cycle is run per fold. R2 and MSE are printed as the mean across folds with
# a spread of two standard deviations.

# Read data file
data = pd.read_csv("cmpd_activity_tcrd_5.4.1.csv", low_memory=False)
# Filter data
targets = clustercv.filter_data(data, act_type='IC50', min_samples=100)

# Number of cross-validation folds. This single value drives fold generation,
# the size of the score arrays, and the evaluation loop, so they cannot drift
# apart when it is changed.
n_folds = 10

for i in range(len(targets)):
    n = targets[i]
    print("Target:", n)
    target_data = clustercv.get_target_data(data, n)

    # Get features
    features = clustercv.batchECFP(target_data.smiles)

    # Cluster features and generate folds.
    # NOTE(review): `folds` is compared element-wise against a fold id below,
    # so it is presumably one fold label per sample -- confirm in clustercv.
    folds, max_score = clustercv.ClusterCV(features, n_folds=n_folds)

    # Train a random forest (fixed seed so repeated runs give the same forest)
    rf = RandomForestRegressor(n_estimators=500, random_state=42, n_jobs=3)
    scores = np.zeros(n_folds)
    r2scores = np.zeros(n_folds)
    # Not used in this cell; retained so the name stays defined for any
    # later cell that reads it.
    n_features = features.shape[1]
    for k in range(n_folds):
        # Samples labelled k form the test set; everything else is training
        test_index = np.where(folds == k)[0]
        train_index = np.where(folds != k)[0]
        # Get train and test samples
        X_test = features[test_index, :]
        X_train = features[train_index, :]
        # Positional indexing keeps labels aligned with the feature rows
        y_test = target_data.act_value.iloc[test_index]
        y_train = target_data.act_value.iloc[train_index]
        # Run model: refit on this fold's training split, score on held-out split
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_test)
        scores[k] = mean_squared_error(y_test, y_pred)
        r2scores[k] = r2_score(y_test, y_pred)
    # Report mean and two standard deviations across folds
    print("R2: %0.2f (+/- %0.2f)" % (
        r2scores.mean(), r2scores.std() * 2))
    print("MSE: %0.2f (+/- %0.2f)" % (
        scores.mean(), scores.std() * 2))
    print()

Target: 27
Number of clusters: 43
R2: -0.41 (+/- 1.79)
MSE: 0.30 (+/- 0.36)

Target: 36
Number of clusters: 45
R2: 0.01 (+/- 1.12)
MSE: 0.40 (+/- 0.52)

Target: 39
Number of clusters: 41
R2: -1.53 (+/- 4.63)
MSE: 0.52 (+/- 0.44)

Target: 42
Number of clusters: 120
R2: 0.47 (+/- 0.52)
MSE: 0.44 (+/- 0.64)

Target: 59
Number of clusters: 103
R2: -0.06 (+/- 0.76)
MSE: 0.28 (+/- 0.32)

Target: 61
Number of clusters: 154
R2: -0.02 (+/- 0.78)
MSE: 0.76 (+/- 0.80)

Target: 77
Number of clusters: 133
R2: -0.36 (+/- 1.34)
MSE: 0.33 (+/- 0.40)

Target: 103
Number of clusters: 242
R2: 0.24 (+/- 0.46)
MSE: 0.35 (+/- 0.22)

Target: 104
Number of clusters: 53
R2: -1.33 (+/- 8.16)
MSE: 0.44 (+/- 0.45)

Target: 139
Number of clusters: 66
R2: -0.09 (+/- 1.30)
MSE: 0.01 (+/- 0.01)

Target: 144
Number of clusters: 1011
R2: 0.40 (+/- 0.19)
MSE: 0.49 (+/- 0.24)

Target: 150
Number of clusters: 50
R2: -0.52 (+/- 1.58)
MSE: 0.91 (+/- 0.76)

Target: 153
Number of clusters: 392
R2: 0.27 (+/- 0.42)
MSE: 0.49 (+

KeyboardInterrupt: 