In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Dropout, Flatten

from sklearn.ensemble import RandomForestRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import ConstantKernel, RBF, WhiteKernel
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# ANSI escape sequences used to embolden text in printed progress output.
BOLD = '\033[1m'  # switch the terminal/notebook output to bold
END = '\033[0m'  # reset all text attributes

In [2]:
# Support functions

def train_test_split(X, y, test_idx=None, test_size=0.2):
    ''' Split input data X and labels y into training and testing arrays.

        Parameters
        ----------
        X : np.ndarray
            Input data, indexed by sample along the first axis.
        y : np.ndarray
            Labels, one entry per sample.
        test_idx : sequence of int, optional
            Sample indices to hold out for testing. When None, a new random
            selection of int(test_size * len(y)) samples is drawn without
            replacement from NumPy's global random state.
        test_size : float
            Fraction of the samples to hold out when test_idx is None.

        Returns
        -------
        X_train, y_train, X_test, y_test, test_idx
            Training rows keep their original order; test rows follow the
            order of test_idx. test_idx is returned as an integer array so
            the same split can be reused by the caller.
    '''
    n_samples = len(y)
    if test_idx is None:
        print('\n -- Performing NEW train-test split -- \n')
        test_idx = np.random.choice(n_samples, replace=False, size=int(test_size * n_samples))
    # Force an integer dtype so that an empty selection is still a valid index array.
    test_idx = np.asarray(test_idx, dtype=int)

    # A boolean mask replaces the per-sample `in` scan over the test indices and
    # also excludes negative (from-the-end) indices from the training set.
    is_train = np.ones(n_samples, dtype=bool)
    is_train[test_idx] = False
    train_idx = np.flatnonzero(is_train)

    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]
    return X_train, y_train, X_test, y_test, test_idx


def pred_vs_true(y_true, y_pred, plotTitle=None, plotColor='tab:blue', saveLoc=None,
                 xLabel='Test Labels', yLabel='Predictions'):
    ''' Score an array of predicted values against label values and optionally plot them.

        Parameters
        ----------
        y_true, y_pred : array-like
            Label values and the corresponding predictions.
        plotTitle : str, optional
            When given (non-empty), a predicted-vs-true scatter plot with a y = x
            reference line is drawn and shown; the metrics are appended to the title.
        plotColor : str
            Matplotlib color used for the scatter points.
        saveLoc : str, optional
            Path without extension; when given (and a plot is drawn), the figure
            is saved to f'{saveLoc}.png'.
        xLabel, yLabel : str
            Axis labels of the plot.

        Returns
        -------
        r2, mae, rmse
            Coefficient of determination (R^2), mean absolute error (MAE), and
            root mean squared error (RMSE).
    '''
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    # `mean_squared_error(..., squared=False)` was deprecated in scikit-learn 1.4 and
    # removed in 1.6. Taking the square root of the per-output MSE and averaging
    # reproduces the same value on every scikit-learn version.
    rmse = np.mean(np.sqrt(mean_squared_error(y_true, y_pred, multioutput='raw_values')))
    if plotTitle:
        plt.figure(figsize=(6,6))
        plt.title(f'{plotTitle}\nR^2: {round(r2,3)},   MAE: {round(mae,2)},   RMSE: {round(rmse,2)}', size=14)
        plt.xlabel(xLabel, size=20)
        plt.ylabel(yLabel, size=20)
        plt.scatter(y_true, y_pred, c=plotColor)
        # np.min/np.max reduce to scalars for column-shaped labels as well as 1-D ones.
        ymin, ymax = np.min(y_true), np.max(y_true)
        plt.plot([ymin, ymax], [ymin, ymax], c='k')  # y = x reference line
        plt.grid()
        if saveLoc: plt.savefig(f'{saveLoc}.png', facecolor='w', bbox_inches='tight')
        plt.show()
        plt.close()
    return r2, mae, rmse

    
def compare_metrics(r2_vals, mae_vals, rmse_vals, plotTitle=None, plotColor='tab:blue', saveLoc=None):
    ''' Plot MAE and RMSE values against their R^2 values on a single figure.

        MAE is drawn with white-filled diamonds and RMSE with filled circles, both
        in plotColor. When saveLoc is given the figure is written to
        f'{saveLoc}.png' before being shown and closed.
    '''
    plt.figure()
    plt.title(plotTitle, size=14)
    plt.xlabel('R^2', size=20)
    plt.ylabel('Error', size=20)

    # (error values, marker, extra line properties) for each plotted metric
    series = (
        (mae_vals, 'D', {'label': 'MAE', 'markerfacecolor': 'white'}),
        (rmse_vals, 'o', {'label': 'RMSE'}),
    )
    for errors, marker, extra in series:
        plt.plot(r2_vals, errors, marker, c=plotColor, **extra)

    plt.grid()
    plt.legend(bbox_to_anchor=(1,1), loc='upper left', shadow=True, fontsize=16)
    if saveLoc:
        plt.savefig(f'{saveLoc}.png', facecolor='w', bbox_inches='tight')
    plt.show()
    plt.close()

In [3]:
def Tao_MLP(X_train, y_train, epochs=100, verbose=0, out_dims=1):
    ''' Build and train a small fully-connected regressor.

        Architecture: two hidden Dense layers of 8 ReLU units followed by a
        linear Dense layer with out_dims units. Trained with the Adam optimizer
        on mean squared error, batch size 32, for `epochs` epochs.
        Returns the fitted Keras model.
    '''
    hidden_layers = [Dense(8, activation='relu') for _ in range(2)]
    network = Sequential(hidden_layers + [Dense(out_dims)])
    network.compile(optimizer='adam', loss='mean_squared_error')
    network.fit(X_train, y_train, epochs=epochs, batch_size=32, verbose=verbose)
    return network
print('Method Implemented:\tTao_MLP  \t(keras)')


def my_MLP(X_train, y_train, epochs=100, verbose=0):
    ''' Build and train a fully-connected regressor with three hidden layers.

        Architecture: three hidden Dense layers of 32 ReLU units followed by a
        single linear output unit. Trained with the Adam optimizer on mean
        squared error, batch size 32, for `epochs` epochs.
        Returns the fitted Keras model.
    '''
    network = Sequential()
    for _ in range(3):
        network.add(Dense(32, activation='relu'))
    network.add(Dense(1))
    network.compile(optimizer='adam', loss='mean_squared_error')
    network.fit(X_train, y_train, epochs=epochs, batch_size=32, verbose=verbose)
    return network
print('Method Implemented:\tmy_MLP  \t(keras)')

    
def Tao_CNN_1D(X_train, y_train, epochs=100, verbose=0):
    ''' Build and train a 1-D convolutional regressor.

        Architecture: one Conv1D layer (8 filters, kernel size 8, stride 1, no
        activation specified), Dropout(0.1), Flatten, and a single linear output
        unit. Trained with the Adam optimizer on mean squared error.

        A 2-D X_train of shape (samples, features) is given a trailing channel
        axis and cast to float before training, so data passed to the returned
        model's predict() must carry that trailing axis as well.

        Returns the fitted Keras model.
    '''
    if len(X_train.shape) == 2:
        X_train = np.expand_dims(X_train, axis=-1).astype(float)
    model = Sequential([
        Conv1D(filters=8, kernel_size=8, strides=1),
        Dropout(0.1),
        Flatten(),
        Dense(1)
    ])
    model.compile(loss='mean_squared_error', optimizer='adam')
    # Honor the caller's `epochs`; it was previously ignored in favor of a hard-coded 100.
    model.fit(X_train, y_train, epochs=epochs, verbose=verbose)
    return model
print('Method Implemented:\tTao_CNN_1D\t(keras)')


def Tao_RF(X_train, y_train):
    ''' Fit and return a random forest regressor (100 trees, max depth 10, min split size 2). '''
    forest_settings = dict(n_estimators=100, max_depth=10, min_samples_split=2)
    forest = RandomForestRegressor(**forest_settings)
    forest.fit(X_train, y_train)
    return forest
print('Method Implemented:\tTao_RF  \t(sklearn)')



def my_GPR(X_train, y_train, restarts=9):
    ''' Fit and return a Gaussian process regressor with a scaled RBF kernel.

        The RBF length scale starts at 1.0 and is bounded to [1e-2, 1e2];
        `restarts` is the number of optimizer restarts used when fitting the
        kernel hyperparameters.
    '''
    rbf = RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e2))
    regressor = GaussianProcessRegressor(kernel=1 * rbf, n_restarts_optimizer=restarts)
    regressor.fit(X_train, y_train)
    return regressor
print('Method Implemented:\tmy_GPR  \t(sklearn)')


def Tao_GPR(X_train, y_train, restarts=9):
    ''' Fit and return a Gaussian process regressor with a Constant * RBF + White kernel.

        Initial hyperparameters: constant value 1.0, RBF length scale 10, and
        white-noise level 0.1. `restarts` is the number of optimizer restarts
        used when fitting the kernel hyperparameters.
    '''
    # The noise term must be ADDED to the signal kernel. WhiteKernel is zero between
    # distinct points, so multiplying it in (as was done before) zeroes every
    # train/test cross-covariance and the model can only predict the prior mean.
    kernel = ConstantKernel(constant_value=1.0) * RBF(length_scale=10) + WhiteKernel(noise_level=0.1)
    model = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=restarts)
    model.fit(X_train, y_train)
    return model
print('Method Implemented:\tTao_GPR  \t(sklearn)')

None

Method Implemented:	Tao_MLP  	(keras)
Method Implemented:	my_MLP  	(keras)
Method Implemented:	Tao_CNN_1D	(keras)
Method Implemented:	Tao_RF  	(sklearn)


In [4]:
def multi_test(model, X, y, test_indices=[None]*10, nTrials=20, saveFile=None, **model_kwargs):
    ''' Repeatedly train and score a model over several train-test splits.

        Parameters
        ----------
        model : callable
            Training function called as model(X_train, y_train, **model_kwargs);
            it must return a fitted object exposing predict().
        X, y : np.ndarray
            Input data and labels.
        test_indices : list
            One entry per test (split). Each entry is either a sequence of
            held-out sample indices or None, in which case a new random split is
            drawn for that test. The default list is only read, never modified.
        nTrials : int
            Number of times the model is re-trained and scored on each split.
        saveFile : str, optional
            When given, the summary and the raw scores are written to this text
            file ('.txt' is appended if missing).
        **model_kwargs
            Extra keyword arguments forwarded to `model`.

        Returns
        -------
        r2_results, mae_results, rmse_results : list of lists
            Scores indexed as [test][trial].

        Raises
        ------
        ValueError
            If nTrials is smaller than 1 (no scores could be summarized).

        Two summaries are printed. "Average of Bests" takes the best trial of each
        split separately per metric (max R^2, min MAE, min RMSE) and reports the
        mean and standard deviation over splits. "Global Average" is the mean and
        standard deviation over every trial of every split.
    '''
    if nTrials < 1:
        raise ValueError('nTrials must be at least 1')

    def _report(label, values):
        # One "mean +/- std" summary line for a metric.
        return f'{label}:\t{np.mean(values)}   +/-   {np.std(values)}'

    r2_results, mae_results, rmse_results = [], [], []
    print(f'X.shape: {X.shape}')

    for test, test_idx in enumerate(test_indices):
        print(f'{BOLD}[Test #{test}]:{END}', end=' [')
        X_train, y_train, X_test, y_test, _ = train_test_split(X, y, test_idx)
        # Tao_CNN_1D adds a trailing channel axis to 2-D training data; the test
        # data needs the same axis to be accepted by the trained network.
        if model is Tao_CNN_1D and len(X_test.shape) == 2:
            X_test = np.expand_dims(X_test, axis=-1)
        r2_test, mae_test, rmse_test = [], [], []

        for _trial in range(nTrials):
            mod = model(X_train, y_train, **model_kwargs)
            y_pred = mod.predict(X_test)
            r2, mae, rmse = pred_vs_true(y_test, y_pred)
            r2_test.append(r2)
            mae_test.append(mae)
            rmse_test.append(rmse)
            print(f'{round(r2*100)}', end=',')

        r2_results.append(r2_test)
        mae_results.append(mae_test)
        rmse_results.append(rmse_test)
        print(f'] best R^2: {round(max(r2_test),3)}')

    print('done')

    # Best trial of each split, chosen independently for each metric.
    best_reports = [
        _report('R^2', np.max(r2_results, axis=-1)),
        _report('MAE', np.min(mae_results, axis=-1)),
        _report('RMSE', np.min(rmse_results, axis=-1)),
    ]
    # Every trial of every split pooled together.
    global_reports = [
        _report('R^2', r2_results),
        _report('MAE', mae_results),
        _report('RMSE', rmse_results),
    ]

    print('\nAverage of Bests (+/- St.Dev.):')
    for line in best_reports:
        print(line)

    print('\nGlobal Average (+/- St.Dev.):')
    for line in global_reports:
        print(line)

    if saveFile:
        if not saveFile.endswith('.txt'):
            saveFile += '.txt'
        # __name__ is robust for any named callable; fall back to the repr otherwise.
        model_name = getattr(model, '__name__', str(model))
        with open(saveFile, 'w') as file:
            file.write(f'Model:\t{model_name}\n')
            file.write(f'X.shape:\t{X.shape}\n')
            file.write(f'nTests:\t{len(test_indices)}\n')
            file.write(f'nTrials:\t{nTrials}\n')

            file.write(f'\nAverage of Bests (+/- St.Dev.):\n')
            for line in best_reports:
                file.write(f'{line}\n')

            file.write(f'\nGlobal Average (+/- St.Dev.):\n')
            for line in global_reports:
                file.write(f'{line}\n')

            file.write(f'\nR^2:\n{r2_results}\n')
            file.write(f'\nMAE:\n{mae_results}\n')
            file.write(f'\nRMSE:\n{rmse_results}\n')

    return r2_results, mae_results, rmse_results

print('\nFunction Defined:\tmulti_test')


Function Defined:	multi_test


In [5]:
def benchmark(model, smiles, X, y, test_idx, prefix='No info\n', nTrials=20, saveFile=None, **model_kwargs):
    ''' Train a model several times on one split and export the best model's predictions.

        Parameters
        ----------
        model : callable
            Training function called as model(X_train, y_train, **model_kwargs);
            it must return a fitted object exposing predict().
        smiles : sequence
            One identifier per sample, written to the first column of the export.
        X, y : np.ndarray
            Input data and labels for all samples.
        test_idx : sequence of int or None
            Held-out sample indices; None draws a new random split.
        prefix : str
            Text placed in the header of the target-value column.
        nTrials : int
            Number of times the model is re-trained on the split.
        saveFile : str, optional
            When given, a comma-separated table with one row per sample is
            written to this file ('.txt' is appended if missing).
        **model_kwargs
            Extra keyword arguments forwarded to `model`.

        Returns
        -------
        list of dict
            One {'r2': ..., 'model': ...} entry per trial, sorted by descending
            test R^2.

        Raises
        ------
        ValueError
            If nTrials is smaller than 1 (there would be no best model).

        NOTE(review): the "best" model is selected by its score on the test
        split, so the reported best R^2 is an optimistic estimate.
    '''
    if nTrials < 1:
        raise ValueError('nTrials must be at least 1')

    # opt, metric = best_metric # just do highest r^2 for now
    # Keep the indices actually used: when test_idx is None the split is drawn
    # inside train_test_split and is needed later for the "Used in Training" column.
    X_train, y_train, X_test, y_test, test_idx = train_test_split(X, y, test_idx)

    # Tao_CNN_1D adds a trailing channel axis to 2-D training data; anything passed
    # to predict() needs the same axis.
    X_all = X
    if model is Tao_CNN_1D and len(X.shape) == 2:
        X_test = np.expand_dims(X_test, axis=-1)
        X_all = np.expand_dims(X, axis=-1)

    results = [] # list of dicts

    print('Start Trials:',end=' ')
    for _trial in range(nTrials):
        mod = model(X_train, y_train, **model_kwargs)
        y_pred = mod.predict(X_test)
        r2, _, _ = pred_vs_true(y_test, y_pred)
        results.append({'r2':r2, 'model':mod})
        print(round(r2*100), end=' ')

    results.sort(key=lambda res: res['r2'], reverse=True)
    print(f'\nBest Test R^2: {results[0]["r2"]}')

    mod = results[0]['model']
    y_pred = mod.predict(X_all)

    if saveFile:
        if not saveFile.endswith('.txt'):
            saveFile += '.txt'
        # Set of non-negative positions for constant-time membership tests.
        held_out = {int(i) % len(y) for i in test_idx}
        with open(saveFile, 'w') as file:
            file.write(f'SMILES,Predicted Value,Used in Training,Target Value ({prefix})\n')
            for i, (sm, y_true_val, y_pred_val) in enumerate(zip(smiles, y, y_pred)):
                train_included = 0 if i in held_out else 1
                # .item() unwraps both scalars and size-1 arrays (e.g. Keras' (1,) rows)
                # without relying on the deprecated float(ndarray) conversion.
                yt = float(np.asarray(y_pred_val).item())
                file.write(f'{sm},{round(yt,5)},{train_included},{round(y_true_val,5)}\n')

    return results
print('Function Defined:\tbenchmark')


Function Defined:	benchmark
