# ExtraTrees

In [1]:
import os.path
import json
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from scipy.stats import pearsonr
from hyperopt import fmin, tpe, hp, Trials, space_eval
from copy import deepcopy
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")
from sklearnex import patch_sklearn
patch_sklearn()
current_dir = os.getcwd()


def return_scores(y_true, y_pred):
    y_true = np.array(y_true).ravel()
    y_pred = np.array(y_pred).ravel()
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    pcc = pearsonr(y_true, y_pred)[0]
    return rmse, mae, r2, pcc

def return_x_y(df_filtered):
    y = df_filtered[label_name].values
    mask = ~np.isnan(y)

    # factors
    auxiliary_data = []
    if use_t_ph_embedding:
        ph = df_filtered['ph'].values.reshape(-1, 1)
        t = df_filtered['t'].values.reshape(-1, 1)
        auxiliary_data.append(ph)
        auxiliary_data.append(t)

    if use_mw_logp:
        mw = df_filtered['mw'].values.reshape(-1, 1)
        logp = df_filtered['logp'].values.reshape(-1, 1)
        auxiliary_data.append(mw)
        auxiliary_data.append(logp)

    protein_data = np.array(df_filtered[protein_column].tolist())
    substrate_data = np.array(df_filtered[substrate_column].tolist())
    x = np.hstack([protein_data, substrate_data] + auxiliary_data)

    return x[mask], y[mask]

def search_params(params):
    print(params)
    temp_params = deepcopy(params)
    temp_params["n_estimators"] = int(temp_params["n_estimators"])
    temp_params["max_depth"] = int(temp_params["max_depth"])
    temp_params["min_samples_split"] = int(temp_params["min_samples_split"])
    temp_params["min_samples_leaf"] = int(temp_params["min_samples_leaf"])

    val_scores_list = []
    for train_index, val_index in kf.split(df_train_val):
        df_train = df_train_val.iloc[train_index]
        df_val = df_train_val.iloc[val_index]

        train_x, train_y = return_x_y(df_train)
        val_x, val_y = return_x_y(df_val)

        model = ExtraTreesRegressor(**temp_params, random_state=random_state, n_jobs=-1)
        model.fit(train_x, train_y)

        val_predicted = model.predict(val_x)
        val_scores = return_scores(val_y, val_predicted)
        val_scores_list.append(val_scores)

    val_scores_mean = np.mean(val_scores_list, axis=0)
    print(f"[Val_mean] rmse {val_scores_mean[0]:.3f} mae {val_scores_mean[1]:.3f} r2 {val_scores_mean[2]:.3f} pcc {val_scores_mean[3]:.3f}")

    return val_scores_mean[0]

def search_best_param(max_evals):
    space = {
        "n_estimators": hp.randint("n_estimators", 10, 300),
        "max_depth": hp.randint("max_depth", 5, 20),
        "min_samples_split": hp.randint("min_samples_split", 2, 20),
        "min_samples_leaf": hp.randint("min_samples_leaf", 1, 20),
        "max_features": hp.uniform("max_features", 0.5, 1.0),
    }

    trials = Trials()
    print(f'[Info] Starting parameter search...')
    best_params = fmin(fn=search_params, space=space, algo=tpe.suggest, max_evals=max_evals, trials=trials)
    best_params["n_estimators"] = int(best_params["n_estimators"])
    best_params["max_depth"] = int(best_params["max_depth"])
    best_params["min_samples_split"] = int(best_params["min_samples_split"])
    best_params["min_samples_leaf"] = int(best_params["min_samples_leaf"])

    best_params = space_eval(space, best_params)

    # to json
    with open(params_json_path, 'w') as json_file:
        json.dump(best_params, json_file)

    return best_params


# main
protein_column,  substrate_column = 'prott5', 'molebert'
label_name = 'logkcatkm'
random_state = 66
search_max_evals = 60

df_input = pd.read_pickle(f'{current_dir}/../../data_process/dataset/df_all_log_transformed.pkl')
df_train_val, df_test = train_test_split(df_input, test_size=0.2, random_state=random_state)
kf = KFold(n_splits=5, shuffle=True, random_state=random_state)

# save results
results = []
cv_results = []
input_model = 'eta'

use_t_ph_embedding = True
use_mw_logp = True

val_scores_list = []
test_scores_list = []

# search best params
params_json_path = f'{current_dir}/model_dict/{input_model}_params.json'
if os.path.exists(params_json_path):
    with open(params_json_path) as json_file:
        params = json.load(json_file)
else:
    params = search_best_param(search_max_evals)

# train
_params = deepcopy(params)
_params["n_estimators"] = int(_params["n_estimators"])
_params["max_depth"] = int(_params["max_depth"])
_params["min_samples_split"] = int(_params["min_samples_split"])
_params["min_samples_leaf"] = int(_params["min_samples_leaf"])

for fold, (train_index, val_index) in enumerate(kf.split(df_train_val), start=1):
    print(f"Fold: {fold}/5")
    df_train = df_train_val.iloc[train_index]
    df_val = df_train_val.iloc[val_index]

    train_x, train_y = return_x_y(df_train)
    val_x, val_y = return_x_y(df_val)
    test_x, test_y = return_x_y(df_test)

    # train
    model = ExtraTreesRegressor(**_params, random_state=random_state, n_jobs=-1)
    model.fit(train_x, train_y)

    # val
    val_predicted = model.predict(val_x)
    val_scores = return_scores(val_y, val_predicted)
    val_scores_list.append(val_scores)

    # test
    test_predicted = model.predict(test_x)
    test_scores = return_scores(test_y, test_predicted)
    test_scores_list.append(test_scores)

    # fold
    cv_results.append([
        fold,
        val_scores[0], val_scores[1], val_scores[2], val_scores[3],
        test_scores[0], test_scores[1], test_scores[2], test_scores[3]
    ])

# mean
val_scores_mean = np.mean(val_scores_list, axis=0)
test_scores_mean = np.mean(test_scores_list, axis=0)
print(f"Dimension of x: {train_x.shape[1]}")
print(f"[Val] rmse {val_scores_mean[0]:.4f} mae {val_scores_mean[1]:.4f} r2 {val_scores_mean[2]:.4f} pcc {val_scores_mean[3]:.4f} "
      f"[Test] rmse {test_scores_mean[0]:.4f} mae {test_scores_mean[1]:.4f} r2 {test_scores_mean[2]:.4f} pcc {test_scores_mean[3]:.4f}\n")

# save cvs
df_cv_results = pd.DataFrame(cv_results, columns=[
    "Fold",
    "Val_RMSE", "Val_MAE", "Val_R2", "Val_PCC",
    "Test_RMSE", "Test_MAE", "Test_R2", "Test_PCC"])
df_cv_results.to_excel(f"{current_dir}/results/{input_model}_cv_results.xlsx", index=False)
print("Results saved")

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


[Info] Starting parameter search...
{'max_depth': 15, 'max_features': 0.6752909947391166, 'min_samples_leaf': 8, 'min_samples_split': 18, 'n_estimators': 98}
[Val_mean] rmse 2.909 mae 2.197 r2 0.510 pcc 0.726    
{'max_depth': 13, 'max_features': 0.9588932649357096, 'min_samples_leaf': 16, 'min_samples_split': 7, 'n_estimators': 106}
[Val_mean] rmse 3.066 mae 2.342 r2 0.456 pcc 0.697                             
{'max_depth': 14, 'max_features': 0.6210520146355192, 'min_samples_leaf': 17, 'min_samples_split': 4, 'n_estimators': 200}
[Val_mean] rmse 3.055 mae 2.327 r2 0.459 pcc 0.699                             
{'max_depth': 18, 'max_features': 0.7233029402755917, 'min_samples_leaf': 11, 'min_samples_split': 17, 'n_estimators': 55}
[Val_mean] rmse 2.892 mae 2.166 r2 0.515 pcc 0.726                             
{'max_depth': 13, 'max_features': 0.7620667943551569, 'min_samples_leaf': 12, 'min_samples_split': 17, 'n_estimators': 106}
[Val_mean] rmse 3.033 mae 2.316 r2 0.467 pcc 0.705    