# SVR

In [2]:
import os.path
import json
import pandas as pd
# from sklearn.svm import SVR
from cuml.svm import SVR
from cuml.svm import LinearSVR
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from scipy.stats import pearsonr
from hyperopt import fmin, tpe, hp, Trials, space_eval
from copy import deepcopy
import warnings
warnings.filterwarnings("ignore")
from sklearnex import patch_sklearn
patch_sklearn()
current_dir = os.getcwd()


def return_scores(y_true, y_pred):
    """Score predictions against targets with four regression metrics.

    Both inputs are converted to NumPy arrays and flattened to 1-D before
    any metric is computed.

    Args:
        y_true: Ground-truth target values (any array-like).
        y_pred: Predicted values (any array-like of matching length).

    Returns:
        A tuple ``(rmse, mae, r2, pcc)``: root mean squared error, mean
        absolute error, coefficient of determination, and the Pearson
        correlation coefficient.
    """
    actual = np.array(y_true).reshape(-1)
    predicted = np.array(y_pred).reshape(-1)

    squared_error = mean_squared_error(actual, predicted)
    root_mse = np.sqrt(squared_error)
    abs_error = mean_absolute_error(actual, predicted)
    determination = r2_score(actual, predicted)
    # pearsonr returns (statistic, p-value); only the statistic is reported.
    correlation = pearsonr(actual, predicted)[0]

    return root_mse, abs_error, determination, correlation


def return_x_y(df_filtered):
    """Assemble the feature matrix and label vector from a dataframe slice.

    Reads the module-level settings ``label_name``, ``protein_column``,
    ``substrate_column``, ``use_t_ph_embedding`` and ``use_mw_logp``.

    The feature matrix is the horizontal concatenation of the protein
    column, the substrate column, and (when enabled) the single-column
    factors ``ph``, ``t``, ``mw`` and ``logp``, in that order. Rows whose
    label is NaN are dropped from both outputs.

    Args:
        df_filtered: DataFrame holding the label, embedding and factor columns.

    Returns:
        A tuple ``(x, y)`` of the filtered feature matrix and label array.
    """
    labels = df_filtered[label_name].values
    has_label = ~np.isnan(labels)

    # Names of the scalar factor columns to append after the embeddings.
    factor_names = []
    if use_t_ph_embedding:
        factor_names += ['ph', 't']
    if use_mw_logp:
        factor_names += ['mw', 'logp']
    factor_columns = [
        df_filtered[name].values.reshape(-1, 1) for name in factor_names
    ]

    # Each cell of an embedding column is a sequence; stack them row-wise.
    embedding_blocks = [
        np.array(df_filtered[column].tolist())
        for column in (protein_column, substrate_column)
    ]

    features = np.hstack(embedding_blocks + factor_columns)
    return features[has_label], labels[has_label]

def search_params(params):
    """Hyperopt objective: cross-validate one SVR configuration.

    Fits a fresh model on every fold produced by the module-level ``kf``
    over ``df_train_val``, scores it on the held-out part of the fold, and
    prints the fold-averaged metrics.

    Args:
        params: Dict with the keys ``"kernel"``, ``"C"`` and ``"epsilon"``.

    Returns:
        The mean validation RMSE across folds (the value to minimise).
    """
    print(params)
    candidate = deepcopy(params)
    kernel = candidate["kernel"]
    penalty = candidate["C"]
    tube_width = candidate["epsilon"]

    fold_metrics = []
    for fit_rows, holdout_rows in kf.split(df_train_val):
        fit_x, fit_y = return_x_y(df_train_val.iloc[fit_rows])
        holdout_x, holdout_y = return_x_y(df_train_val.iloc[holdout_rows])

        # Use LinearSVR when the kernel is linear.
        # NOTE(review): confirm that the installed SVR/LinearSVR constructors
        # accept `dual`; it is passed through unchanged here.
        if kernel == "linear":
            model = LinearSVR(C=penalty, epsilon=tube_width, dual=False)
        else:
            model = SVR(kernel=kernel, C=penalty, epsilon=tube_width, dual=False)

        model.fit(fit_x, fit_y)
        fold_metrics.append(return_scores(holdout_y, model.predict(holdout_x)))

    # Column-wise mean over folds, in the order rmse, mae, r2, pcc.
    rmse, mae, r2, pcc = np.mean(fold_metrics, axis=0)
    print(f"[Val_mean] rmse {rmse:.3f} mae {mae:.3f} r2 {r2:.3f} pcc {pcc:.3f}")

    return rmse

def search_best_param(max_evals):
    """Search SVR hyper-parameters with TPE and save the best set as JSON.

    The search space is ``C`` uniform in [0.1, 100], ``epsilon`` uniform in
    [0.01, 0.5], and ``kernel`` chosen from linear / poly / rbf / sigmoid.
    The objective minimised is the value returned by ``search_params``.
    The winning configuration is written to the module-level
    ``params_json_path``.

    Args:
        max_evals: Number of hyperopt trials to run.

    Returns:
        Dict of the best parameters with actual values (the ``kernel`` entry
        is the kernel name, not the index hyperopt reports for ``hp.choice``).
    """
    space = {
        "C": hp.uniform("C", 0.1, 100),
        "epsilon": hp.uniform("epsilon", 0.01, 0.5),
        "kernel": hp.choice("kernel", ["linear", "poly", "rbf", "sigmoid"]),
    }

    # Create the output directory before the (potentially hours-long) search,
    # so a missing or unwritable directory fails immediately instead of
    # discarding the finished search when the JSON is written.
    params_dir = os.path.dirname(params_json_path)
    if params_dir:
        os.makedirs(params_dir, exist_ok=True)

    trials = Trials()
    print('[Info] Starting parameter search...')
    best_params = fmin(fn=search_params, space=space, algo=tpe.suggest, max_evals=max_evals, trials=trials)
    # fmin returns indices for hp.choice entries; map them back to real values.
    best_params = space_eval(space, best_params)

    # Save the best params to JSON
    with open(params_json_path, 'w') as json_file:
        json.dump(best_params, json_file)

    return best_params


# Main
# Column names and run configuration; the helper functions read these as globals.
protein_column, substrate_column = 'prott5', 'molebert'
label_name = 'logkcatkm'
random_state = 66
search_max_evals = 60
n_splits = 5  # single source of truth for the fold count (KFold and log line)

# NOTE(review): read_pickle can execute code embedded in the file; only load
# datasets from a trusted source.
df_input = pd.read_pickle(f'{current_dir}/../../data_process/dataset/df_all_log_transformed.pkl')
df_train_val, df_test = train_test_split(df_input, test_size=0.2, random_state=random_state)
kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

# Save results
results = []
cv_results = []
input_model = 'svr'

use_t_ph_embedding = True
use_mw_logp = True

val_scores_list = []
test_scores_list = []

# Search best params (reuse a cached JSON from a previous run when present)
params_json_path = f'{current_dir}/model_dict/{input_model}_params.json'
if os.path.exists(params_json_path):
    with open(params_json_path) as json_file:
        params = json.load(json_file)
else:
    params = search_best_param(search_max_evals)

# Train
_params = deepcopy(params)

# The held-out test set is the same for every fold, so build it once.
test_x, test_y = return_x_y(df_test)

for fold, (train_index, val_index) in enumerate(kf.split(df_train_val), start=1):
    print(f"Fold: {fold}/{n_splits}")
    df_train = df_train_val.iloc[train_index]
    df_val = df_train_val.iloc[val_index]

    train_x, train_y = return_x_y(df_train)
    val_x, val_y = return_x_y(df_val)

    # Use LinearSVR when the kernel is linear
    if _params["kernel"] == "linear":
        model = LinearSVR(C=_params["C"], epsilon=_params["epsilon"], dual=False)
    else:
        model = SVR(kernel=_params["kernel"], C=_params["C"], epsilon=_params["epsilon"], dual=False)

    model.fit(train_x, train_y)

    val_predicted = model.predict(val_x)
    val_scores = return_scores(val_y, val_predicted)
    val_scores_list.append(val_scores)

    test_predicted = model.predict(test_x)
    test_scores = return_scores(test_y, test_predicted)
    test_scores_list.append(test_scores)

    # Per-fold row: fold number, then (rmse, mae, r2, pcc) for val and test
    cv_results.append([fold, *val_scores, *test_scores])

# Mean over folds
val_scores_mean = np.mean(val_scores_list, axis=0)
test_scores_mean = np.mean(test_scores_list, axis=0)
print(f"Dimension of x: {train_x.shape[1]}")
print(f"[Val] rmse {val_scores_mean[0]:.4f} mae {val_scores_mean[1]:.4f} r2 {val_scores_mean[2]:.4f} pcc {val_scores_mean[3]:.4f} "
      f"[Test] rmse {test_scores_mean[0]:.4f} mae {test_scores_mean[1]:.4f} r2 {test_scores_mean[2]:.4f} pcc {test_scores_mean[3]:.4f}\n")

# Save CV results to Excel
df_cv_results = pd.DataFrame(cv_results, columns=[
    "Fold",
    "Val_RMSE", "Val_MAE", "Val_R2", "Val_PCC",
    "Test_RMSE", "Test_MAE", "Test_R2", "Test_PCC"])
# Make sure the output directory exists so the trained-fold results are not
# lost to a FileNotFoundError at the very last step.
results_dir = f"{current_dir}/results"
os.makedirs(results_dir, exist_ok=True)
df_cv_results.to_excel(f"{results_dir}/{input_model}_cv_results.xlsx", index=False)
print("Results saved")

Intel(R) Extension for Scikit-learn* enabled (https://github.com/uxlfoundation/scikit-learn-intelex)


[Info] Starting parameter search...
  0%|          | 0/60 [00:00<?, ?trial/s, best loss=?]                                                      {'C': 83.07341272102286, 'epsilon': 0.01108793159160973, 'kernel': 'poly'}
  0%|          | 0/60 [00:00<?, ?trial/s, best loss=?]                                                      [Val_mean] rmse 4.880 mae 3.237 r2 -0.523 pcc 0.165
  0%|          | 0/60 [04:24<?, ?trial/s, best loss=?]  2%|▏         | 1/60 [04:24<4:19:50, 264.25s/trial, best loss: 4.8797476101383]                                                                                {'C': 7.862610555110293, 'epsilon': 0.41678879166080257, 'kernel': 'poly'}
  2%|▏         | 1/60 [04:24<4:19:50, 264.25s/trial, best loss: 4.8797476101383]                                                                                [Val_mean] rmse 4.477 mae 3.274 r2 -0.187 pcc 0.091
  2%|▏         | 1/60 [05:34<4:19:50, 264.25s/trial, best loss: 4.8797476101383]  3%|▎         | 2/60 [05: