# LightGBM

In [1]:
import os.path
import json
import pandas as pd
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from scipy.stats import pearsonr
from hyperopt import fmin, tpe, hp, Trials
from copy import deepcopy
import warnings
warnings.filterwarnings("ignore")
current_dir = os.getcwd()


def return_scores(y_true, y_pred):
    y_true = np.array(y_true).ravel()
    y_pred = np.array(y_pred).ravel()
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    pcc = pearsonr(y_true, y_pred)[0]
    return rmse, mae, r2, pcc

def return_x_y(df_filtered):
    y = df_filtered[label_name].values
    mask = ~np.isnan(y)

    # factors
    auxiliary_data = []
    if use_t_ph_embedding:
        ph = df_filtered['ph'].values.reshape(-1, 1)
        t = df_filtered['t'].values.reshape(-1, 1)
        auxiliary_data.append(ph)
        auxiliary_data.append(t)

    if use_mw_logp:
        mw = df_filtered['mw'].values.reshape(-1, 1)
        logp = df_filtered['logp'].values.reshape(-1, 1)
        auxiliary_data.append(mw)
        auxiliary_data.append(logp)

    protein_data = np.array(df_filtered[protein_column].tolist())
    substrate_data = np.array(df_filtered[substrate_column].tolist())
    x = np.hstack([protein_data, substrate_data] + auxiliary_data)

    return x[mask], y[mask]

def search_params(params):
    print(params)
    temp_params = deepcopy(params)
    temp_params.update({"device": "gpu", "gpu_platform_id": 0, "gpu_device_id": 0,
                        "verbosity": -1, "objective": "regression", "metric": ["rmse"]})
    num_rounds = temp_params.pop("num_iterations")

    val_scores_list = []
    for train_index, val_index in kf.split(df_train_val):
        df_train = df_train_val.iloc[train_index]
        df_val = df_train_val.iloc[val_index]

        train_x, train_y = return_x_y(df_train)
        val_x, val_y = return_x_y(df_val)

        # LightGBM Dataset
        train_data = lgb.Dataset(train_x, label=train_y)
        val_data = lgb.Dataset(val_x, label=val_y, reference=train_data)

        # LightGBM Regressor
        model = lgb.train(temp_params, train_data, num_rounds, valid_sets=[val_data], callbacks=[
            lgb.early_stopping(stopping_rounds=60)#, lgb.log_evaluation(period=100)
        ])

        # predict
        val_predicted = model.predict(val_x)
        val_scores = return_scores(val_y, val_predicted)
        val_scores_list.append(val_scores)

    val_scores_mean = np.mean(val_scores_list, axis=0)
    print(f"[Val_mean] rmse {val_scores_mean[0]:.3f} mae {val_scores_mean[1]:.3f} r2 {val_scores_mean[2]:.3f} pcc {val_scores_mean[3]:.3f}")

    return val_scores_mean[0]

def search_best_param(max_evals):
    space = {
        "learning_rate": hp.uniform("learning_rate", 0.01, 0.1),
        "max_depth": hp.randint("max_depth", 6, 10),
        "num_leaves": hp.randint("num_leaves", 20, 100),
        "min_data_in_leaf": hp.randint("min_data_in_leaf", 10, 30),
        "lambda_l1": hp.uniform("lambda_l1", 0, 1),
        "lambda_l2": hp.uniform("lambda_l2", 0, 1),
        "feature_fraction": hp.uniform("feature_fraction", 0.6, 1),
        "bagging_fraction": hp.uniform("bagging_fraction", 0.6, 1),
        "num_iterations": hp.randint("num_iterations", 1000, 3000),
    }

    trials = Trials()
    print(f'[Info] Starting parameter search...')
    best_params = fmin(fn=search_params, space=space, algo=tpe.suggest, max_evals=max_evals, trials=trials)

    best_params['max_depth'] = int(best_params['max_depth'])
    best_params['num_leaves'] = int(best_params['num_leaves'])
    best_params['num_iterations'] = int(best_params['num_iterations'])
    best_params['min_data_in_leaf'] = int(best_params['min_data_in_leaf'])

    # to json
    with open(params_json_path, 'w') as json_file:
        json.dump(best_params, json_file)

    return best_params


# main
protein_column,  substrate_column = 'prott5', 'molebert'
label_name = 'logkcatkm'
random_state = 66
search_max_evals = 60

df_input = pd.read_pickle(f'{current_dir}/../../data_process/dataset/df_all_log_transformed.pkl')
df_train_val, df_test = train_test_split(df_input, test_size=0.2, random_state=random_state)
kf = KFold(n_splits=5, shuffle=True, random_state=random_state)

# save results
results = []
cv_results = []
input_model = 'lgm'

use_t_ph_embedding = True
use_mw_logp = True

val_scores_list = []
test_scores_list = []

# search best params
params_json_path = f'{current_dir}/model_dict/{input_model}_params.json'
if os.path.exists(params_json_path):
    with open(params_json_path) as json_file:
        params = json.load(json_file)
else:
    params = search_best_param(search_max_evals)

# train
_params = deepcopy(params)
_params.update({"device": "gpu", "gpu_platform_id": 0, "gpu_device_id": 0,
                        "verbosity": -1, "objective": "regression", "metric": ["rmse"]})
num_rounds = _params.pop("num_iterations")

for fold, (train_index, val_index) in enumerate(kf.split(df_train_val), start=1):
    print(f"Fold: {fold}/5")
    df_train = df_train_val.iloc[train_index]
    df_val = df_train_val.iloc[val_index]

    train_x, train_y = return_x_y(df_train)
    val_x, val_y = return_x_y(df_val)
    test_x, test_y = return_x_y(df_test)

    # LightGBM Dataset
    train_data = lgb.Dataset(train_x, label=train_y)
    val_data = lgb.Dataset(val_x, label=val_y, reference=train_data)

    # LightGBM Regressor
    model = lgb.train(_params, train_data, num_rounds, valid_sets=[val_data], callbacks=[
        lgb.early_stopping(stopping_rounds=60), lgb.log_evaluation(period=500)
    ])

    # predict
    val_predicted = model.predict(val_x)
    val_scores = return_scores(val_y, val_predicted)
    val_scores_list.append(val_scores)

    # predict
    test_predicted = model.predict(test_x)
    test_scores = return_scores(test_y, test_predicted)
    test_scores_list.append(test_scores)

    # fold
    cv_results.append([
        fold,
        val_scores[0], val_scores[1], val_scores[2], val_scores[3],
        test_scores[0], test_scores[1], test_scores[2], test_scores[3]
    ])

# mean
val_scores_mean = np.mean(val_scores_list, axis=0)
test_scores_mean = np.mean(test_scores_list, axis=0)
print(f"Dimension of x: {train_x.shape[1]}")
print(f"[Val] rmse {val_scores_mean[0]:.4f} mae {val_scores_mean[1]:.4f} r2 {val_scores_mean[2]:.4f} pcc {val_scores_mean[3]:.4f} "
      f"[Test] rmse {test_scores_mean[0]:.4f} mae {test_scores_mean[1]:.4f} r2 {test_scores_mean[2]:.4f} pcc {test_scores_mean[3]:.4f}\n")

# save cvs
df_cv_results = pd.DataFrame(cv_results, columns=[
    "Fold",
    "Val_RMSE", "Val_MAE", "Val_R2", "Val_PCC",
    "Test_RMSE", "Test_MAE", "Test_R2", "Test_PCC"])
df_cv_results.to_excel(f"{current_dir}/results/{input_model}_cv_results.xlsx", index=False)
print("Results saved")

[Info] Starting parameter search...
{'bagging_fraction': 0.684038036638882, 'feature_fraction': 0.6377110418450845, 'lambda_l1': 0.6818439433538236, 'lambda_l2': 0.48412928004619604, 'learning_rate': 0.0498887866101942, 'max_depth': 9, 'min_data_in_leaf': 17, 'num_iterations': 1580, 'num_leaves': 67}
Training until validation scores don't improve for 60 rounds
Early stopping, best iteration is:                    
[823]	valid_0's rmse: 2.67927
Training until validation scores don't improve for 60 rounds
Early stopping, best iteration is:                    
[876]	valid_0's rmse: 2.62193
Training until validation scores don't improve for 60 rounds
Early stopping, best iteration is:                    
[1003]	valid_0's rmse: 2.57512
Training until validation scores don't improve for 60 rounds
Early stopping, best iteration is:                    
[977]	valid_0's rmse: 2.65803
Training until validation scores don't improve for 60 rounds
Early stopping, best iteration is:                  