# UniKP with CBW

In [1]:
"""Adapted from UniKP (https://github.com/Luo-SynBioLab/UniKP), which is licensed under GPL-3.0."""
import os, math
from sklearn.ensemble import ExtraTreesRegressor
import pandas as pd
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from scipy.stats import pearsonr
import random
from collections import Counter
from scipy.ndimage import gaussian_filter1d
from scipy.signal.windows import triang
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
import numpy as np
from sklearnex import patch_sklearn
patch_sklearn()
# Working directory; the dataset paths further down are built relative to it.
current_dir = os.getcwd()
# Fixed seed applied to both Python's `random` module and NumPy's global RNG.
random_state = 66
random.seed(random_state)
np.random.seed(random_state)

def return_scores(y_test, y_pred):
    """Compute regression metrics for predictions against reference values.

    Positions where ``y_test`` is NaN are dropped from both arrays before
    scoring; NaNs in ``y_pred`` are not filtered.

    Args:
        y_test: Reference values; must support boolean-mask indexing.
        y_pred: Predicted values aligned with ``y_test``.

    Returns:
        Tuple ``(rmse, mae, r2, pcc)`` where ``pcc`` is the first element of
        the ``pearsonr`` result.
    """
    # Drop NaN values
    keep = np.logical_not(np.isnan(y_test))
    observed, predicted = y_test[keep], y_pred[keep]

    # Scores, evaluated in the order they are returned
    root_mse = np.sqrt(mean_squared_error(observed, predicted))
    abs_err = mean_absolute_error(observed, predicted)
    r_squared = r2_score(observed, predicted)
    pearson = pearsonr(observed, predicted)

    return root_mse, abs_err, r_squared, pearson[0]


def return_x_y(df_filtered):
    """Build the feature matrix and target vector from a dataframe slice.

    Reads the module-level settings ``label_name``, ``use_t_ph_embedding``,
    ``use_mw_logp``, ``protein_column`` and ``substrate_column``. Rows whose
    label is NaN are removed from both outputs.

    Args:
        df_filtered: DataFrame holding the label column, the protein and
            substrate feature columns, and (when enabled) the 'ph', 't',
            'mw' and 'logp' columns.

    Returns:
        Tuple ``(x, y)``: the horizontally stacked features (protein,
        substrate, then the enabled auxiliary columns) and the labels.
    """
    targets = df_filtered[label_name].values
    has_label = ~np.isnan(targets)

    # Auxiliary scalar factors, each appended as a single column
    extra_columns = []
    if use_t_ph_embedding:
        extra_columns += ['ph', 't']
    if use_mw_logp:
        extra_columns += ['mw', 'logp']
    auxiliary = [df_filtered[col].values.reshape(-1, 1) for col in extra_columns]

    feature_blocks = [
        np.array(df_filtered[protein_column].tolist()),
        np.array(df_filtered[substrate_column].tolist()),
    ]
    features = np.hstack(feature_blocks + auxiliary)

    return features[has_label], targets[has_label]

def get_lds_kernel_window(kernel, ks, sigma):
    """Return a smoothing kernel window for the requested kernel type.

    Args:
        kernel: One of 'gaussian', 'triang' or 'laplace'.
        ks: Kernel size. The 'gaussian' and 'laplace' windows have
            ``2 * ((ks - 1) // 2) + 1`` entries; 'triang' has ``ks``.
        sigma: Width parameter for 'gaussian' and 'laplace'; unused by
            'triang'.

    Returns:
        The kernel window; the 'gaussian' and 'laplace' windows are divided
        by their own maximum so the peak equals 1.
    """
    assert kernel in ['gaussian', 'triang', 'laplace']
    half_ks = (ks - 1) // 2

    if kernel == 'triang':
        return triang(ks)

    if kernel == 'gaussian':
        # Smooth a unit impulse placed at the centre of the window.
        impulse = [0.] * half_ks + [1.] + [0.] * half_ks
        smoothed = gaussian_filter1d(impulse, sigma=sigma)
        return smoothed / max(smoothed)

    def laplace_density(offset):
        return np.exp(-abs(offset) / sigma) / (2. * sigma)

    densities = [laplace_density(offset) for offset in np.arange(-half_ks, half_ks + 1)]
    # Dividing the list by a NumPy scalar yields an ndarray.
    return densities / max(densities)


def Smooth_Label(Label_new, beta=0.9, bins_per_unit=10, verbose=True):
    """Compute per-sample weights from the binned label distribution.

    Labels are shifted so the smallest becomes 0, then binned with
    ``int(label * bins_per_unit)``. Each bin with ``n`` samples gets an
    effective count ``(1 - beta**n) / (1 - beta)``, and every sample is
    weighted by the reciprocal of its bin's effective count.

    The minimum is computed once on a copy of the input, so the caller's
    data is never modified and every label is shifted by the same amount.

    Args:
        Label_new: 1-D sequence or array of finite numeric labels.
        beta: Smoothing factor in ``[0, 1)`` used in the effective count.
        bins_per_unit: Number of bins per unit of label value.
        verbose: When True (default), print the intermediate statistics.

    Returns:
        ``np.ndarray`` of dtype float32 with one weight per label; an empty
        array when ``Label_new`` is empty.

    Raises:
        ValueError: If ``beta`` is outside ``[0, 1)``, the input is not 1-D,
            or it contains NaN / infinite values.
    """
    if not 0 <= beta < 1:
        raise ValueError("beta must satisfy 0 <= beta < 1")

    # Work on a float copy so the caller's labels stay untouched.
    labels = np.array(Label_new, dtype=float)
    if labels.ndim != 1:
        raise ValueError("Label_new must be one-dimensional")
    if labels.size == 0:
        return np.array([], dtype=np.float32)
    if not np.all(np.isfinite(labels)):
        raise ValueError("Label_new must not contain NaN or infinite values")

    # Shift by the global minimum (computed once), so all values are >= 0
    # and truncation to int yields non-negative bin indices.
    shifted = labels - labels.min()
    bin_index_per_label = (shifted * bins_per_unit).astype(int).tolist()

    Nb = max(bin_index_per_label) + 1
    num_samples_of_bins = dict(Counter(bin_index_per_label))
    emp_label_dist = [num_samples_of_bins.get(i, 0) for i in range(Nb)]

    # Empty bins get an effective count of 0, but they are never looked up
    # below because only occupied bins appear in bin_index_per_label.
    eff_label_dist = [
        (1 - math.pow(beta, count)) / (1 - beta) for count in emp_label_dist
    ]
    weights = np.array(
        [1 / eff_label_dist[bin_idx] for bin_idx in bin_index_per_label],
        dtype=np.float32,
    )

    if verbose:
        print(Nb)
        print(num_samples_of_bins)
        print(emp_label_dist, len(emp_label_dist))
        print(eff_label_dist)
        print(weights)
        print(len(weights))
    return weights


# Load the log-transformed dataset and the fold index table (stored under
# dataset/cdhit/), both addressed relative to the working directory.
print('Reading data...', end='')
df_input = pd.read_pickle(current_dir + '/../../data_process/dataset/df_all_log_transformed.pkl')
df_fold_index = pd.read_pickle(current_dir + '/../../data_process/dataset/cdhit/cdhit_fold_index.pkl')
print('Finished.')

# TODO Split dataset
# Experiment configuration read by return_x_y and the training loop.
label_name = 'logkcatkm'
use_t_ph_embedding = True
use_mw_logp = True
protein_column = 'prott5'
substrate_column = 'molebert'
score_names = ['rmse', 'mae', 'r2', 'pcc']


# Cross-validation: one ExtraTreesRegressor per (train, validation) index
# pair stored in df_fold_index; validation scores are averaged at the end.
val_scores_list = []
n_folds = len(df_fold_index)
for fold_idx, (fold_train_idx, fold_val_idx) in enumerate(df_fold_index.values.tolist(), start=1):
    print(f"Fold: {fold_idx}/{n_folds}")
    train_x, train_y = return_x_y(df_input.loc[fold_train_idx])
    val_x, val_y = return_x_y(df_input.loc[fold_val_idx])

    # Seed the model explicitly so each fold is reproducible regardless of
    # how much of the global NumPy RNG state was consumed beforehand.
    # NOTE(review): Smooth_Label is defined in this file but is not called
    # in this loop, so the model is fit without sample weights although the
    # header reads "UniKP with CBW" -- confirm whether
    # `sample_weight=Smooth_Label(train_y.copy())` was intended here.
    model = ExtraTreesRegressor(n_jobs=-1, random_state=random_state)
    model.fit(train_x, train_y)

    val_pred = model.predict(val_x)
    val_scores = return_scores(val_y, val_pred)

    val_scores_list.append(val_scores)
    print(f'Val  fold{fold_idx} ', val_scores)

# Column-wise mean over folds, in the order (rmse, mae, r2, pcc).
val_scores_mean = np.mean(val_scores_list, axis=0)
print(f"UniKP with CBW\t RMSE\t MAE\t R2\t PCC\t")
print(f"Val_mean \t {val_scores_mean[0]:.4f} \t {val_scores_mean[1]:.4f} \t {val_scores_mean[2]:.4f} \t {val_scores_mean[3]:.4f}\n")

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


Reading data...Finished.
Fold: 1/5
Val  fold1  (3.8770357550684955, 3.0500917384642974, 0.09173480170004766, 0.3045479577896282)
Fold: 2/5
Val  fold2  (4.129246712301715, 3.1380462672368354, 0.0527759213688771, 0.2447088003343009)
Fold: 3/5
Val  fold3  (3.8617476029285407, 3.078067992353398, 0.09880543694889066, 0.3161083030342393)
Fold: 4/5
Val  fold4  (3.8971410787300442, 2.996582645086984, 0.08677901853441627, 0.2986677498670458)
Fold: 5/5
Val  fold5  (4.0559217749563805, 3.190458667517914, 0.10158010606408208, 0.32447202088101107)
UniKP with CBW	 RMSE	 MAE	 R2	 PCC	
Val_mean 	 3.9642 	 3.0906 	 0.0863 	 0.2977

