In [1]:
import torch.nn as nn
import torch
import math
import warnings
warnings.filterwarnings("ignore")

class CNN1d(nn.Module):
    """1-D convolutional network applied to flat feature vectors.

    The input of shape ``(batch_size, input_dimension)`` is treated as a
    single-channel sequence. Each convolution is followed by the activation
    and a ``MaxPool1d(kernel_size=2, stride=2)``; the flattened feature map is
    then passed through fully connected layers (each followed by activation
    and dropout) and a final linear output layer.

    Args:
        input_dimension: Length of each input feature vector.
        output_dimension: Number of output units of the last linear layer.
        kernel_size_list: Kernel size for each convolution; must provide at
            least one entry per entry of ``filters_list``.
        filters_list: Number of output channels for each convolution.
        fc_unit: Width of each hidden fully connected layer.
        drop_ratio: Dropout probability used after each hidden FC layer.
        activation: Activation module class, instantiated once and shared.

    Raises:
        ValueError: If ``kernel_size_list`` is shorter than ``filters_list``
            or the feature map shrinks to zero length.
    """

    def __init__(self, input_dimension, output_dimension=1, kernel_size_list=[3, 3], filters_list=[16, 32], fc_unit=[128], drop_ratio=0.5, activation=nn.ReLU):
        super(CNN1d, self).__init__()

        if len(kernel_size_list) < len(filters_list):
            raise ValueError(
                f"kernel_size_list has {len(kernel_size_list)} entries but "
                f"filters_list needs {len(filters_list)}"
            )

        # Convolutional layers
        self.conv_layers = nn.ModuleList()
        in_channels = 1  # the flat input vector is treated as a single-channel sequence
        feature_map_dimension = input_dimension
        for kernel_size, out_channels in zip(kernel_size_list, filters_list):
            padding = (kernel_size - 1) // 2  # keeps the length for odd kernels
            self.conv_layers.append(nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, padding=padding))
            in_channels = out_channels

            # Track the real sequence length layer by layer: an even kernel
            # shortens the sequence by one, and the pooling floors the half.
            conv_length = feature_map_dimension + 2 * padding - kernel_size + 1
            feature_map_dimension = conv_length // 2

        if feature_map_dimension < 1:
            raise ValueError(
                f"input_dimension={input_dimension} is too small for "
                f"{len(self.conv_layers)} conv/pool stages"
            )

        self.relu = activation()
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)

        # Fully connected layers
        self.drop = nn.Dropout(p=drop_ratio)
        self.fc_layers = nn.ModuleList()
        in_features = in_channels * feature_map_dimension
        for units in fc_unit:
            self.fc_layers.append(nn.Linear(in_features, units))
            self.fc_layers.append(self.relu)
            self.fc_layers.append(self.drop)  # dropout after each hidden FC layer
            in_features = units

        self.fc_layers.append(nn.Linear(in_features, output_dimension))

    def forward(self, x):
        """Map ``(batch_size, input_dimension)`` to ``(batch_size, output_dimension)``.

        Errors (for example a wrong input width) propagate to the caller
        instead of being printed and hidden behind a partially processed
        tensor.
        """
        x = x.unsqueeze(1)  # add a channel dimension: (batch_size, 1, input_dimension)

        for conv in self.conv_layers:
            x = self.pool(self.relu(conv(x)))

        x = x.view(x.size(0), -1)  # flatten channels and positions

        for layer in self.fc_layers:
            x = layer(x)

        return x

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd
import random
import numpy as np
import torch.optim as optim
import torch.utils.data as Data
from hyperopt import fmin, tpe, hp, Trials, space_eval  # 超参数搜索
import json
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split, KFold
import os
# Working directory at start-up; used as the base for data and output paths.
current_dir = os.getcwd()


# Seed every random number generator in use with the same value.
random_state = 66
for _seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_fn(random_state)
del _seed_fn  # do not leave the loop variable in the notebook namespace

def return_scores(y_true, y_pred):
    """Compute regression metrics for a set of predictions.

    Both inputs are converted to arrays and flattened to 1-D first.

    Returns:
        Tuple ``(rmse, mae, r2, pcc)`` where ``pcc`` is the Pearson
        correlation coefficient.
    """
    truth = np.array(y_true).ravel()
    estimate = np.array(y_pred).ravel()

    mse = mean_squared_error(truth, estimate)
    mae = mean_absolute_error(truth, estimate)
    r2 = r2_score(truth, estimate)
    pcc = pearsonr(truth, estimate)[0]  # first element is the correlation

    return np.sqrt(mse), mae, r2, pcc


def return_data_loader(x, y, batch_size, shuffle=True, seed=66):
    """Wrap features and labels in a ``DataLoader`` of float tensors.

    As a side effect the Python, NumPy and PyTorch (CPU and CUDA) random
    number generators are re-seeded with ``seed`` on every call.

    Args:
        x: Features, convertible by ``torch.FloatTensor``.
        y: Labels, convertible by ``torch.FloatTensor``.
        batch_size: Number of samples per batch.
        shuffle: Whether the loader shuffles the samples.
        seed: Seed applied to all random number generators.

    Returns:
        A ``DataLoader`` over a ``TensorDataset`` of ``(x, y)``.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    dataset = Data.TensorDataset(torch.FloatTensor(x), torch.FloatTensor(y))
    return Data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

def return_x_y(df_filtered):
    """Build the feature matrix and label vector from a data frame.

    Features are the protein and substrate columns stacked horizontally,
    optionally followed by ``ph``/``t`` and ``mw``/``logp`` columns depending
    on the module-level flags ``use_t_ph_embedding`` and ``use_mw_logp``.
    Rows whose label (``label_name`` column) is NaN are dropped.

    Assumes each cell of the protein/substrate columns holds an equal-length
    numeric sequence so that they stack into 2-D arrays (TODO confirm).

    Returns:
        Tuple ``(x, y)`` restricted to rows with a non-NaN label.
    """
    targets = df_filtered[label_name].values
    has_label = ~np.isnan(targets)

    def as_column(name):
        # Scalar column reshaped to (n_rows, 1) so it can be hstack-ed.
        return df_filtered[name].values.reshape(-1, 1)

    # Optional scalar factors, in the order ph, t, mw, logp.
    extra_names = []
    if use_t_ph_embedding:
        extra_names += ['ph', 't']
    if use_mw_logp:
        extra_names += ['mw', 'logp']
    extras = [as_column(name) for name in extra_names]

    embeddings = [
        np.array(df_filtered[protein_column].tolist()),
        np.array(df_filtered[substrate_column].tolist()),
    ]
    features = np.hstack(embeddings + extras)

    return features[has_label], targets[has_label]


def train_one_epoch(model, optimizer, train_loader):
    """Run one optimisation pass over ``train_loader`` with MSE loss.

    Batches are moved to the module-level ``device``. The optimizer is
    stepped and its gradients cleared after every batch.

    Returns:
        Tuple ``(mean_batch_loss, model)`` where the loss is the sum of the
        per-batch losses divided by the number of batches.
    """
    criterion = torch.nn.MSELoss()
    running_loss = torch.zeros(1).to(device)  # accumulated loss

    model.train()
    optimizer.zero_grad()

    for step, batch in enumerate(train_loader):
        features = batch[0].to(device)
        targets = batch[1].to(device)

        prediction = model(features).float().squeeze()
        batch_loss = criterion(prediction, targets.float())
        batch_loss.backward()
        running_loss += batch_loss.detach()

        # Gradient clipping before the weight update is currently disabled:
        # torch.nn.utils.clip_grad_value_([p for p in model.parameters() if p.requires_grad], clip_value=clip_value)
        optimizer.step()
        optimizer.zero_grad()

    mean_loss = running_loss.item() / (step + 1)
    return mean_loss, model


def evaluate_model(model, data_loader, mode='search'):
    """Evaluate ``model`` on ``data_loader`` without gradient tracking.

    Batches are moved to the module-level ``device``.

    Args:
        model: Network to evaluate; it is switched to eval mode.
        data_loader: Loader yielding ``(inputs, labels)`` batches.
        mode: ``'search'`` returns the mean per-batch MSE loss; any other
            value returns the collected predictions and labels.

    Returns:
        A float loss when ``mode == 'search'``, otherwise a tuple
        ``(all_pred, all_labels)`` of lists.
    """
    collect_outputs = mode != 'search'
    predictions, targets = [], []

    model.eval()
    with torch.no_grad():
        criterion = torch.nn.MSELoss()
        total_loss = torch.zeros(1).to(device)  # accumulated loss

        for batch in data_loader:
            inputs = batch[0].to(device)
            labels = batch[1].to(device)
            outputs = model(inputs)

            batch_loss = criterion(outputs.float().squeeze(), labels.float())
            total_loss += batch_loss.detach()

            if collect_outputs:
                predictions.extend(outputs.cpu().numpy())
                targets.extend(labels.cpu().numpy())

    torch.cuda.empty_cache()  # release unused cached memory

    if not collect_outputs:
        return total_loss.item() / len(data_loader)  # mean loss over batches
    return predictions, targets


def search_model(params, train_x, train_y, val_x, val_y):
    """Train a ``CNN1d`` with ``params`` and return its best validation loss.

    Uses AdamW with a cosine learning-rate schedule and stops early once the
    validation loss has not improved for more than the module-level
    ``patience`` epochs. The model is placed on the module-level ``device``.

    Args:
        params: Hyper-parameters with keys ``batch_size``, ``kernel_size``,
            ``num_filters``, ``fc_unit``, ``drop_ratio``, ``lr``, ``lrf`` and
            ``epochs``.
        train_x, train_y: Training features and labels.
        val_x, val_y: Validation features and labels.

    Returns:
        The lowest validation loss observed during training.
    """
    # data loader
    train_loader = return_data_loader(train_x, train_y, batch_size=params['batch_size'], shuffle=True, seed=random_state)
    val_loader = return_data_loader(val_x, val_y, batch_size=params['batch_size'], shuffle=False, seed=random_state)

    model = CNN1d(
        input_dimension=len(train_x[0]),
        output_dimension=1,
        kernel_size_list=[params['kernel_size']] * len(params['num_filters']),  # e.g. [3, 3]
        filters_list=params['num_filters'],  # e.g. [16, 32]
        fc_unit=params['fc_unit'],  # e.g. [128]
        drop_ratio=params['drop_ratio']
    ).to(device)

    # optimizer
    pg = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.AdamW(pg, lr=params['lr'], weight_decay=5E-5)
    # Cosine decay of the LR multiplier from 1 down to params['lrf'].
    lf = lambda x: ((1 + math.cos(x * math.pi / params['epochs'])) / 2) * (1 - params['lrf']) + params['lrf']
    scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)

    # np.inf rather than the np.Inf alias, which NumPy 2.0 removed.
    best_loss = np.inf
    best_epoch, patience_nums = 0, 0

    for epoch_idx in range(params['epochs']):
        # train
        train_loss, model = train_one_epoch(model, optimizer, train_loader)
        scheduler.step()

        # evaluate
        val_loss = evaluate_model(model, val_loader, mode='search')
        if epoch_idx % 100 == 0:
            print(f'Epoch {epoch_idx} Train loss {train_loss:.3f} Val loss {val_loss:.3f}')

        # compare
        if val_loss <= best_loss:
            best_epoch = epoch_idx
            best_loss = val_loss
            patience_nums = 0

        else:
            patience_nums += 1

        if patience_nums > patience:
            break

    # print log (the reported epoch is the best epoch, not the stopping epoch)
    if patience_nums > patience:
        print(f'Early stopped at epoch {best_epoch} best_val_loss {best_loss:.3f}')
    else:
        print(f'Stopped at epoch {best_epoch} best_val_loss {best_loss:.3f}')

    return best_loss


def _search_params(params):
    """Objective for the hyper-parameter search.

    Trains one model per fold of the module-level ``kf`` split of
    ``df_train_val`` and returns the mean of the per-fold best validation
    losses.
    """
    print(params)

    fold_losses = []
    for fold_idx, (train_index, val_index) in enumerate(kf.split(df_train_val), start=1):
        print(f"Fold: {fold_idx}/5")

        train_x, train_y = return_x_y(df_train_val.iloc[train_index])
        val_x, val_y = return_x_y(df_train_val.iloc[val_index])

        fold_losses.append(search_model(params, train_x, train_y, val_x, val_y))

    val_loss_mean = np.mean(fold_losses, axis=0)
    print(f"val MSE loss mean: {val_loss_mean:.5f}\n")

    return val_loss_mean


def search_best_param(max_evals):
    """Run a TPE hyper-parameter search and persist the best configuration.

    The objective is ``_search_params``. The winning parameters are written
    as JSON to the module-level ``params_json_path``.

    Args:
        max_evals: Number of configurations to evaluate.

    Returns:
        The best hyper-parameter dictionary, with choices resolved to their
        actual values.
    """
    # Make sure the output directory exists before the (long) search starts,
    # so the result is not lost to a missing folder when it is written.
    output_dir = os.path.dirname(params_json_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    space = {
        'lr': hp.uniform('lr', 1e-4, 1e-3),
        'lrf': hp.choice('lrf', [0.01]),
        'drop_ratio': hp.uniform('drop_ratio', 0.1, 0.6),
        'kernel_size': hp.choice('kernel_size', [3, 5]),
        'fc_unit': hp.choice('fc_unit', [(64,), (128, 64)]),
        'num_filters': hp.choice('num_filters', [(16, 32), (16,), (32,)]),
        'batch_size': hp.choice('batch_size', [128, 256]),
        'epochs': hp.choice('epochs', [200, 300]),
    }

    trials = Trials()
    print(f'[Info] Starting parameter search with MSE_Loss...')
    best_params = fmin(fn=_search_params, space=space, algo=tpe.suggest, max_evals=max_evals, trials=trials)
    # fmin reports hp.choice entries as indices; map them back to values.
    best_params = space_eval(space, best_params)

    # Save the best params to JSON
    with open(params_json_path, 'w') as json_file:
        json.dump(best_params, json_file)

    return best_params


# config
# Run-wide settings. The functions defined above read several of these names
# as module-level globals (device, patience, label_name, the column names and
# the two feature flags), so they must be set before those functions run.
device = torch.device('cuda:0' if torch.cuda.is_available() else "cpu")
print(f"Current divice: {device}")
# Feature flags: append the 'ph'/'t' and 'mw'/'logp' columns to the inputs.
use_t_ph_embedding = True
use_mw_logp = True
# Number of hyper-parameter configurations evaluated by the search.
search_max_evals = 60
# Early stopping: epochs without validation improvement that are tolerated.
patience = 30
# Column holding the regression target.
label_name = 'logkcatkm'
# Columns holding the protein and substrate feature vectors.
protein_column,  substrate_column = 'prott5', 'molebert'
# Tag used in the names of the parameter and result files.
input_model = 'cnn1d_standard'
# NOTE(review): read_pickle executes arbitrary code from the file; only load
# pickles from a trusted source.
df_standardized = pd.read_pickle(f'{current_dir}/../../data_process/dataset/df_standardized.pkl')
# Hold out 20% as the test set; the remainder is split into 5 CV folds.
df_train_val, df_test = train_test_split(df_standardized, test_size=0.2, random_state=random_state)
kf = KFold(n_splits=5, shuffle=True, random_state=random_state)

# Reuse previously saved hyper-parameters when present; otherwise search.
params_json_path = f'{current_dir}/model_dict/{input_model}_params.json'
if os.path.exists(params_json_path):
    with open(params_json_path) as json_file:
        params = json.load(json_file)
else:
    params = search_best_param(search_max_evals)

print(f'Best params:{params}\n')

# Train
# 5-fold cross-validation with the selected hyper-parameters. For each fold
# the weights of the best validation epoch are scored on the validation fold
# and on the held-out test set.
val_scores_list, test_scores_list = [], []
fold_results = []

# The test split does not depend on the fold, so build it once.
test_x, test_y = return_x_y(df_test)
test_loader = return_data_loader(test_x, test_y, batch_size=params['batch_size'], shuffle=False, seed=random_state)

for fold_idx, (train_index, val_index) in enumerate(kf.split(df_train_val), start=1):
    print(f"Fold: {fold_idx}/5")
    df_train = df_train_val.iloc[train_index]
    df_val = df_train_val.iloc[val_index]

    train_x, train_y = return_x_y(df_train)
    val_x, val_y = return_x_y(df_val)

    # data loader
    train_loader = return_data_loader(train_x, train_y, batch_size=params['batch_size'], shuffle=True, seed=random_state)
    val_loader = return_data_loader(val_x, val_y, batch_size=params['batch_size'], shuffle=False, seed=random_state)

    model = CNN1d(
        input_dimension=len(train_x[0]),
        output_dimension=1,
        kernel_size_list=[params['kernel_size'] for i in range(len(params['num_filters']))],  # e.g. [3, 3]
        filters_list=params['num_filters'],  # e.g. [16, 32]
        fc_unit=params['fc_unit'],  # e.g. [128]
        drop_ratio=params['drop_ratio']
    ).to(device)

    # optimizer
    pg = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.AdamW(pg, lr=params['lr'], weight_decay=5E-5)
    # Cosine decay of the LR multiplier from 1 down to params['lrf'].
    lf = lambda x: ((1 + math.cos(x * math.pi / params['epochs'])) / 2) * (1 - params['lrf']) + params['lrf']
    scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)

    # np.inf rather than the np.Inf alias, which NumPy 2.0 removed.
    best_loss = np.inf
    best_epoch, patience_nums, best_state = 0, 0, None

    # train
    for epoch_idx in range(params['epochs']):
        train_loss, model = train_one_epoch(model, optimizer, train_loader)
        scheduler.step()

        val_loss = evaluate_model(model, val_loader, mode='search')

        # compare
        if val_loss <= best_loss:
            # Snapshot the weights. Keeping a reference to `model` would keep
            # following later epochs, because state_dict tensors share
            # storage with the live parameters.
            best_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
            best_epoch = epoch_idx
            best_loss = val_loss
            patience_nums = 0

        else:
            patience_nums += 1

        if patience_nums > patience:
            print(f'Early stopped at epoch {best_epoch} best_val_loss {best_loss:.3f}')
            break
        if epoch_idx % 50 == 0:
            print(f"[Epoch {epoch_idx} fold {fold_idx} {label_name}] Train loss {train_loss:.3f} Val loss {val_loss:.3f}")

    # Restore the best-epoch weights before scoring.
    if best_state is not None:
        model.load_state_dict(best_state)
    best_model = model

    val_pred, val_labels = evaluate_model(best_model, val_loader, mode='val')
    test_pred, test_labels = evaluate_model(best_model, test_loader, mode='test')

    # scores
    val_scores = return_scores(val_labels, val_pred)
    test_scores = return_scores(test_labels, test_pred)
    val_scores_list.append(val_scores)
    test_scores_list.append(test_scores)

    # fold
    fold_results.append([
        fold_idx,
        val_scores[0], val_scores[1], val_scores[2], val_scores[3],
        test_scores[0], test_scores[1], test_scores[2], test_scores[3]
    ])

# mean
val_scores_mean = np.mean(val_scores_list, axis=0)
test_scores_mean = np.mean(test_scores_list, axis=0)

print(f"Dimension of x: {train_x.shape[1]}")
print(f"[Val] rmse {val_scores_mean[0]:.4f} mae {val_scores_mean[1]:.4f} r2 {val_scores_mean[2]:.4f} pcc {val_scores_mean[3]:.4f} "
      f"[Test] rmse {test_scores_mean[0]:.4f} mae {test_scores_mean[1]:.4f} r2 {test_scores_mean[2]:.4f} pcc {test_scores_mean[3]:.4f}\n")

# save the per-fold results
df_cv_results = pd.DataFrame(fold_results, columns=[
    "Fold",
    "Val_RMSE", "Val_MAE", "Val_R2", "Val_PCC",
    "Test_RMSE", "Test_MAE", "Test_R2", "Test_PCC"])
# Create the output directory so a missing folder does not lose the results.
results_dir = f"{current_dir}/results"
os.makedirs(results_dir, exist_ok=True)
df_cv_results.to_excel(f"{results_dir}/{input_model}_cv_results.xlsx", index=False)
print("Results saved")

Current divice: cuda:0
[Info] Starting parameter search with MSE_Loss...
  0%|          | 0/60 [00:00<?, ?trial/s, best loss=?]                                                      {'batch_size': 128, 'drop_ratio': 0.2344086664104653, 'epochs': 300, 'fc_unit': (64,), 'kernel_size': 5, 'lr': 0.0004006511173575456, 'lrf': 0.01, 'num_filters': (16, 32)}
  0%|          | 0/60 [00:00<?, ?trial/s, best loss=?]                                                      Fold: 1/5
  0%|          | 0/60 [00:00<?, ?trial/s, best loss=?]                                                      Epoch 0 Train loss 16.374 Val loss 15.970
  0%|          | 0/60 [00:07<?, ?trial/s, best loss=?]                                                      Epoch 100 Train loss 4.229 Val loss 8.551
  0%|          | 0/60 [00:46<?, ?trial/s, best loss=?]                                                      Early stopped at epoch 167 best_val_loss 8.351
  0%|          | 0/60 [01:23<?, ?trial/s, best loss=?]   