# TAL Predict

In [1]:
import os
import pandas as pd
import sys
# Extend the import path. The two inserts are order-dependent: the second one
# reads the entry that the first one has just pushed to the front of sys.path.
sys.path.insert(0, sys.path[0] + "/../../")
sys.path.insert(0, sys.path[0] + "/../")
current_dir = os.getcwd()

print(f"Current directory: {current_dir}")

# Pull the three mutant sheets out of the workbook in a single read; a list of
# sheet names yields a dict of DataFrames keyed by sheet name.
mutant_sheets = pd.read_excel(
    f'{current_dir}/mutant_data.xlsx',
    sheet_name=['IsTAL', 'RgTAL', 'trCsMSX'],
)
df_istal = mutant_sheets['IsTAL']
df_rgtal = mutant_sheets['RgTAL']
df_trcsmsx = mutant_sheets['trCsMSX']

# Stack the sheets in a fixed order and renumber the rows 0..n-1.
df = pd.concat([df_istal, df_rgtal, df_trcsmsx], ignore_index=True)

print(len(df))
df.head()

Current directory: /nfs/my/Xu/jicm/WWDynoMTGBM/kinetic_params_evaluate/kcatkm_mutant_trend_test
26


Unnamed: 0,Number,Organism,Substrate,pH,Tempetature,Smiles,Sequence,Unnamed: 7,Unnamed: 8,Unnamed: 9,Experimental kcat/Km,Temperature,Type
0,IsTAL,Ilyonectria sp. MPI-CAGE-AT-0026,L-tyrosine,9.5,40.0,C1=CC(=CC=C1CC(C(=O)O)N)O,MGKATGHLKYDVHQQWPTPHVNKALESWARATELVKTGSVIIDGES...,,,,210.0,,
1,RgTAL,Rhodotorula glutinis,L-tyrosine,9.5,40.0,C1=CC(=CC=C1CC(C(=O)O)N)O,MAPRPTSQSQARTCPTTQVTQVDIVEKMLAAPTDSTLELDGYSLNL...,,,,327.2,,
2,PcTAL,Puccinia coronata f. sp. avenae,L-tyrosine,9.5,40.0,C1=CC(=CC=C1CC(C(=O)O)N)O,MTQQRQVASGSPHTVLAQQLISPLLLNHTNSKNPVTVVTIDGHSLS...,,,,391.6,,
3,SsTAL,Sporidiobolus salmonicolor,L-tyrosine,9.5,40.0,C1=CC(=CC=C1CC(C(=O)O)N)O,MVIRCNSLLRGHSAIRLSVLETLIKLINLNITPVVPLRGSISASGD...,,,,346.3,,
4,HiTAL,Heterobasidion irregulare,L-tyrosine,9.5,40.0,C1=CC(=CC=C1CC(C(=O)O)N)O,MPESWVRGAILVRANSLIRGHSGVRWELVEKMIELLRSNITPLVPL...,,,,361.9,,


In [2]:
import torch
from transformers import T5Tokenizer, T5EncoderModel
from tqdm import trange, tqdm
import re

def run_calculate(sequences, tokenizer, model, batch_size=2):
    """Embed protein sequences with a ProtT5-style encoder using mean pooling.

    Relies on the notebook-level ``device`` for placing the input tensors, so
    ``model`` must already live on that device.

    Args:
        sequences: Amino-acid strings (a list or a pandas Series).
        tokenizer: Callable returning a mapping with ``input_ids`` and
            ``attention_mask`` for a list of space-separated sequences.
        model: Encoder whose output exposes ``last_hidden_state``.
        batch_size: Number of sequences embedded per forward pass.

    Returns:
        DataFrame with columns ``Sequence`` and ``prott5`` holding one row per
        sequence that was embedded successfully. Sequences belonging to a
        failed batch are reported and left out, so the two columns always stay
        aligned. When ``sequences`` is a Series, its index labels are kept.
    """
    raw_sequences = list(sequences)
    source_index = sequences.index if isinstance(sequences, pd.Series) else None
    lengths = [len(seq) for seq in raw_sequences]

    # replace all rare/ambiguous amino acids by X and introduce white-space between all amino acids
    sequence_preprocess = [" ".join(list(re.sub(r"[UZOB]", "X", sequence))) for sequence in raw_sequences]

    embeddings = []
    kept_positions = []   # positions in raw_sequences that produced an embedding
    error_sequences = []

    # batch
    for start_idx in trange(0, len(sequence_preprocess), batch_size):
        batch_sequences = sequence_preprocess[start_idx: start_idx + batch_size]
        length_of_sequences = lengths[start_idx: start_idx + batch_size]

        # tokenize sequences and pad up to the longest sequence in the batch
        ids = tokenizer(batch_sequences, add_special_tokens=True, padding="longest")

        input_ids = torch.tensor(ids['input_ids']).to(device)
        attention_mask = torch.tensor(ids['attention_mask']).to(device)

        try:
            # generate embeddings
            with torch.no_grad():
                embedding_repr = model(input_ids=input_ids, attention_mask=attention_mask)

            # Average the first len(sequence) token states of every sequence.
            # Assumes one token per residue with padding/special tokens placed
            # after the residues -- TODO confirm against the tokenizer in use.
            # Collected in a temporary list so a failure midway through the
            # batch cannot leave a partial, misaligned result behind.
            batch_embeddings = [
                embedding_repr.last_hidden_state[idx_in_batch, :length_of_sequences[idx_in_batch]]
                .mean(dim=0).cpu().numpy()
                for idx_in_batch in range(len(batch_sequences))
            ]
        except Exception as e:
            print(f"Error processing batch starting at index {start_idx}: {e}")
            error_sequences.extend(batch_sequences)
            continue

        embeddings.extend(batch_embeddings)
        kept_positions.extend(range(start_idx, start_idx + len(batch_sequences)))

    if error_sequences:
        print(f"Skipped {len(error_sequences)} sequence(s) that could not be embedded")

    result_index = source_index.take(kept_positions) if source_index is not None else None
    df_result = pd.DataFrame(
        {'Sequence': [raw_sequences[pos] for pos in kept_positions], 'prott5': embeddings},
        index=result_index,
    )
    return df_result

# protT5 XL Uniref50
input_sequences = df['Sequence']
# model
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('device: ', device)

df_prott5_path = f'{current_dir}/results/df_prott5.pkl'
if os.path.exists(df_prott5_path):
    # Reuse the cached embeddings. NOTE(review): the cache is not checked
    # against the current input sequences; delete the file after changing
    # mutant_data.xlsx. Only unpickle cache files produced by this notebook.
    df_prott5 = pd.read_pickle(df_prott5_path)
else:
    # Create the cache directory up front so the expensive embedding run
    # cannot be lost to a missing results/ folder when it is pickled.
    os.makedirs(os.path.dirname(df_prott5_path), exist_ok=True)

    # Load the tokenizer and encoder from the same local checkpoint directory
    prott5_model_dir = f'{current_dir}/../../data_process/inferred_functions/prott5/pre_models'
    tokenizer = T5Tokenizer.from_pretrained(prott5_model_dir, do_lower_case=False)
    prott5_model = T5EncoderModel.from_pretrained(prott5_model_dir).to(device)
    df_prott5 = run_calculate(input_sequences, tokenizer, prott5_model, batch_size=2)
    df_prott5.to_pickle(df_prott5_path)
df_prott5.head()

  from .autonotebook import tqdm as notebook_tqdm


device:  cuda:0


Unnamed: 0,Sequence,prott5
0,MGKATGHLKYDVHQQWPTPHVNKALESWARATELVKTGSVIIDGES...,"[0.05902582, -0.011174141, 0.03016772, -0.0091..."
1,MAPRPTSQSQARTCPTTQVTQVDIVEKMLAAPTDSTLELDGYSLNL...,"[0.05909447, -0.016460072, 0.023931371, -0.016..."
2,MTQQRQVASGSPHTVLAQQLISPLLLNHTNSKNPVTVVTIDGHSLS...,"[0.034160577, -0.037989005, 0.027064191, 0.021..."
3,MVIRCNSLLRGHSAIRLSVLETLIKLINLNITPVVPLRGSISASGD...,"[0.03857084, -0.011963796, 0.017409109, -0.009..."
4,MPESWVRGAILVRANSLIRGHSGVRWELVEKMIELLRSNITPLVPL...,"[0.04258501, 0.023547765, 0.05098012, 0.012392..."


In [3]:
df_molebert_path = f'{current_dir}/results/df_molebert.pkl'
if os.path.exists(df_molebert_path):
    # Reuse cached substrate embeddings (only unpickle files written by this notebook).
    df_molebert = pd.read_pickle(df_molebert_path)
else:
    from WWDynoMTGBM.data_process.inferred_functions.mole_bert.model import GNN_graphpred
    from WWDynoMTGBM.data_process.inferred_functions.mole_bert.loader import mol_to_graph_data_obj_simple
    from rdkit import Chem
    MODEL_CONFIG = {
        "num_layer": 5,  # number of graph conv layers
        "emb_dim": 300,  # embedding dimension in graph conv layers
        "num_tasks": 1,  # output feature dimension
        "drop_ratio": 0.5,  # dropout ratio
    }

    mole_bert_model = GNN_graphpred(MODEL_CONFIG['num_layer'], MODEL_CONFIG['emb_dim'], num_tasks=MODEL_CONFIG['num_tasks'], drop_ratio=MODEL_CONFIG['drop_ratio']).to(device)
    # The checkpoint sits next to the mole_bert package imported above, i.e.
    # two levels above this notebook like every other data_process path here
    # (the previous '../data_process' pointed inside kinetic_params_evaluate).
    # NOTE(review): confirm the location of model_gin/Mole-BERT.pth.
    mole_bert_model.from_pretrained(f'{current_dir}/../../data_process/inferred_functions/mole_bert/model_gin/Mole-BERT.pth')
    for p in mole_bert_model.parameters():
        p.requires_grad = False

    input_smiles = df['Smiles']
    embeddings = []
    valid_smiles = []
    error_smiles = []
    mole_bert_model.eval()
    with torch.no_grad():
        for mol_smiles in tqdm(input_smiles):
            try:
                mol = Chem.MolFromSmiles(mol_smiles)
                graph = mol_to_graph_data_obj_simple(mol).to(device)
                # Single forward pass; element [1] of the model output is
                # averaged over its first axis to get one vector per molecule.
                embedding = mole_bert_model(graph)[1].cpu().numpy().mean(axis=0)
                embeddings.append(embedding)
                valid_smiles.append(mol_smiles)

            except Exception as e:
                # Report the failure instead of dropping the molecule silently:
                # a missing embedding becomes NaN after the later left merge.
                print(f"Failed to embed SMILES {mol_smiles!r}: {e}")
                error_smiles.append(mol_smiles)

    df_molebert = pd.DataFrame({'Smiles': valid_smiles, 'molebert': embeddings})
    os.makedirs(os.path.dirname(df_molebert_path), exist_ok=True)
    df_molebert.to_pickle(df_molebert_path)
df_molebert.head()

Unnamed: 0,Smiles,molebert
0,C1=CC(=CC=C1CC(C(=O)O)N)O,"[-0.2597763, -0.0019763308, -0.21788795, 0.107..."
1,C1=CC(=CC=C1CC(C(=O)O)N)O,"[-0.2597763, -0.0019763308, -0.21788795, 0.107..."
2,C1=CC(=CC=C1CC(C(=O)O)N)O,"[-0.2597763, -0.0019763308, -0.21788795, 0.107..."
3,C1=CC(=CC=C1CC(C(=O)O)N)O,"[-0.2597763, -0.0019763308, -0.21788795, 0.107..."
4,C1=CC(=CC=C1CC(C(=O)O)N)O,"[-0.2597763, -0.0019763308, -0.21788795, 0.107..."


In [4]:
from rdkit import Chem
from rdkit.Chem import Descriptors
import warnings
warnings.filterwarnings('ignore')
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

def calculate_properties(smiles):
    """Return ``(logP, molecular weight)`` for a SMILES string.

    Args:
        smiles: SMILES representation of the molecule.

    Returns:
        Tuple ``(logp, mw)`` computed with RDKit descriptors, or
        ``(nan, nan)`` when the SMILES cannot be parsed or a descriptor
        fails. The failure reason is printed.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ValueError(f"invalid SMILES: {smiles}")

        logp = Descriptors.MolLogP(mol)
        mw = Descriptors.MolWt(mol)
        return logp, mw
    except Exception as e:
        print(f"Failed: {e}")
        # Plain float NaN: numpy is only imported in a later cell, and the
        # np.NAN alias no longer exists in NumPy 2.x.
        missing = float("nan")
        return missing, missing

# Keep one embedding per distinct key so the left joins cannot multiply rows.
df_prott5_unique = df_prott5.drop_duplicates(subset='Sequence')
df_molebert_unique = df_molebert.drop_duplicates(subset='Smiles')

# Attach the protein embedding by sequence, then the substrate embedding by SMILES.
df_merge = (
    df.merge(df_prott5_unique, on='Sequence', how='left')
      .merge(df_molebert_unique, on='Smiles', how='left')
)

# Per-row RDKit descriptors, expanded into two columns.
substrate_properties = df_merge['Smiles'].apply(
    lambda smiles: pd.Series(calculate_properties(smiles))
)
df_merge[['logp', 'mw']] = substrate_properties
df_merge.head()

Unnamed: 0,Number,Organism,Substrate,pH,Tempetature,Smiles,Sequence,Unnamed: 7,Unnamed: 8,Unnamed: 9,Experimental kcat/Km,Temperature,Type,prott5,molebert,logp,mw
0,IsTAL,Ilyonectria sp. MPI-CAGE-AT-0026,L-tyrosine,9.5,40.0,C1=CC(=CC=C1CC(C(=O)O)N)O,MGKATGHLKYDVHQQWPTPHVNKALESWARATELVKTGSVIIDGES...,,,,210.0,,,"[0.05902582, -0.011174141, 0.03016772, -0.0091...","[-0.2597763, -0.0019763308, -0.21788795, 0.107...",0.3466,181.191
1,RgTAL,Rhodotorula glutinis,L-tyrosine,9.5,40.0,C1=CC(=CC=C1CC(C(=O)O)N)O,MAPRPTSQSQARTCPTTQVTQVDIVEKMLAAPTDSTLELDGYSLNL...,,,,327.2,,,"[0.05909447, -0.016460072, 0.023931371, -0.016...","[-0.2597763, -0.0019763308, -0.21788795, 0.107...",0.3466,181.191
2,PcTAL,Puccinia coronata f. sp. avenae,L-tyrosine,9.5,40.0,C1=CC(=CC=C1CC(C(=O)O)N)O,MTQQRQVASGSPHTVLAQQLISPLLLNHTNSKNPVTVVTIDGHSLS...,,,,391.6,,,"[0.034160577, -0.037989005, 0.027064191, 0.021...","[-0.2597763, -0.0019763308, -0.21788795, 0.107...",0.3466,181.191
3,SsTAL,Sporidiobolus salmonicolor,L-tyrosine,9.5,40.0,C1=CC(=CC=C1CC(C(=O)O)N)O,MVIRCNSLLRGHSAIRLSVLETLIKLINLNITPVVPLRGSISASGD...,,,,346.3,,,"[0.03857084, -0.011963796, 0.017409109, -0.009...","[-0.2597763, -0.0019763308, -0.21788795, 0.107...",0.3466,181.191
4,HiTAL,Heterobasidion irregulare,L-tyrosine,9.5,40.0,C1=CC(=CC=C1CC(C(=O)O)N)O,MPESWVRGAILVRANSLIRGHSGVRWELVEKMIELLRSNITPLVPL...,,,,361.9,,,"[0.04258501, 0.023547765, 0.05098012, 0.012392...","[-0.2597763, -0.0019763308, -0.21788795, 0.107...",0.3466,181.191


In [5]:
import lightgbmmt as lgb
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from scipy.stats import pearsonr
import random
import numpy as np
import math, json
from copy import deepcopy

# init seed: the same value is fed to every random number generator in use
random_state = 66
for seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    seed_fn(random_state)


def return_mtgbm_x_y(df_data, tasks):
    """Assemble the feature matrix and target matrix for the training data.

    Feature layout: protein embedding, substrate embedding, then the scalar
    columns ``ph``, ``t``, ``mw`` and ``logp`` (in that order). The embedding
    columns are taken from the module-level ``protein_column`` and
    ``substrate_column`` names.

    Args:
        df_data: DataFrame holding the embedding, scalar and target columns.
        tasks: List of target column names.

    Returns:
        Tuple ``(x, y)`` of numpy arrays.
    """
    y = np.array(df_data[tasks].values)

    # Each scalar feature becomes an (n_rows, 1) column.
    auxiliary_data = [
        df_data[column_name].values.reshape(-1, 1)
        for column_name in ('ph', 't', 'mw', 'logp')
    ]

    protein_data = np.array(df_data[protein_column].tolist())
    substrate_data = np.array(df_data[substrate_column].tolist())

    x = np.hstack([protein_data, substrate_data, *auxiliary_data])
    return x, y


def return_mtgbm_x(df_data):
    """Assemble the feature matrix for the mutant evaluation table.

    Feature layout matches ``return_mtgbm_x_y``: protein embedding, substrate
    embedding, then pH, temperature, molecular weight and logP. The embedding
    columns are taken from the module-level ``protein_column`` and
    ``substrate_column`` names.

    Args:
        df_data: DataFrame with ``pH``, ``Tempetature`` (the spelling used in
            the workbook), ``mw``, ``logp`` and the two embedding columns. An
            optional, correctly spelled ``Temperature`` column is used to fill
            rows whose ``Tempetature`` is missing.

    Returns:
        2-D numpy array with one row per row of ``df_data``.
    """
    auxiliary_data = []
    ph = df_data['pH'].values.reshape(-1, 1)

    # The workbook sheets do not agree on the spelling of the temperature
    # column, so after concatenation a row carries its value under only one
    # of the two names. Coalesce them so no row loses its temperature.
    temperature = df_data['Tempetature']
    if 'Temperature' in df_data.columns:
        temperature = temperature.fillna(df_data['Temperature'])
    t = temperature.values.reshape(-1, 1)
    auxiliary_data.append(ph)
    auxiliary_data.append(t)

    mw = df_data['mw'].values.reshape(-1, 1)
    logp = df_data['logp'].values.reshape(-1, 1)
    auxiliary_data.append(mw)
    auxiliary_data.append(logp)

    protein_data = np.array(df_data[protein_column].tolist())
    substrate_data = np.array(df_data[substrate_column].tolist())

    x = np.hstack([protein_data, substrate_data] + auxiliary_data)
    return x


def self_kcatkm_rmse(preds, train_data):
    """Evaluation metric: RMSE of the last task only, ignoring filled labels.

    Labels and predictions arrive flat and are viewed as ``(num_tasks, -1)``;
    the row at index ``num_tasks - 1`` is the one scored. Entries whose label
    equals ``fill_nan_value`` are excluded.

    Args:
        preds: Flat predictions for all tasks.
        train_data: Dataset object exposing ``get_label()``.

    Returns:
        Tuple ``('rmse_kcatkm', value, False)``.
    """
    scored_task = num_tasks - 1

    label_rows = torch.tensor(train_data.get_label(), device=device).view(num_tasks, -1)
    pred_rows = torch.tensor(preds, device=device).view(num_tasks, -1)

    # extract kcatkm values
    task_labels = label_rows[scored_task]
    task_preds = pred_rows[scored_task]

    # keep only the entries that carry a real label
    observed = task_labels != fill_nan_value
    squared_error = (task_labels[observed] - task_preds[observed]) ** 2
    kcatkm_rmse = torch.sqrt(torch.mean(squared_error))

    return 'rmse_kcatkm', kcatkm_rmse.item(), False

def return_scores(y_true, y_pred):
    """Score one task on the entries that have a real label.

    Entries where ``y_true`` equals ``fill_nan_value`` are dropped from both
    arrays before scoring.

    Args:
        y_true: 1-D array of labels.
        y_pred: 1-D array of predictions aligned with ``y_true``.

    Returns:
        Tuple ``(rmse, mae, r2, pcc)``.
    """
    observed = y_true != fill_nan_value
    true_values, predicted_values = y_true[observed], y_pred[observed]

    squared_error = mean_squared_error(true_values, predicted_values)
    return (
        np.sqrt(squared_error),
        mean_absolute_error(true_values, predicted_values),
        r2_score(true_values, predicted_values),
        pearsonr(true_values, predicted_values)[0],
    )


def print_scores(task_scores_dict):
    """Print the fold-averaged validation and test metrics of every task.

    Args:
        task_scores_dict: Mapping ``task -> {'val' | 'test' -> {metric -> list}}``
            holding one value per fold; each list is averaged for display.
    """
    for task_name in task_names:
        print(f"{task_name}\t RMSE\t MAE\t R2\t PCC\t")

        scores_for_task = task_scores_dict[task_name]
        val_scores, test_scores = scores_for_task['val'], scores_for_task['test']

        for row_label, split_scores in (("Val  ", val_scores), ("Test ", test_scores)):
            cells = [f"{np.mean(split_scores[metric_name]):.4f}\t" for metric_name in score_names]
            print(row_label + " ".join(cells))
        print()


def cal_grad(preds, train_data, ep=0):
    """Custom objective: squared-error style gradients with missing labels masked.

    Labels and predictions arrive flat and are viewed as ``(num_tasks, -1)``
    and transposed, giving one row per sample and one column per task.

    Args:
        preds: Flat predictions for all tasks.
        train_data: Dataset object exposing ``get_label()``.
        ep: Unused in this function.

    Returns:
        Four numpy arrays: the per-sample gradient averaged over tasks, a
        matching all-ones Hessian, the per-task gradients flattened task by
        task, and a matching all-ones Hessian. NOTE(review): the four-array
        shape is presumably what lightgbmmt expects from ``fobj`` -- confirm
        against the library.
    """
    labels = torch.tensor(train_data.get_label(), device=device)
    preds = torch.tensor(preds, device=device)
    labels = labels.view(num_tasks, -1).T
    preds = preds.view(num_tasks, -1).T

    # mask: entries whose label equals fill_nan_value keep a zero gradient
    valid_mask = labels != fill_nan_value
    grad = torch.zeros_like(preds)
    grad[valid_mask] = preds[valid_mask] - labels[valid_mask]

    # mean over the task axis (masked entries contribute zeros to the average)
    grad_final = grad.mean(dim=1)

    # Hessian: constant ones for both the averaged and the per-task gradients
    grad_flattened = grad.T.flatten()
    hess = torch.ones_like(grad_final)
    hess2 = torch.ones_like(grad_flattened)

    return grad_final.cpu().numpy(), hess.cpu().numpy(), grad_flattened.cpu().numpy(), hess2.cpu().numpy()


def train_mtgbm(params):
    """Train one multi-task model per cross-validation fold and report scores.

    Reads the module-level ``kf``, ``train_val_x``, ``train_val_y``,
    ``test_x``, ``test_y``, ``task_names``, ``score_names``, ``num_tasks`` and
    ``num_threads``.

    For every fold the model is trained twice: first with early stopping to
    find the iteration with the lowest validation ``rmse_kcatkm``, then again
    for exactly that many iterations. The second model is scored on the
    fold's validation split and on the held-out test set.

    Args:
        params: Hyper-parameter dict; must contain ``num_iterations``. It is
            deep-copied, so the caller's dict is left unchanged.

    Returns:
        List with the final model of every fold, in fold order.
    """
    temp_params = deepcopy(params)
    temp_params.update({"verbosity": -1, "objective": "custom", "num_labels": num_tasks, "tree_learner": 'serial2',
                        "num_threads": num_threads})
    # num_iterations is passed to lgb.train positionally, not through params
    num_iterations = temp_params.pop("num_iterations")
    # task -> split -> metric -> list with one entry per fold
    task_scores_dict = {task_name: {'val': {name: [] for name in score_names},
                                    'test': {name: [] for name in score_names}} for task_name in task_names}

    model_list = []
    for fold_idx, (train_index, val_index) in enumerate(kf.split(train_val_x), start=1):
        print(f"Fold {fold_idx}")
        # split dataset
        train_x, val_x = train_val_x[train_index], train_val_x[val_index]
        train_y, val_y = train_val_y[train_index], train_val_y[val_index]

        train_data = lgb.Dataset(train_x, label=train_y)
        val_data = lgb.Dataset(val_x, label=val_y)

        # get the best epoch number
        evals_result_mt = {}
        lgb.train(temp_params, train_data, num_iterations, valid_sets=[val_data],
                  fobj=cal_grad, feval=self_kcatkm_rmse, verbose_eval=1000, evals_result=evals_result_mt,
                  callbacks=[lgb.early_stopping(stopping_rounds=500)])
        valid_records = evals_result_mt['valid_0']['rmse_kcatkm']
        # zero-based position of the lowest validation RMSE
        min_index = np.argmin(np.array(valid_records))
        print(f"valid_records min_index {min_index}")

        # train model for all scores of validation and test
        # (fresh Dataset objects are built for the second run; min_index + 1
        # converts the zero-based position into a number of iterations)
        train_data = lgb.Dataset(train_x, label=train_y)
        val_data = lgb.Dataset(val_x, label=val_y)
        evals_result_mt = {}
        model = lgb.train(temp_params, train_data, min_index + 1, valid_sets=[val_data],
                          fobj=cal_grad, feval=self_kcatkm_rmse, verbose_eval=1000,
                          evals_result=evals_result_mt)
        model.set_num_labels(num_tasks)

        # validation predict: column idx of the prediction belongs to task_names[idx]
        val_predicted = model.predict(val_x)
        val_scores = {task_name: return_scores(val_y[:, idx], val_predicted[:, idx]) for idx, task_name in
                      enumerate(task_names)}

        # test predict: the same held-out test set is scored by every fold's model
        test_predicted = model.predict(test_x)
        test_scores = {task_name: return_scores(test_y[:, idx], test_predicted[:, idx]) for idx, task_name in
                       enumerate(task_names)}

        # record
        model_list.append(model)
        for task_name in task_names:
            # return_scores yields its metrics in the same order as score_names
            for score_idx, score_name in enumerate(score_names):
                task_scores_dict[task_name]['val'][score_name].append(val_scores[task_name][score_idx])
                task_scores_dict[task_name]['test'][score_name].append(test_scores[task_name][score_idx])
        print(f"Val  {val_scores} \n Test {test_scores}\n")

    print_scores(task_scores_dict)
    return model_list


# config
dataset_path = f"{current_dir}/../../data_process/dataset/df_all_log_transformed.pkl"
num_threads = 32

# input
# score_names follows the order of the tuple returned by return_scores;
# task_names fixes the column order of the label and prediction matrices.
score_names = ['rmse', 'mae', 'r2', 'pcc']
task_names = ['logkm', 'logkcat', 'logkcatkm']
num_tasks = len(task_names)
df_input = pd.read_pickle(dataset_path)
# The model is trained on the negated logkm values.
df_input['logkm'] = -df_input['logkm']
# Sentinel for missing values; the loss and the metrics skip labels equal to it.
# Note that fillna is applied to every column of the frame, not only the labels.
fill_nan_value = -100
df_input = df_input.fillna(fill_nan_value)

# split dataset
df_train_val, df_test = train_test_split(df_input, test_size=0.2, random_state=random_state)
kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
# Embedding columns read by return_mtgbm_x_y / return_mtgbm_x
protein_column,  substrate_column = 'prott5', 'molebert'
train_val_x, train_val_y = return_mtgbm_x_y(df_train_val, task_names)
test_x, test_y = return_mtgbm_x_y(df_test, task_names)
# evaluate_x = return_mtgbm_x(df_merge)

# Load the stored hyper-parameters for the three-task model and train the fold models
input_model = 'mtgbm_km_kcat_kcatkm'
with open(f'{current_dir}/../../kcatkm_mtgbm_ablation/{input_model}_params.json', 'r') as json_file:
    best_params = json.load(json_file)
print('best_params:', best_params)
print('using -km kcat kcatkm resample')
model_list = train_mtgbm(best_params)

best_params: {'bagging_fraction': 0.729611058732434, 'feature_fraction': 0.6643005188332146, 'lambda_l1': 0.346846951564011, 'lambda_l2': 0.7149783548509333, 'learning_rate': 0.07838547411322133, 'max_bin': 95, 'max_depth': 9, 'min_data_in_leaf': 21, 'num_iterations': 3273, 'num_leaves': 2350}
using -km kcat kcatkm resample
Fold 1
Training until validation scores don't improve for 500 rounds
[1000]	valid_0's rmse_kcatkm: 2.681
[2000]	valid_0's rmse_kcatkm: 2.66536
Early stopping, best iteration is:
[2011]	valid_0's rmse_kcatkm: 2.66506
valid_records min_index 2010
[1000]	valid_0's rmse_kcatkm: 2.681
[2000]	valid_0's rmse_kcatkm: 2.66536
inner_predict 22338
inner_predict 27921
Val  {'logkm': (1.751801400303909, 1.2363345265824102, 0.6506586527402988, 0.8085104578640728), 'logkcat': (2.1498353062197597, 1.4751936901448115, 0.6199608228130944, 0.7876183586110916), 'logkcatkm': (2.6650555104066562, 1.8946390688802637, 0.6013117442462484, 0.7756966336682282)} 
 Test {'logkm': (1.73289317082

In [6]:
evaluate_x = return_mtgbm_x(df_merge)

# One prediction matrix per fold model; despite the variable name, each matrix
# holds a column for every task, not only kcat/Km.
pred_kcatkm_list = []
for model in model_list:
    pred_kcatkm = pred = model.predict(evaluate_x)
    print(pred_kcatkm)
    pred_kcatkm_list.append(pred_kcatkm)

pred_output_path = f'{current_dir}/results/pred_kcatkm_list_last.npy'
np.save(pred_output_path, np.array(pred_kcatkm_list))

inner_predict 78
[[ 0.26707737  1.34501968  1.51268554]
 [ 1.16408875  4.50023143  5.28352133]
 [ 0.53473617  0.23947572  1.8718349 ]
 [ 1.13157976  2.00272758  2.61797233]
 [-0.55929666  0.91583955  2.96966198]
 [ 0.22559716  1.87680357  3.75374097]
 [-0.36923708  1.02490494  2.72986858]
 [ 1.49919542  2.81149304  4.91646758]
 [ 1.00433031  2.522002    4.18751263]
 [ 1.0784432   1.78905623  4.18678622]
 [ 0.75780053  1.853918    4.07093934]
 [ 0.83283318  2.00688296  4.3237367 ]
 [ 1.09561366  2.39864123  4.054011  ]
 [ 0.74487573  2.34404083  4.32411771]
 [ 1.39485509  2.62668315  4.39982704]
 [ 0.94185089  2.85690119  4.5879899 ]
 [ 0.78627222  1.81024779  4.27542179]
 [ 4.04970379 -3.68890387  0.15079985]
 [ 4.01768386 -3.57317956  0.18915652]
 [ 4.06404227 -3.4316937   0.17381515]
 [ 4.00737926 -3.44072105  0.16452818]
 [ 3.76594082 -3.50839849  0.35002903]
 [ 4.31331584 -3.52802654  0.41531508]
 [ 4.09539842 -3.53073141  0.44576024]
 [ 4.00273466 -3.92482846  0.50324509]
 [ 4.069

In [7]:
# Stack the per-model predictions, take task column 2 (the third entry of
# task_names, 'logkcatkm') and average over the models.
stacked_predictions = np.array(pred_kcatkm_list)
stacked_predictions[..., 2].mean(axis=0)


array([1.82206912, 5.33213418, 1.61632907, 2.21195108, 2.39480214,
       2.69547943, 2.00366451, 4.98298754, 4.3567504 , 4.45469538,
       4.24896751, 4.36345386, 4.4792894 , 4.57155171, 4.70263842,
       4.59621425, 4.17744985, 0.61171985, 0.53583279, 0.56880364,
       0.62000203, 0.61144053, 0.66916788, 0.63722456, 0.73409817,
       0.59162375])

In [8]:
# Model-averaged prediction of task column 2 for rows 0-6, shown next to their labels.
stacked_predictions = np.array(pred_kcatkm_list)
print(stacked_predictions[..., 2].mean(axis=0)[:7])
df_merge['Number'][:7]

[1.82206912 5.33213418 1.61632907 2.21195108 2.39480214 2.69547943
 2.00366451]


0    IsTAL
1    RgTAL
2    PcTAL
3    SsTAL
4    HiTAL
5    LeTAL
6    AaTAL
Name: Number, dtype: object

In [9]:
# Model-averaged prediction of task column 2 for rows 7-16, shown next to their labels.
stacked_predictions = np.array(pred_kcatkm_list)
print(stacked_predictions[..., 2].mean(axis=0)[7:17])
df_merge['Number'][7:17]

[4.98298754 4.3567504  4.45469538 4.24896751 4.36345386 4.4792894
 4.57155171 4.70263842 4.59621425 4.17744985]


7     Wildtype
8      MT-603P
9      MT-366H
10     MT-366W
11     MT-587V
12      MT-10Y
13     MT-337C
14     MT-668S
15     MT-489T
16     MT-337D
Name: Number, dtype: object

In [10]:
# Model-averaged prediction of task column 2 from row 17 onwards, shown next to their labels.
stacked_predictions = np.array(pred_kcatkm_list)
print(stacked_predictions[..., 2].mean(axis=0)[17:])
df_merge['Number'][17:]

[0.61171985 0.53583279 0.56880364 0.62000203 0.61144053 0.66916788
 0.63722456 0.73409817 0.59162375]


17    trCsMSX
18         M1
19         M2
20         M3
21         M4
22         M5
23         M6
24         M7
25         M8
Name: Number, dtype: object