# Trainer

> This module implements training routines for different models.

## Setup

In [None]:

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from scipy.stats import spearmanr
from sklearn.model_selection import train_test_split

from kdock.dataset import Data
from kdock.feature import *

## xgb

In [None]:

# Default booster settings, kept at module level so that the function signature
# does not carry a mutable default argument.
_DEFAULT_XGB_PARAMS = {
    'max_depth': 7,  # from 4 to 7
    'learning_rate': 0.001,  # from 0.001
    'subsample': 0.8,
    'colsample_bytree': 0.2,  # from 0.2 to 1, because need to take position
    'eval_metric': 'rmse',
    'objective': 'reg:squarederror',
    # NOTE(review): these two entries request XGBoost's GPU code path; newer
    # XGBoost releases reportedly replace them with `device='cuda'` -- confirm
    # against the installed version before running on a CPU-only machine.
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
    'random_state': 123,
}


def _importance_frame(model, importance_type):
    """Return ``model.get_score(importance_type=...)`` as a two-column DataFrame."""
    scores = model.get_score(importance_type=importance_type)
    return pd.DataFrame({
        'feature': list(scores.keys()),
        f'{importance_type}_importance': list(scores.values()),
    })


def _plot_top_importance(frame, column, top_n=15):
    """Show a horizontal bar chart of the ``top_n`` highest values in ``column``."""
    if frame.empty:
        # Nothing was scored, so there is nothing to draw.
        return
    # Ascending sort + tail keeps the largest scores, with the largest bar
    # drawn at the top of the chart.
    top = frame.set_index('feature').sort_values(by=column).tail(top_n)
    top.plot.barh(figsize=(10, 20))
    plt.show()
    plt.close()


def xgb_trainer(df,
                feature_col,
                target_col,
                test_index=None,
                xgb_params=None,
               ):
    """Train an XGBoost regressor on ``df`` and report hold-out performance.

    The function fits a booster with early stopping, prints the Spearman
    correlation between the true and predicted hold-out targets, shows a
    scatter plot of true versus predicted values, and shows bar charts of the
    15 highest gain and weight feature importances.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing both the feature columns and the target column(s).
    feature_col :
        Column label(s) selected from ``df`` as model inputs.
    target_col :
        Column label(s) selected from ``df`` as the regression target.
    test_index : optional
        Index labels of ``df`` to hold out as the validation/test set. When
        ``None``, a random 80/20 split with ``random_state=123`` is used.
    xgb_params : dict, optional
        Parameters forwarded to ``xgb.train``. When ``None``, a copy of
        ``_DEFAULT_XGB_PARAMS`` is used.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(gain, weight)``: one row per scored feature, with columns
        ``feature`` plus ``gain_importance`` / ``weight_importance``.
    """
    if xgb_params is None:
        xgb_params = dict(_DEFAULT_XGB_PARAMS)

    X = df[feature_col]
    y = df[target_col]

    print(f'xgb params is: {xgb_params}')

    if test_index is None:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=123)
    else:
        # Build the membership mask once and reuse it for features and target.
        held_out = X.index.isin(test_index)
        X_train, y_train = X.loc[~held_out], y.loc[~held_out]
        X_test, y_test = X.loc[test_index], y.loc[test_index]

    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
    print(y_test.index)

    # Prepare the matrices for xgb.
    dtrain = xgb.DMatrix(X_train, y_train)
    dtest = xgb.DMatrix(X_test, y_test)

    # The hold-out set doubles as the early-stopping validation set.
    model = xgb.train(xgb_params,
                      dtrain=dtrain,
                      evals=[(dtrain, 'train'), (dtest, 'valid')],
                      num_boost_round=9999,
                      early_stopping_rounds=100,
                      verbose_eval=100)

    pred = model.predict(dtest)
    spearman_corr, _ = spearmanr(y_test, pred)
    print(f'Spearman correlation: {spearman_corr:.2f}')

    fig, ax = plt.subplots()
    ax.scatter(y_test, pred)
    ax.set_xlabel('True values')
    ax.set_ylabel('Predicted values')
    ax.set_title('Scatter plot of true versus predicted values')
    plt.show()
    plt.close()

    gain = _importance_frame(model, 'gain')
    _plot_top_importance(gain, 'gain_importance')

    weight = _importance_frame(model, 'weight')
    _plot_top_importance(weight, 'weight_importance')

    return gain, weight

Example

In [None]:
# Load the example dataset through the project's Data helper.
# NOTE(review): later cells use the 'SMILES', 'ID', 'IC50' and 'group' columns
# of this frame.
df = Data.get_g12d()

In [None]:
# Display the loaded DataFrame as the cell output.
df

Prepare training df

In [None]:
# Build the feature table from the 'SMILES' column, keyed by 'ID'.
# NOTE(review): the two positional True flags are options of get_rdkit_df that
# are not visible here -- confirm their meaning against kdock.feature.
smi_feat = get_rdkit_df(df,'SMILES','ID',True,True)

In [None]:
# Display the feature table as the cell output.
smi_feat

In [None]:
# Join the features onto the dataset; with no explicit key, pandas performs an
# inner join on the columns the two frames have in common.
df = pd.merge(df, smi_feat)

In [None]:
# Keep only rows with a measured IC50 below 2000, then renumber the index.
df = (
    df.dropna(subset=['IC50'])
      .query('IC50<2000')
      .reset_index(drop=True)
)

In [None]:
# Display the (rows, columns) of the filtered frame.
df.shape

In [None]:
# Use every column of the feature table except the first as model inputs.
# NOTE(review): the first column is presumably the 'ID' key -- confirm.
FEATURES = smi_feat.columns[1:]

In [None]:
# Histogram of the raw IC50 values; the trailing ';' suppresses the text repr.
df['IC50'].hist();

In [None]:
# Histogram of the negative natural log of IC50, computed in one vectorised
# call instead of an element-wise lambda.
(-np.log(df['IC50'])).hist();

In [None]:
# Store the negative natural log of IC50 as the regression target.
# NOTE(review): pIC50 is conventionally -log10 of the molar IC50; this uses the
# natural log of the raw value -- confirm that is intended.
df['pIC50'] = -np.log(df['IC50'])

In [None]:
# List the distinct values of the 'group' column.
df['group'].unique()

In [None]:
# Index labels of the rows whose 'group' equals the string "31678"; the same
# expression appears as a commented-out test_index in the training cells.
df.query('group == "31678"').index

## Train

In [None]:

import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
from sklearn.model_selection import train_test_split

In [None]:
# Display the IC50 values in ascending order.
df['IC50'].sort_values()

In [None]:
# Display the pIC50 values in ascending order.
df['pIC50'].sort_values()

In [None]:
# Train on FEATURES to predict pIC50. No test_index is passed, so the trainer
# uses its random 80/20 split; the returned (gain, weight) tuple is the cell
# output.
xgb_trainer(df=df,
            feature_col = FEATURES,
            target_col=['pIC50'],
           # test_index=df.query('group == "31678"').index
           )

In [None]:
# Train on FEATURES to predict pIC50 using the trainer's random 80/20 split.
# NOTE(review): this call is identical to the one in the preceding cell --
# presumably a re-run; confirm whether both are needed.
xgb_trainer(df=df,
            feature_col = FEATURES,
            target_col=['pIC50'],
           # test_index=df.query('group == "31678"').index
           )

In [None]:
# xgb_trainer(df=df,
#             feature_col = FEATURES,
#             target_col=['pIC50'],
#            # test_index=df.query('group == "31678"').index
#            )

In [None]:
# xgb_trainer(df=df,
#             feature_col = FEATURES,
#             target_col=['pIC50'],
#            test_index=df.query('group == "31678"').index
#            )

In [None]:
# xgb_trainer(df=df,
#             feature_col = FEATURES,
#             target_col=['IC50_log'],
#            test_index=df.query('group == "646"').index
#            )

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# train_size = int(0.8 * train.shape[0])
# train_data = train.iloc[:train_size, :]
# test_data = train.iloc[train_size:, :]

# dtrain = xgb.DMatrix(train_data.drop('IC50', axis=1), label=train_data['IC50'])
# dvalid = xgb.DMatrix(test_data.drop('IC50', axis=1), label=test_data['IC50'])

# xgb_params = { 
#     'max_depth':7, #from 4 to 7
#     'learning_rate':0.001, #from 0.001
#     'subsample':0.8,
#     'colsample_bytree':1, # from 0.2 to 1, because need to take position
    
#     'eval_metric':'rmse',
#     'objective':'reg:squarederror',
#     'tree_method':'gpu_hist',
#     'predictor':'gpu_predictor',
#     'random_state':123
# }

# model = xgb.train(xgb_params, 
#             dtrain=dtrain,
#             evals=[(dtrain,'train'),(dvalid,'valid')],
#             num_boost_round=9999,
#             early_stopping_rounds=100,
#             verbose_eval=100,)

# test_data['pred'] = model.predict(dvalid)

# spearman_corr, _ = spearmanr(test_data['IC50'], test_data['pred'])

# print(f'Spearman correlation: {spearman_corr:.2f}')

In [None]:
# fig, ax = plt.subplots()
# ax.scatter(test_data['IC50'], test_data['pred'])
# ax.set_xlabel('True values')
# ax.set_ylabel('Predicted values')
# ax.set_title('Scatter plot of true versus predicted values')
# plt.show()

In [None]:
# dd = model.get_score(importance_type='gain')
# gain = pd.DataFrame({'feature':dd.keys(),f'importance':dd.values()})


In [None]:
# gain.set_index('feature').sort_values(by = 'importance').plot.barh(figsize=(10,30));

Concatenate features, prepare training

In [None]:
# smi_feat

In [None]:
# # if the sample size is too small, can't use pca to reduce dimension
# from sklearn.decomposition import PCA

# pca = PCA(n_components=2,random_state = 123)

# seq_pca = pca.fit_transform(seq_feat.values)