# Prepare dataset (Protein+aa)
> In this notebook, we build the training datasets (target × kinase embedding × amino-acid features) and train baseline regressors on them

In [None]:
#| hide
import sys
sys.path.append("/notebooks/katlas")
from nbdev.showdoc import *
%matplotlib inline
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
from fastbook import *
from katlas.core import Data
from katlas.feature import *
from katlas.train import *
from katlas.plot import *

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import *
from sklearn.svm import *
from sklearn.ensemble import *
from sklearn.model_selection import train_test_split, StratifiedGroupKFold
from sklearn.metrics import mean_squared_error
from scipy.stats import spearmanr,pearsonr

import xgboost as xgb
import joblib

# import matplotlib.pyplot as plt

from pathlib import Path
import math

  # NOTE(review): stray `@numba.jit()` decorators with no function to decorate were removed from this cell;
  # they made it a syntax error, and `numba` is not imported in the visible cells.


In [None]:
def get_pca_df(df, # feature dataframe with kinase ID as first column
               startswith, # prefix added (followed by '_') to the reduced feature column names
               n_components): # forwarded to `reduce_dim` as `n_components`
    "Run `reduce_dim` on `df` and prefix every resulting column except the first with `startswith`."
    reduced = reduce_dim(df, n_components=n_components)
    names = list(reduced.columns)
    # The first column is left untouched (presumably the ID column — confirm against `reduce_dim`).
    reduced.columns = names[:1] + [startswith + '_' + name for name in names[1:]]
    return reduced

In [None]:
def combine(target, kinase, aa):
    "Merge `target` with `kinase`, then with `aa` (default pandas merge on shared columns), and renumber the rows."
    merged = target
    # Apply the two merges in order: kinase features first, amino-acid features second.
    for features in (kinase, aa):
        merged = merged.merge(features)
    return merged.reset_index(drop=True)

## Target

### Scaled

In [None]:
scaled = Data.get_unstack_q85_up40()

### Standardized

In [None]:
standard = Data.get_unstack_standard()

## Kinase

### ESM2

In [None]:
esm = Data.get_esm_full()

ESM2 - PCA64

In [None]:
esm_pca = get_pca_df(esm, 'esm', 64)

### T5

In [None]:
t5 = Data.get_t5_full()

T5 - PCA64

In [None]:
t5_pca = get_pca_df(t5, 't5', 64)

## Amino Acid

In [None]:
aa = Data.get_aa_feature()

aa - pca16

In [None]:
aa_pca = get_pca_df(aa, 'aa', 16)

One-hot encoded

In [None]:
aa_onehot = pd.get_dummies(aa.aa).set_index(aa.aa).reset_index()

## Combine

In [None]:
target = {'scaled': scaled, 'standard':standard}

In [None]:
kinase = {'esm':esm,'t5':t5}

kinase_pca = {'esmPCA':esm_pca, 't5PCA': t5_pca}

kinase_all = {'esm':esm,'t5':t5, 'esmPCA':esm_pca, 't5PCA': t5_pca}

### Full features

In [None]:
# One table per (target, full-size kinase embedding) pair, each joined with the raw aa features.
# Keys follow the pattern '<target>_<kinase>_aa'.
df_full = {
    i + '_' + j + '_aa': combine(t, k, aa)
    for i, t in target.items()
    for j, k in kinase.items()
}

In [None]:
df_full.keys()

dict_keys(['scaled_esm_aa', 'scaled_t5_aa', 'standard_esm_aa', 'standard_t5_aa'])

In [None]:
# Parquet is compact on disk and fast to read back.
# Create the output folder up front so the write does not fail on a missing 'train/' directory.
Path('train').mkdir(parents=True, exist_ok=True)
for key, df in df_full.items():
    df.to_parquet(f'train/{key}.parquet')

### PCA features

In [None]:
# One table per (target, PCA-reduced kinase embedding) pair, each joined with the PCA-reduced aa features.
# Keys follow the pattern '<target>_<kinase>_aaPCA'.
df_pca = {
    i + '_' + j + '_aaPCA': combine(t, k, aa_pca)
    for i, t in target.items()
    for j, k in kinase_pca.items()
}

In [None]:
df_pca.keys()

dict_keys(['scaled_esmPCA_aaPCA', 'scaled_t5PCA_aaPCA', 'standard_esmPCA_aaPCA', 'standard_t5PCA_aaPCA'])

In [None]:
# Parquet is compact on disk and fast to read back.
# Create the output folder up front so the write does not fail on a missing 'train/' directory.
Path('train').mkdir(parents=True, exist_ok=True)
for key, df in df_pca.items():
    df.to_parquet(f'train/{key}.parquet')

### With aa encoded as one-hot

In [None]:
# One table per (target, kinase embedding) pair — full-size and PCA-reduced — joined with one-hot aa columns.
# Keys follow the pattern '<target>_<kinase>_aa1hot'.
df_aa_onehot = {
    i + '_' + j + '_aa1hot': combine(t, k, aa_onehot)
    for i, t in target.items()
    for j, k in kinase_all.items()
}

In [None]:
df_aa_onehot.keys()

dict_keys(['scaled_esm_aa1hot', 'scaled_t5_aa1hot', 'scaled_esmPCA_aa1hot', 'scaled_t5PCA_aa1hot', 'standard_esm_aa1hot', 'standard_t5_aa1hot', 'standard_esmPCA_aa1hot', 'standard_t5PCA_aa1hot'])

In [None]:
# Create the output folder up front so the write does not fail on a missing 'train/' directory.
Path('train').mkdir(parents=True, exist_ok=True)
for key, df in df_aa_onehot.items():
    df.to_parquet(f'train/{key}.parquet')

## Train

In [None]:
df1 = pd.read_parquet('train/scaled_t5PCA_aaPCA.parquet')

In [None]:
df2 = pd.read_parquet('train/scaled_esmPCA_aaPCA.parquet')

In [None]:
# NOTE(review): the one-hot tables are written to 'train/' earlier in this notebook but read from 'aa1hot/' here — confirm the files were moved.
df1_1hot = pd.read_parquet('aa1hot/scaled_t5PCA_aa1hot.parquet')

In [None]:
# NOTE(review): the one-hot tables are written to 'train/' earlier in this notebook but read from 'aa1hot/' here — confirm the files were moved.
df2_1hot = pd.read_parquet('aa1hot/scaled_esmPCA_aa1hot.parquet')

In [None]:
dfs = {'t5':df1, 't5_1hot':df1_1hot, 'esm':df2, 'esm_1hot':df2_1hot}

Make the train/test splits

In [None]:
# Preview every fold: stratify on `info.category` and keep each kinase entirely in train or in test.
# NOTE(review): `info` is not defined in any visible top-level cell (only inside `train_df`) — confirm where it comes from.
sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=123)
fold_indices = sgkf.split(info.index.values, info.category, info.kinase)
for fold, (train_idx, test_idx) in enumerate(fold_indices):
    test_kinases = info.loc[test_idx].kinase.unique()
    train_kinases = info.loc[train_idx].kinase.unique()
    print(f'# kinase in test set: {test_kinases.shape[0]}')
    print(f'# kinase in train set: {train_kinases.shape[0]}')
    print(f'test set: {test_kinases}')

# kinase in test set: 61
# kinase in train set: 242
test set: ['AAK1' 'AKT3' 'ANKRD3' 'AURA' 'BMPR1A' 'BRSK2' 'CAMK1G' 'CAMKK1' 'CDK1' 'CDK4' 'CDK8' 'CDK16' 'CHAK1' 'CK1G1' 'CK2A1' 'CLK4' 'DCAMKL1' 'DNAPK' 'DYRK1B' 'ERK2' 'GCK' 'GSK3A' 'HGK' 'HPK1' 'HUNK'
 'IRAK4' 'KHS1' 'LRRK2' 'MAPKAPK2' 'MARK3' 'MEKK1' 'MEKK6' 'MLK4' 'MOS' 'MSK1' 'MYLK4' 'MYO3A' 'NEK7' 'NEK9' 'NUAK2' 'P70S6K' 'P90RSK' 'PASK' 'PHKG1' 'PINK1' 'PKACB' 'PKCH' 'PKCZ' 'PLK2' 'PLK3' 'QSK'
 'ROCK1' 'RSK3' 'SLK' 'SNRK' 'STLK3' 'TGFBR1' 'TLK2' 'TTK' 'WNK3' 'YSK1']
# kinase in test set: 61
# kinase in train set: 242
test set: ['ACVR2A' 'ALK2' 'AMPKA2' 'AURB' 'BMPR1B' 'BUB1' 'CAMK2A' 'CAMK2G' 'CDK2' 'CDK9' 'CDK10' 'CDK17' 'CHAK2' 'CHK2' 'CK1D' 'COT' 'DLK' 'DSTYK' 'DYRK2' 'ERK7' 'GRK1' 'GRK3' 'GRK7' 'HIPK4' 'IKKA' 'IRE1'
 'KHS2' 'LATS2' 'LKB1' 'MARK2' 'MEK2' 'MELK' 'MNK1' 'MSK2' 'MST1' 'MTOR' 'NEK3' 'NEK11' 'NIK' 'P38B' 'PAK3' 'PAK5' 'PDHK1' 'PDHK4' 'PIM3' 'PKCB' 'PKG1' 'PKR' 'PRKD1' 'PRKD2' 'PRP4' 'RIPK3' 'RSK4'
 'SKMLCK' 'SSTK

In [None]:
def train_df(df, model):
    """Fit `model` on the first stratified, kinase-grouped fold of `df` and return the predictions.

    Features are the 'position' column plus every column from the sixth one
    onward; the target is the 'target' column. The split is stratified on the
    kinase `category` and grouped by `kinase`, so no kinase appears in both the
    train and the test set.

    Parameters
    ----------
    df : training table with at least 'kinase', 'position' and 'target' columns.
    model : estimator handed to `train_ML`.

    Returns
    -------
    Whatever `train_ML` returns (presumably the test-set predictions — confirm
    against `katlas.train.train_ML`). It was previously computed and discarded.
    """
    feat_col = ['position'] + df.columns.tolist()[5:]
    target_col = ['target']

    # Attach kinase metadata row by row; `category` drives the stratification below.
    kinase = Data.get_kinase_info_full()
    info = df[['kinase']].merge(kinase, 'left')
    # Sanity check: the left merge must keep the rows aligned with `df`.
    print((info.kinase == df.kinase).value_counts())

    sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=123)
    # Only the first of the five folds is used for this benchmark.
    train_idx, test_idx = next(sgkf.split(info.index.values, info.category, info.kinase))
    print(f'# kinase in test set: {info.loc[test_idx].kinase.unique().shape[0]}')
    print(f'# kinase in train set: {info.loc[train_idx].kinase.unique().shape[0]}')
    print(f'test set: {info.loc[test_idx].kinase.unique()}')

    # `split` yields positional indices, so select rows by position.
    train_rows = df.iloc[train_idx]
    test_rows = df.iloc[test_idx]
    X_train, y_train = train_rows[feat_col], train_rows[target_col]
    X_test, y_test = test_rows[feat_col], test_rows[target_col]

    return train_ML(model, X_train, X_test, y_train, y_test)

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [None]:
# Baseline regressors to benchmark; each key is the label printed in the training log.
models = dict(
    KNN=KNeighborsRegressor(n_neighbors=2),
    LinearRegression=LinearRegression(),
    Ridge=Ridge(alpha=1.0),
    Lasso=Lasso(alpha=0.1),
    ElasticNet=ElasticNet(alpha=0.1),
    DecisionTreeRegressor=DecisionTreeRegressor(),
    XGBRegressor=XGBRegressor(max_depth=10),
    # RandomForestRegressor=RandomForestRegressor(),  # currently disabled
)

In [None]:
# Benchmark grid: every model on every dataset, models in the outer position.
runs = [(i, model, j, df) for i, model in models.items() for j, df in dfs.items()]
for i, model, j, df in runs:
    print(f'-----------------------model:{i}, data:{j}----------------------')
    train_df(df, model)

-----------------------model:KNN, data:t5----------------------
True    59994
Name: kinase, dtype: int64
# kinase in test set: 61
# kinase in train set: 242
test set: ['AAK1' 'AKT3' 'ANKRD3' 'AURA' 'BMPR1A' 'BRSK2' 'CAMK1G' 'CAMKK1' 'CDK1' 'CDK4' 'CDK8' 'CDK16' 'CHAK1' 'CK1G1' 'CK2A1' 'CLK4' 'DCAMKL1' 'DNAPK' 'DYRK1B' 'ERK2' 'GCK' 'GSK3A' 'HGK' 'HPK1' 'HUNK'
 'IRAK4' 'KHS1' 'LRRK2' 'MAPKAPK2' 'MARK3' 'MEKK1' 'MEKK6' 'MLK4' 'MOS' 'MSK1' 'MYLK4' 'MYO3A' 'NEK7' 'NEK9' 'NUAK2' 'P70S6K' 'P90RSK' 'PASK' 'PHKG1' 'PINK1' 'PKACB' 'PKCH' 'PKCZ' 'PLK2' 'PLK3' 'QSK'
 'ROCK1' 'RSK3' 'SLK' 'SNRK' 'STLK3' 'TGFBR1' 'TLK2' 'TTK' 'WNK3' 'YSK1']
KNeighborsRegressor(n_neighbors=2)
rmse is 0.7049
Spearman correlation coefficient: 0.6543
Pearson correlation coefficient: 0.7595 
-----------------------model:KNN, data:t5_1hot----------------------
True    59994
Name: kinase, dtype: int64
# kinase in test set: 61
# kinase in train set: 242
test set: ['AAK1' 'AKT3' 'ANKRD3' 'AURA' 'BMPR1A' 'BRSK2' 'CAMK1G' 'CAM



True    59004
Name: kinase, dtype: int64
# kinase in test set: 61
# kinase in train set: 237
test set: ['ACVR2B' 'AKT1' 'ALPHAK3' 'AMPKA1' 'AURA' 'BMPR1B' 'CAMK1B' 'CAMKK2' 'CAMLCK' 'CDK4' 'CDK6' 'CDK8' 'CDK13' 'CHK1' 'CK1D' 'CK1G2' 'DMPK1' 'DSTYK' 'DYRK1A' 'ERK5' 'GRK2' 'GRK4' 'GRK7' 'HIPK1' 'ICK'
 'IRAK1' 'KHS1' 'LOK' 'MAPKAPK5' 'MARK2' 'MARK4' 'MEKK2' 'MLK3' 'MNK1' 'MSK2' 'NDR1' 'NDR2' 'NEK2' 'NEK9' 'P70S6K' 'PAK2' 'PAK3' 'PBK' 'PKCA' 'PKCE' 'PKCZ' 'PKG1' 'PLK2' 'PLK3' 'RIPK2' 'RIPK3' 'RSK4'
 'SBK' 'SIK' 'SKMLCK' 'SRPK3' 'SSTK' 'TNIK' 'VRK2' 'YANK2' 'ZAK']
Lasso(alpha=0.1)
rmse is 1.0785
Spearman correlation coefficient: 0.2997
Pearson correlation coefficient: 0.2288 
-----------------------model:Lasso, data:esm_1hot----------------------
True    59004
Name: kinase, dtype: int64
# kinase in test set: 61
# kinase in train set: 237
test set: ['ACVR2B' 'AKT1' 'ALPHAK3' 'AMPKA1' 'AURA' 'BMPR1B' 'CAMK1B' 'CAMKK2' 'CAMLCK' 'CDK4' 'CDK6' 'CDK8' 'CDK13' 'CHK1' 'CK1D' 'CK1G2' 'DMPK1' 'DSTYK'



True    59994
Name: kinase, dtype: int64
# kinase in test set: 61
# kinase in train set: 242
test set: ['AAK1' 'AKT3' 'ANKRD3' 'AURA' 'BMPR1A' 'BRSK2' 'CAMK1G' 'CAMKK1' 'CDK1' 'CDK4' 'CDK8' 'CDK16' 'CHAK1' 'CK1G1' 'CK2A1' 'CLK4' 'DCAMKL1' 'DNAPK' 'DYRK1B' 'ERK2' 'GCK' 'GSK3A' 'HGK' 'HPK1' 'HUNK'
 'IRAK4' 'KHS1' 'LRRK2' 'MAPKAPK2' 'MARK3' 'MEKK1' 'MEKK6' 'MLK4' 'MOS' 'MSK1' 'MYLK4' 'MYO3A' 'NEK7' 'NEK9' 'NUAK2' 'P70S6K' 'P90RSK' 'PASK' 'PHKG1' 'PINK1' 'PKACB' 'PKCH' 'PKCZ' 'PLK2' 'PLK3' 'QSK'
 'ROCK1' 'RSK3' 'SLK' 'SNRK' 'STLK3' 'TGFBR1' 'TLK2' 'TTK' 'WNK3' 'YSK1']
ElasticNet(alpha=0.1)
rmse is 1.0141
Spearman correlation coefficient: 0.3554
Pearson correlation coefficient: 0.2556 
-----------------------model:ElasticNet, data:t5_1hot----------------------
True    59994
Name: kinase, dtype: int64
# kinase in test set: 61
# kinase in train set: 242
test set: ['AAK1' 'AKT3' 'ANKRD3' 'AURA' 'BMPR1A' 'BRSK2' 'CAMK1G' 'CAMKK1' 'CDK1' 'CDK4' 'CDK8' 'CDK16' 'CHAK1' 'CK1G1' 'CK2A1' 'CLK4' 'DCAM