# Prepare dataset (protein only)

> In this module, we prepare protein sequence embeddings with target values

In [None]:
#| hide
import sys
sys.path.append("/notebooks/katlas")
from nbdev.showdoc import *
%matplotlib inline
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
from fastbook import *
from katlas.core import Data
from katlas.feature import *
from katlas.train import *
from katlas.plot import *

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import *
from sklearn.svm import *
from sklearn.ensemble import *
from sklearn.model_selection import train_test_split, StratifiedGroupKFold
from sklearn.metrics import mean_squared_error
from scipy.stats import spearmanr,pearsonr

import xgboost as xgb
import joblib

# import matplotlib.pyplot as plt

from pathlib import Path
import math

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [None]:
import pandas as pd
import numpy as np
from katlas.core import *
from katlas.plot import *
from katlas.train import *

In [None]:
def get_pca_df(df, # feature dataframe with kinase ID as first column
               startswith, # prefix put in front of every reduced feature column
               n_components): # forwarded to `reduce_dim` as `n_components`
    "Run `reduce_dim` on `df` and prefix every column except the first with `startswith` + '_'."
    reduced = reduce_dim(df, n_components=n_components)
    names = list(reduced.columns)
    # Position 0 keeps its original name; only the columns after it are prefixed.
    for pos in range(1, len(names)):
        names[pos] = startswith + '_' + names[pos]
    reduced.columns = names
    return reduced

In [None]:
def combine(target, kinase, aa):
    "Merge `target` with `kinase`, then with `aa` (pandas default merge), and return the result with a fresh 0..n-1 index."
    target_kinase = target.merge(kinase)
    merged = target_kinase.merge(aa)
    return merged.reset_index(drop=True)

## Target

### Scaled

In [None]:
# Load the scaled target table; the pivot in the next cell reads its
# 'kinase', 'substrate' and 'target' columns.
scaled = Data.get_unstack_q85_up40()

In [None]:
# Reshape to one row per kinase and one column per substrate, then turn the
# 'kinase' index back into a regular first column.
scaled = (
    scaled
    .pivot(index='kinase', columns='substrate', values='target')
    .reset_index()
)

In [None]:
# List the substrate columns, i.e. everything after the leading 'kinase' column.
scaled.columns[1:]

Index(['-1A', '-1C', '-1D', '-1E', '-1F', '-1G', '-1H', '-1I', '-1K', '-1L',
       ...
       '4P', '4Q', '4R', '4S', '4T', '4V', '4W', '4Y', '4t', '4y'],
      dtype='object', name='substrate', length=198)

### Standardized

In [None]:
# Load the standardized target table; the pivot in the next cell reads its
# 'kinase', 'substrate' and 'target' columns.
standard = Data.get_unstack_standard()

In [None]:
# Reshape to one row per kinase and one column per substrate, then turn the
# 'kinase' index back into a regular first column.
standard = (
    standard
    .pivot(index='kinase', columns='substrate', values='target')
    .reset_index()
)

### Raw

In [None]:
# Load the raw (untransformed) target table; the pivot in the next cell reads
# its 'kinase', 'substrate' and 'target' columns.
raw = Data.get_unstack_raw()

In [None]:
# Reshape to one row per kinase and one column per substrate, then turn the
# 'kinase' index back into a regular first column.
raw = (
    raw
    .pivot(index='kinase', columns='substrate', values='target')
    .reset_index()
)

## Check the target correlations

In [None]:
# Per-kinase Pearson correlation between the raw and standardized targets.
# Indexing both frames on 'kinase' pairs rows by kinase ID instead of by row
# position and keeps the string ID column out of the computation
# (pandas >= 2.0 no longer drops non-numeric columns by default in corrwith).
# Several rows of the result can print as the same value because they differ
# only in the last floating-point digits.
raw.set_index('kinase').corrwith(standard.set_index('kinase'), axis=1).value_counts()

1.0    62
1.0    49
1.0    34
1.0    24
1.0    20
1.0    20
1.0    18
1.0    18
1.0    17
1.0    12
1.0     9
1.0     7
1.0     4
1.0     3
1.0     1
1.0     1
1.0     1
1.0     1
1.0     1
1.0     1
dtype: int64

In [None]:
# Per-kinase Pearson correlation between the raw and scaled targets.
# Indexing both frames on 'kinase' pairs rows by kinase ID instead of by row
# position and keeps the string ID column out of the computation
# (pandas >= 2.0 no longer drops non-numeric columns by default in corrwith).
raw.set_index('kinase').corrwith(scaled.set_index('kinase'), axis=1).value_counts()

1.000000    69
1.000000    43
1.000000    41
1.000000    21
1.000000    20
1.000000    16
1.000000    15
1.000000    15
1.000000    14
1.000000    11
1.000000    10
1.000000     8
1.000000     5
0.985367     1
0.904324     1
0.996279     1
1.000000     1
1.000000     1
0.998510     1
0.996573     1
0.954989     1
0.968876     1
0.999949     1
0.998813     1
0.997229     1
0.995039     1
1.000000     1
0.987548     1
dtype: int64

Because we capped the maximum value in scaled at 40, some of the Pearson correlations are not exactly 1.0.

If we check Spearman, which depends on the ranking of the values, we can see that the correlation of each transformed target with raw is 1.0 across every kinase.

In [None]:
# Per-kinase Spearman (rank) correlation between the raw and standardized targets.
# Indexing both frames on 'kinase' pairs rows by kinase ID instead of by row
# position and keeps the string ID column out of the computation.
raw.set_index('kinase').corrwith(standard.set_index('kinase'), axis=1, method='spearman').value_counts()

1.0    303
dtype: int64

In [None]:
# Per-kinase Spearman (rank) correlation between the raw and scaled targets.
# Indexing both frames on 'kinase' pairs rows by kinase ID instead of by row
# position and keeps the string ID column out of the computation.
raw.set_index('kinase').corrwith(scaled.set_index('kinase'), axis=1, method='spearman').value_counts()

1.0    303
dtype: int64

## Kinase

### ESM2

In [None]:
# Load the ESM2 kinase feature table.
# NOTE(review): get_pca_df expects the kinase ID in the first column — confirm this loader returns that layout.
esm = Data.get_esm_full()

ESM2 - PCA64

In [None]:
# Reduce the ESM2 features to 64 components; every column after the first gets the 'esm_' prefix.
esm_pca = get_pca_df(esm, 'esm', 64)

### T5

In [None]:
# Load the T5 kinase feature table.
# NOTE(review): get_pca_df expects the kinase ID in the first column — confirm this loader returns that layout.
t5 = Data.get_t5_full()

T5 - PCA64

In [None]:
# Reduce the T5 features to 64 components; every column after the first gets the 't5_' prefix.
t5_pca = get_pca_df(t5, 't5', 64)

## Combine

In [None]:
# Target tables to export, keyed by the name used in the output file names.
# `raw` is not included; in this notebook it is only used for the correlation checks.
target = {'scaled': scaled, 'standard':standard}

In [None]:
# Kinase feature sets to pair with each target: the full ESM2 / T5 tables and
# their 64-component reductions.
kinase_all = {'esm':esm,'t5':t5, 'esmPCA':esm_pca, 't5PCA': t5_pca}

In [None]:
# One merged frame per (target, kinase feature set) pair, stored under
# '<target>_<features>'.
dfs = {}
for i, t in target.items():
    for j, k in kinase_all.items():
        # merge() with no arguments joins on the columns the two frames share
        df = t.merge(k)
        df = df.reset_index(drop=True)
        dfs[f'{i}_{j}'] = df

In [None]:
# Show the names of the combined datasets ('<target>_<features>').
dfs.keys()

dict_keys(['scaled_esm', 'scaled_t5', 'scaled_esmPCA', 'scaled_t5PCA', 'standard_esm', 'standard_t5', 'standard_esmPCA', 'standard_t5PCA'])

In [None]:
# Write each combined dataset to train/<key>.parquet.
# The output folder is created first so the export does not fail on a fresh
# checkout where 'train/' does not exist yet (`Path` comes from the import cell).
out_dir = Path('train')
out_dir.mkdir(parents=True, exist_ok=True)
for key, df in dfs.items():
    df.to_parquet(out_dir / f'{key}.parquet')