# Extract KD features

In [1]:
import pandas as pd
from katlas.core import *
from katlas.feature import *
from katlas.plot import *

In [2]:
df = Data.get_kd_uniprot()

In [3]:
# Keep only the rows whose `active_D1_D2` flag is truthy, then renumber the rows from 0
is_active = df['active_D1_D2'].astype(bool)
df = df.loc[is_active].reset_index(drop=True)

In [4]:
df.columns

Index(['kd_ID', 'Uniprot', 'Entry Name', 'Protein names', 'Gene Names',
       'Gene Names (primary)', 'Organism', 'kd_note', 'kd_evidence',
       'kd_start', 'kd_end', 'kd_seq', 'Domain [FT]', 'Domain [CC]', 'Region',
       'Motif', 'Protein families', 'Reactome', 'ComplexPortal',
       'Subcellular location [CC]', 'Gene Ontology (biological process)',
       'Tissue specificity', 'Interacts with', 'Subunit structure',
       'Function [CC]', 'Activity regulation', 'full_seq', 'D1', 'D2', 'D3',
       'N1', 'active_D1_D2'],
      dtype='object')

In [5]:
df.shape

(4209, 32)

## Onehot of sequence alignment

In [7]:
align = pd.read_parquet('raw/uniprot_kd_align.parquet')

In [8]:
align.columns = align.columns.astype(int)

In [9]:
# Require 'D' at alignment columns 1525 and 1724
# (presumably the D1/D2 residues behind `active_D1_D2` — TODO confirm)
active = align[[1525, 1724]].eq('D').all(axis=1)
align = align.loc[active]

In [10]:
align.shape

(4209, 3434)

In [11]:
# Count each symbol per alignment column (rows: symbols, columns: positions);
# symbols that never occur in a column get a count of 0
counts_df = align.apply(pd.Series.value_counts).fillna(0)
# Normalise every column by its total so the values become frequencies
freq_df = counts_df / counts_df.sum()

In [12]:
# Exclude the gap symbol '-' so only real residues compete for the per-position
# maximum. Dropping by label (rather than skipping the first row by position)
# stays correct even when '-' is not the first entry of the index.
max_series = freq_df.drop(index='-', errors='ignore')

In [13]:
# For every alignment position: the most frequent symbol and its frequency,
# ordered from the highest to the lowest frequency
freq_max = (
    pd.DataFrame({'aa': max_series.idxmax(), 'max_value': max_series.max()})
    .sort_values('max_value', ascending=False)
    .reset_index(names='position')
)

In [14]:
freq_max

Unnamed: 0,position,aa,max_value
0,1724,D,1.000000
1,1525,D,1.000000
2,1549,N,0.988121
3,1730,G,0.959135
4,2618,D,0.948919
...,...,...,...
3429,337,A,0.000000
3430,338,A,0.000000
3431,339,A,0.000000
3432,1899,A,0.000000


In [15]:
# Keep positions whose most frequent symbol exceeds a 5% frequency, in ascending position order
above_threshold = freq_max['max_value'] > 0.05
onehot_col = freq_max.loc[above_threshold, 'position'].sort_values().tolist()

In [16]:
len(onehot_col)

358

In [17]:
align = align[onehot_col]

In [18]:
from sklearn.preprocessing import OneHotEncoder

def get_onehot(df):
    """One-hot encode every column of `df` independently.

    The encoder is re-fitted on each column, so the categories of a column are
    the values observed in that column. Output columns are named
    "{column}_{category}"; they follow the order of the input columns, and
    within a column the order of `encoder.categories_`.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns hold categorical values. It is not modified.

    Returns
    -------
    pd.DataFrame
        Integer indicator frame sharing the index of `df`. If `df` has no
        columns, an empty frame carrying only the index is returned.
    """
    encoder = OneHotEncoder(sparse_output=False, dtype=int, handle_unknown='ignore')

    # Collect one encoded frame per column and concatenate once at the end:
    # calling pd.concat inside the loop re-copies the accumulated result on
    # every iteration, which is quadratic in the number of columns.
    encoded_parts = []
    for col in df.columns:
        encoded = encoder.fit_transform(df[[col]])  # the encoder needs 2-D input
        new_col_names = [f"{col}_{aa}" for aa in encoder.categories_[0]]
        encoded_parts.append(
            pd.DataFrame(encoded, index=df.index, columns=new_col_names)
        )

    if not encoded_parts:
        # pd.concat rejects an empty list; keep the original empty-input result
        return pd.DataFrame(index=df.index)
    return pd.concat(encoded_parts, axis=1)

In [19]:
onehot=get_onehot(align)

In [20]:
onehot.head()

Unnamed: 0,65_-,65_A,65_C,65_D,65_E,65_F,65_G,65_H,65_I,65_K,...,3192_M,3192_N,3192_P,3192_Q,3192_R,3192_S,3192_T,3192_V,3192_W,3192_Y
A0A075F7E9_LERK1_ORYSI_KD1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A0A078CGE6_M3KE1_BRANA_KD1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A0A0K3AV08_MLK1_CAEEL_KD1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A0A0P0VIP0_LRSK7_ORYSJ_KD1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A0A0P0XII1_CERK1_ORYSJ_KD1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
onehot.shape

(4209, 6849)

The one-hot matrix is indexed by `kd_ID` and covers only the active kinase domains.

In [22]:
onehot.to_parquet('raw/onehot_kd.parquet')

## Onehot + PCA

Since one-hot encoding yields 6849 columns, we use PCA to reduce the number of features.

We chose 1000 components because the T5 and ESM2 embeddings have a comparable number of features (1024 and 1280, respectively).

In [78]:
onehot_pca = reduce_feature(onehot,n=1000)

In [81]:
onehot_pca.to_parquet('raw/onehot_pca_kd.parquet')

## T5 embeddings

In [20]:
feat_t5 = get_t5(df,'kd_seq')

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


model.safetensors:   0%|          | 0.00/2.42G [00:00<?, ?B/s]

  0%|          | 0/4209 [00:00<?, ?it/s]

In [23]:
# Label the embedding rows with the kinase-domain IDs.
# NOTE(review): relies on get_t5 returning one row per row of `df`, in the same order — confirm
feat_t5.index = df['kd_ID']

In [25]:
feat_t5.head()

  has_large_values = (abs_vals > 1e6).any()
  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0_level_0,T5_0,T5_1,T5_2,T5_3,T5_4,T5_5,T5_6,T5_7,T5_8,T5_9,...,T5_1014,T5_1015,T5_1016,T5_1017,T5_1018,T5_1019,T5_1020,T5_1021,T5_1022,T5_1023
kd_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A0A075F7E9_LERK1_ORYSI_KD1,0.014122,0.068848,0.016113,-0.001555,-0.001301,0.021393,0.030304,-0.062408,0.028427,-0.017532,...,-0.029297,0.0149,-0.006233,-0.069824,0.044067,-0.009636,-0.007458,0.02124,0.005264,-0.034637
A0A078CGE6_M3KE1_BRANA_KD1,0.054565,0.093811,-0.00742,0.01889,-0.042236,-0.005402,-0.004921,-0.046844,-0.013443,-0.019501,...,-0.016037,0.000758,-0.038391,-0.041321,0.06958,-0.022415,-0.026306,-0.001375,0.006599,-0.034485
A0A0K3AV08_MLK1_CAEEL_KD1,0.01487,-0.008934,0.000822,0.001986,-0.035767,0.02388,-0.012856,-0.056274,-0.004978,-0.001923,...,-0.032349,-0.057129,-0.005463,-0.10791,0.020889,-0.009666,-0.022614,-0.019882,0.040466,-0.021912
A0A0P0VIP0_LRSK7_ORYSJ_KD1,-0.008522,0.062134,-0.014053,-0.027054,-0.025665,0.014648,0.015022,-0.045959,0.023544,-0.036133,...,0.037994,-0.022903,0.046082,-0.082703,0.00094,-0.009811,0.008896,0.014931,0.028061,-0.037994
A0A0P0XII1_CERK1_ORYSJ_KD1,0.029053,0.07605,0.014656,0.015732,-0.00605,0.019882,-0.003336,-0.041016,0.060211,-0.014572,...,0.01474,0.019989,0.01123,-0.040222,0.044403,-0.001749,-0.013168,0.018524,-0.01828,-0.034912


In [27]:
# feat_t5.to_parquet('raw/t5_kd.parquet')

## ESM2 embeddings

In [None]:
get_esm

In [7]:
feat_esm = get_esm(df,'kd_seq')

repr_layers number for model esm2_t33_650M_UR50D is 33.
You can also choose other esm2 models: 
esm2_t48_15B_UR50D
esm2_t36_3B_UR50D
esm2_t33_650M_UR50D
esm2_t30_150M_UR50D
esm2_t12_35M_UR50D
esm2_t6_8M_UR50D



  0%|          | 0/4209 [00:00<?, ?it/s]

  if data.storage().size() > 0:


In [8]:
# Label the embedding rows with the kinase-domain IDs.
# NOTE(review): relies on get_esm returning one row per row of `df`, in the same order — confirm
feat_esm.index = df['kd_ID']

In [9]:
feat_esm.to_parquet('raw/esm_kd.parquet')

In [10]:
feat_esm.head()

  has_large_values = (abs_vals > 1e6).any()
  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0_level_0,esm_0,esm_1,esm_2,esm_3,esm_4,esm_5,esm_6,esm_7,esm_8,esm_9,...,esm_1270,esm_1271,esm_1272,esm_1273,esm_1274,esm_1275,esm_1276,esm_1277,esm_1278,esm_1279
kd_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A0A075F7E9_LERK1_ORYSI_KD1,-0.014076,-0.047668,-0.030716,-0.045868,-0.062622,-0.116272,-0.007904,0.056915,0.01059,0.020737,...,0.020691,-0.012749,-0.077209,0.087402,0.007301,-0.010765,0.094604,-0.053802,0.049164,0.014648
A0A078CGE6_M3KE1_BRANA_KD1,0.012405,-0.039764,-0.004433,-0.019089,-0.064331,-0.048035,0.120789,-0.026199,-0.058777,0.143188,...,0.08252,-0.046143,-0.080872,0.077515,0.004078,-0.046143,0.029099,-0.214355,0.070923,0.146851
A0A0K3AV08_MLK1_CAEEL_KD1,0.04718,-0.048553,0.022888,0.006191,-0.028015,-0.094177,0.104492,-0.006901,0.016891,0.066223,...,0.023575,-0.010071,-0.090149,0.075684,0.022385,-0.044006,0.071838,-0.078369,0.01371,0.071838
A0A0P0VIP0_LRSK7_ORYSJ_KD1,-0.086121,-0.039001,-0.012978,-0.087524,-0.117981,-0.0466,-0.037628,0.059875,0.037323,0.087708,...,-0.061615,-0.025787,-0.085449,0.133667,0.016953,-0.106079,0.025635,-0.045868,0.08075,0.021927
A0A0P0XII1_CERK1_ORYSJ_KD1,0.004047,-0.067383,-0.022156,-0.011215,-0.11084,-0.077637,0.012314,0.012299,-0.011688,0.047485,...,0.056519,-0.02327,-0.094849,0.101746,-0.033417,-0.050415,0.054749,-0.071594,0.046631,0.021667


In [11]:
feat_esm.shape

(4209, 1280)