# Prepare data for training

In [1]:
import os

import pandas as pd
from katlas.data import *
from katlas.pssm import *

## feature data

In [82]:
# Load the four candidate feature tables in one pass; each file is read with
# the default pd.read_parquet settings and bound to its own name:
#   t5         <- T5 data
#   esm        <- ESM data
#   onehot     <- one-hot data
#   onehot_pca <- PCA of the one-hot data
t5, esm, onehot, onehot_pca = map(
    pd.read_parquet,
    (
        'raw/t5_kd.parquet',
        'raw/esm_kd.parquet',
        'raw/onehot_kd.parquet',
        'raw/onehot_pca_kd.parquet',
    ),
)

In [83]:
# Keep the column Index of every feature table under its own name.
t5_col, esm_col, onehot_col, onehot_pca_col = (
    frame.columns for frame in (t5, esm, onehot, onehot_pca)
)

In [5]:
# Inspect the column labels of the T5 feature table.
t5.columns

Index(['T5_0', 'T5_1', 'T5_2', 'T5_3', 'T5_4', 'T5_5', 'T5_6', 'T5_7', 'T5_8',
       'T5_9',
       ...
       'T5_1014', 'T5_1015', 'T5_1016', 'T5_1017', 'T5_1018', 'T5_1019',
       'T5_1020', 'T5_1021', 'T5_1022', 'T5_1023'],
      dtype='object', length=1024)

In [6]:
# Inspect the column labels of the ESM feature table.
esm.columns

Index(['esm_0', 'esm_1', 'esm_2', 'esm_3', 'esm_4', 'esm_5', 'esm_6', 'esm_7',
       'esm_8', 'esm_9',
       ...
       'esm_1270', 'esm_1271', 'esm_1272', 'esm_1273', 'esm_1274', 'esm_1275',
       'esm_1276', 'esm_1277', 'esm_1278', 'esm_1279'],
      dtype='object', length=1280)

In [7]:
# Inspect the column labels of the one-hot feature table.
onehot.columns

Index(['65_-', '65_A', '65_C', '65_D', '65_E', '65_F', '65_G', '65_H', '65_I',
       '65_K',
       ...
       '3192_M', '3192_N', '3192_P', '3192_Q', '3192_R', '3192_S', '3192_T',
       '3192_V', '3192_W', '3192_Y'],
      dtype='object', length=6849)

In [84]:
# Inspect the stored column Index of the one-hot PCA feature table (same object as onehot_pca.columns).
onehot_pca_col

Index(['PCA1', 'PCA2', 'PCA3', 'PCA4', 'PCA5', 'PCA6', 'PCA7', 'PCA8', 'PCA9',
       'PCA10',
       ...
       'PCA991', 'PCA992', 'PCA993', 'PCA994', 'PCA995', 'PCA996', 'PCA997',
       'PCA998', 'PCA999', 'PCA1000'],
      dtype='object', length=1000)

## target data

### PSPA

In [26]:
# Load the PSPA target table from katlas. In the preview below it has one row per
# kinase and columns labelled <position><residue> (e.g. -5P, 4pY).
pspa = Data.get_pspa_all_scale()

In [27]:
# Remove the `_TYR` entries because of their overall low specificity and their overlap with the main kinases.
# NOTE(review): the mask drops every kinase whose name contains an underscore, not only
# names ending in `_TYR` — confirm that no other PSPA kinase name contains "_".
pspa = pspa[~pspa.index.str.contains('_')]

In [28]:
# Preview the first rows of the filtered PSPA table.
pspa.head()

Unnamed: 0_level_0,-5P,-5G,-5A,-5C,-5S,-5T,-5V,-5I,-5L,-5M,...,4H,4K,4R,4Q,4N,4D,4E,4pS,4pT,4pY
kinase,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAK1,0.05845,0.01989,0.02305,0.03702,0.0345,0.0345,0.0772,0.12615,0.08061,0.07014,...,0.04482,0.06651,0.07427,0.05082,0.04738,0.03113,0.03657,0.02009,0.02009,0.02161
ACVR2A,0.02971,0.03443,0.0418,0.035,0.04137,0.04137,0.04281,0.04474,0.04266,0.03729,...,0.04202,0.03865,0.03601,0.04517,0.04077,0.04693,0.04693,0.05155,0.05155,0.04319
ACVR2B,0.03779,0.03665,0.04013,0.05473,0.03779,0.03779,0.0385,0.03134,0.03339,0.03658,...,0.04056,0.03261,0.03514,0.04229,0.03846,0.05278,0.05039,0.05502,0.05502,0.04605
AKT1,0.04669,0.04599,0.04274,0.04684,0.03995,0.03995,0.03306,0.03368,0.03592,0.0391,...,0.05299,0.09151,0.08648,0.05874,0.05187,0.03541,0.02494,0.03141,0.03141,0.02102
AKT2,0.04617,0.04732,0.04931,0.04464,0.04095,0.04095,0.03321,0.03206,0.03781,0.03934,...,0.05199,0.08844,0.0758,0.04992,0.0477,0.02772,0.0268,0.04196,0.04196,0.03193


In [29]:
# Kinase annotation table from katlas, restricted to the rows whose `pseudo`
# column equals the string '0' (presumably the non-pseudokinases — confirm).
info = Data.get_kinase_info().loc[lambda table: table.pseudo == '0']

In [30]:
# (rows, columns) of the filtered annotation table.
info.shape

(462, 36)

In [31]:
# Lookup Series: kinase name (index) -> kd_ID (values); used to re-key the target tables.
info_map = info.set_index('kinase')['kd_ID']

In [32]:
# Re-key the PSPA rows from kinase name to kd_ID, the index used to join with the feature tables.
# NOTE(review): a name missing from info_map becomes NaN, and the cell is not idempotent —
# re-running it looks up the kd_IDs themselves in info_map.
pspa.index = pspa.index.to_series().map(info_map)

In [33]:
# Show the PSPA table after re-keying to kd_ID.
pspa

Unnamed: 0_level_0,-5P,-5G,-5A,-5C,-5S,-5T,-5V,-5I,-5L,-5M,...,4H,4K,4R,4Q,4N,4D,4E,4pS,4pT,4pY
kinase,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Q2M2I8_AAK1_HUMAN_KD1,0.05845,0.01989,0.02305,0.03702,0.03450,0.03450,0.07720,0.12615,0.08061,0.07014,...,0.04482,0.06651,0.07427,0.05082,0.04738,0.03113,0.03657,0.02009,0.02009,0.02161
P27037_AVR2A_HUMAN_KD1,0.02971,0.03443,0.04180,0.03500,0.04137,0.04137,0.04281,0.04474,0.04266,0.03729,...,0.04202,0.03865,0.03601,0.04517,0.04077,0.04693,0.04693,0.05155,0.05155,0.04319
Q13705_AVR2B_HUMAN_KD1,0.03779,0.03665,0.04013,0.05473,0.03779,0.03779,0.03850,0.03134,0.03339,0.03658,...,0.04056,0.03261,0.03514,0.04229,0.03846,0.05278,0.05039,0.05502,0.05502,0.04605
P31749_AKT1_HUMAN_KD1,0.04669,0.04599,0.04274,0.04684,0.03995,0.03995,0.03306,0.03368,0.03592,0.03910,...,0.05299,0.09151,0.08648,0.05874,0.05187,0.03541,0.02494,0.03141,0.03141,0.02102
P31751_AKT2_HUMAN_KD1,0.04617,0.04732,0.04931,0.04464,0.04095,0.04095,0.03321,0.03206,0.03781,0.03934,...,0.05199,0.08844,0.07580,0.04992,0.04770,0.02772,0.02680,0.04196,0.04196,0.03193
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P17948_VGFR1_HUMAN_KD1,0.04074,0.04365,0.03788,0.03921,0.03896,0.03534,0.03027,0.02957,0.02881,0.03179,...,0.05811,0.06189,0.06516,0.03921,0.04734,0.04891,0.03636,0.04827,0.04827,0.02709
P35968_VGFR2_HUMAN_KD1,0.04816,0.05105,0.04224,0.03927,0.04110,0.03996,0.03244,0.03191,0.03251,0.03616,...,0.04386,0.07036,0.07377,0.04718,0.03979,0.03074,0.02808,0.02492,0.02492,0.02426
P35916_VGFR3_HUMAN_KD1,0.03554,0.04129,0.03795,0.04300,0.03982,0.03663,0.03360,0.03881,0.03686,0.04122,...,0.05899,0.05158,0.06051,0.04823,0.03842,0.04042,0.03962,0.03548,0.03548,0.03986
P07947_YES_HUMAN_KD1,0.05216,0.04399,0.04137,0.04084,0.04060,0.03891,0.03352,0.02889,0.03082,0.03567,...,0.04928,0.04643,0.05988,0.05157,0.03947,0.03330,0.03891,0.02934,0.02934,0.03694


Reorder the flattened columns so that they are in the right order for reshaping later.

In [35]:
# Reorder the values of every row: each row goes through the katlas helpers recover_pssm
# (presumably rebuilds the 2-D PSSM — confirm) and flatten_pssm with column_wise=False.
# In the displayed tables the column order changes from -5P, -5G, -5A, ... (all residues
# of one position) to -5P, -4P, -3P, ... (all positions of one residue).
pspa = pspa.apply(lambda r: pd.Series(flatten_pssm(recover_pssm(r),column_wise=False)), axis=1)

In [36]:
pspa # the flattened PSSM is now laid out row-wise (all positions of one residue, then the next residue)

Unnamed: 0_level_0,-5P,-4P,-3P,-2P,-1P,0P,1P,2P,3P,4P,...,-5pY,-4pY,-3pY,-2pY,-1pY,0pY,1pY,2pY,3pY,4pY
kinase,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Q2M2I8_AAK1_HUMAN_KD1,0.05845,0.04172,0.08610,0.01794,0.09607,0.0,0.04265,0.04048,0.05264,0.05026,...,0.04960,0.02648,0.02176,0.03857,0.01505,0.0,0.00921,0.03149,0.02775,0.02161
P27037_AVR2A_HUMAN_KD1,0.02971,0.03341,0.03345,0.01462,0.02657,0.0,0.01055,0.04285,0.04958,0.05559,...,0.05440,0.04882,0.06497,0.03279,0.07109,0.0,0.08706,0.03799,0.06031,0.04319
Q13705_AVR2B_HUMAN_KD1,0.03779,0.03774,0.03044,0.01482,0.02331,0.0,0.01020,0.05871,0.05072,0.05205,...,0.05268,0.05987,0.09086,0.03918,0.07259,0.0,0.08007,0.04636,0.05130,0.04605
P31749_AKT1_HUMAN_KD1,0.04669,0.04161,0.02642,0.01512,0.04884,0.0,0.01689,0.03375,0.03828,0.07361,...,0.03314,0.03166,0.01560,0.01368,0.04229,0.0,0.02693,0.02975,0.02656,0.02102
P31751_AKT2_HUMAN_KD1,0.04617,0.04437,0.01778,0.01873,0.05226,0.0,0.03177,0.03922,0.04509,0.07404,...,0.03651,0.02930,0.01671,0.02063,0.03864,0.0,0.03887,0.04794,0.04434,0.03193
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P17948_VGFR1_HUMAN_KD1,0.04074,0.03414,0.02854,0.04313,0.01986,0.0,0.01625,0.02121,0.04472,0.06253,...,0.06599,0.10046,0.11411,0.09176,0.09614,1.0,0.10087,0.10220,0.03855,0.02709
P35968_VGFR2_HUMAN_KD1,0.04816,0.05296,0.04610,0.04524,0.03253,0.0,0.02688,0.02602,0.03470,0.05973,...,0.06115,0.08767,0.09090,0.08310,0.11178,1.0,0.08839,0.07561,0.01786,0.02426
P35916_VGFR3_HUMAN_KD1,0.03554,0.03875,0.03354,0.04111,0.02239,0.0,0.02272,0.01798,0.02467,0.05166,...,0.04596,0.05470,0.05802,0.05403,0.12136,1.0,0.09997,0.11333,0.02731,0.03986
P07947_YES_HUMAN_KD1,0.05216,0.04870,0.04363,0.05690,0.05838,0.0,0.01899,0.03019,0.02228,0.03449,...,0.05147,0.05494,0.05090,0.05237,0.03627,1.0,0.09052,0.05174,0.02739,0.03694


### CDDM

In [40]:
# Load the CDDM target table from katlas.
cddm=Data.get_cddm()

In [41]:
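# Alternative (disabled): read the CDDM PSSMs from a local parquet file instead of Data.get_cddm().
# cddm = pd.read_parquet('out/CDDM_pssms.parquet')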

In [42]:
# Preview the first rows of the CDDM table (rows are still keyed by kinase name here).
cddm.head()

Unnamed: 0,-20P,-20G,-20A,-20C,-20S,-20T,-20V,-20I,-20L,-20M,...,20H,20K,20R,20Q,20N,20D,20E,20pS,20pT,20pY
SRC,0.054538,0.08138,0.060077,0.012356,0.036216,0.032382,0.055816,0.052407,0.083511,0.023434,...,0.013351,0.076992,0.06097,0.037383,0.036938,0.05296,0.086337,0.025367,0.015576,0.023142
EPHA3,0.044276,0.088013,0.065335,0.008639,0.037797,0.036717,0.072354,0.048596,0.075594,0.026998,...,0.014132,0.083098,0.059356,0.031091,0.033917,0.056529,0.100622,0.025438,0.013567,0.015828
FES,0.047231,0.082519,0.070575,0.011401,0.034745,0.039088,0.061889,0.053203,0.088491,0.026059,...,0.013053,0.086266,0.055619,0.040863,0.038025,0.059024,0.085698,0.026674,0.013621,0.019296
NTRK3,0.044444,0.074644,0.074074,0.017094,0.033048,0.035328,0.060969,0.05812,0.08433,0.026781,...,0.015682,0.091677,0.052473,0.03076,0.044029,0.057298,0.088661,0.021713,0.015682,0.018094
ALK,0.045748,0.079765,0.073314,0.018182,0.032845,0.035191,0.067449,0.051026,0.076246,0.027566,...,0.015634,0.090198,0.069152,0.030066,0.043897,0.051112,0.096212,0.026458,0.013229,0.017438


In [43]:
# Sanity check: list the CDDM kinases whose name has no kd_ID in info_map
# (the saved output below contains no rows).
cddm[cddm.index.to_series().map(info_map).isna()]

Unnamed: 0,-20P,-20G,-20A,-20C,-20S,-20T,-20V,-20I,-20L,-20M,...,20H,20K,20R,20Q,20N,20D,20E,20pS,20pT,20pY


In [44]:
# Re-key the CDDM rows from kinase name to kd_ID; names missing from info_map become NaN.
# NOTE(review): not idempotent — re-running looks up the kd_IDs themselves in info_map.
cddm.index = cddm.index.to_series().map(info_map)

In [45]:
# Drop the rows whose index is NaN, i.e. kinases that had no kd_ID in info_map.
cddm = cddm[cddm.index.notna()]

In [50]:
# (rows, columns) of the CDDM table after re-keying and dropping unmapped rows.
cddm.shape

(328, 943)

In [51]:
# Show the CDDM table after re-keying to kd_ID.
cddm

Unnamed: 0,-20P,-20G,-20A,-20C,-20S,-20T,-20V,-20I,-20L,-20M,...,20H,20K,20R,20Q,20N,20D,20E,20pS,20pT,20pY
P12931_SRC_HUMAN_KD1,0.054538,0.081380,0.060077,0.012356,0.036216,0.032382,0.055816,0.052407,0.083511,0.023434,...,0.013351,0.076992,0.060970,0.037383,0.036938,0.052960,0.086337,0.025367,0.015576,0.023142
P29320_EPHA3_HUMAN_KD1,0.044276,0.088013,0.065335,0.008639,0.037797,0.036717,0.072354,0.048596,0.075594,0.026998,...,0.014132,0.083098,0.059356,0.031091,0.033917,0.056529,0.100622,0.025438,0.013567,0.015828
P07332_FES_HUMAN_KD1,0.047231,0.082519,0.070575,0.011401,0.034745,0.039088,0.061889,0.053203,0.088491,0.026059,...,0.013053,0.086266,0.055619,0.040863,0.038025,0.059024,0.085698,0.026674,0.013621,0.019296
Q16288_NTRK3_HUMAN_KD1,0.044444,0.074644,0.074074,0.017094,0.033048,0.035328,0.060969,0.058120,0.084330,0.026781,...,0.015682,0.091677,0.052473,0.030760,0.044029,0.057298,0.088661,0.021713,0.015682,0.018094
Q9UM73_ALK_HUMAN_KD1,0.045748,0.079765,0.073314,0.018182,0.032845,0.035191,0.067449,0.051026,0.076246,0.027566,...,0.015634,0.090198,0.069152,0.030066,0.043897,0.051112,0.096212,0.026458,0.013229,0.017438
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q15746_MYLK_HUMAN_KD1,0.078947,0.052632,0.078947,0.026316,0.052632,0.026316,0.052632,0.052632,0.052632,0.000000,...,0.000000,0.066667,0.022222,0.044444,0.133333,0.044444,0.000000,0.044444,0.022222,0.000000
Q01973_ROR1_HUMAN_KD1,0.097561,0.097561,0.024390,0.000000,0.024390,0.146341,0.170732,0.024390,0.048780,0.000000,...,0.000000,0.075000,0.050000,0.000000,0.050000,0.025000,0.125000,0.125000,0.000000,0.025000
O14976_GAK_HUMAN_KD1,0.075000,0.075000,0.175000,0.000000,0.025000,0.025000,0.100000,0.025000,0.050000,0.175000,...,0.000000,0.024390,0.024390,0.000000,0.170732,0.195122,0.024390,0.048780,0.024390,0.000000
Q6P0Q8_MAST2_HUMAN_KD1,0.046512,0.116279,0.069767,0.000000,0.023256,0.069767,0.069767,0.000000,0.046512,0.046512,...,0.000000,0.048780,0.219512,0.000000,0.048780,0.024390,0.024390,0.024390,0.000000,0.000000


In [52]:
# Re-check (rows, columns) right before the columns are reordered.
cddm.shape

(328, 943)

In [53]:
# Row-first flatten: each row goes through the katlas helpers recover_pssm (presumably
# rebuilds the 2-D PSSM — confirm) and flatten_pssm with column_wise=False. In the displayed
# tables the column order changes from -20P, -20G, -20A, ... to -20P, -19P, -18P, ...
cddm = cddm.apply(lambda r: pd.Series(flatten_pssm(recover_pssm(r),column_wise=False)), axis=1)

In [54]:
# Show the CDDM table with the reordered (row-first) columns.
cddm

Unnamed: 0,-20P,-19P,-18P,-17P,-16P,-15P,-14P,-13P,-12P,-11P,...,11pY,12pY,13pY,14pY,15pY,16pY,17pY,18pY,19pY,20pY
P12931_SRC_HUMAN_KD1,0.054538,0.048428,0.054968,0.050526,0.045837,0.046599,0.060746,0.060644,0.056785,0.055857,...,0.016818,0.020346,0.015231,0.010512,0.014912,0.014952,0.010568,0.010615,0.018198,0.023142
P29320_EPHA3_HUMAN_KD1,0.044276,0.047875,0.047696,0.043803,0.046425,0.052716,0.053135,0.055231,0.040762,0.047015,...,0.015795,0.019704,0.012672,0.010538,0.012236,0.011161,0.010626,0.006738,0.016356,0.015828
P07332_FES_HUMAN_KD1,0.047231,0.045455,0.049569,0.049919,0.040729,0.047568,0.054400,0.056503,0.051623,0.043039,...,0.014786,0.018712,0.011621,0.018942,0.012828,0.012856,0.011211,0.007878,0.014698,0.019296
Q16288_NTRK3_HUMAN_KD1,0.044444,0.049404,0.046433,0.049689,0.041150,0.045557,0.049972,0.048603,0.045682,0.043382,...,0.013945,0.019848,0.011137,0.015348,0.016588,0.012470,0.014863,0.005970,0.016837,0.018094
Q9UM73_ALK_HUMAN_KD1,0.045748,0.044418,0.053976,0.046821,0.047921,0.051208,0.055587,0.056128,0.043950,0.045610,...,0.018508,0.016860,0.011703,0.012382,0.011820,0.010664,0.010702,0.005974,0.014397,0.017438
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q15746_MYLK_HUMAN_KD1,0.078947,0.100000,0.047619,0.093023,0.069767,0.046512,0.136364,0.044444,0.044444,0.086957,...,0.021277,0.000000,0.000000,0.021739,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Q01973_ROR1_HUMAN_KD1,0.097561,0.024390,0.024390,0.219512,0.024390,0.000000,0.073171,0.071429,0.047619,0.190476,...,0.000000,0.000000,0.000000,0.023810,0.000000,0.000000,0.000000,0.000000,0.047619,0.025000
O14976_GAK_HUMAN_KD1,0.075000,0.200000,0.000000,0.073171,0.048780,0.024390,0.073171,0.073171,0.024390,0.097561,...,0.000000,0.000000,0.048780,0.024390,0.000000,0.000000,0.000000,0.024390,0.000000,0.000000
Q6P0Q8_MAST2_HUMAN_KD1,0.046512,0.023256,0.023256,0.000000,0.139535,0.046512,0.069767,0.093023,0.000000,0.162791,...,0.000000,0.000000,0.000000,0.024390,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


## Kinases not in the active KD feature set

In [65]:
# CDDM rows whose kd_ID is not in the T5 feature index; the index join used to build
# the training tables cannot pair these rows with T5 features.
# NOTE(review): only t5 is checked — presumably all feature tables share one index; confirm.
cddm[~cddm.index.isin(t5.index)]

Unnamed: 0,-20P,-19P,-18P,-17P,-16P,-15P,-14P,-13P,-12P,-11P,...,11pY,12pY,13pY,14pY,15pY,16pY,17pY,18pY,19pY,20pY
O00418_EF2K_HUMAN_KD1,0.058201,0.015873,0.026316,0.062827,0.088083,0.036269,0.025773,0.040816,0.035714,0.030303,...,0.010471,0.010582,0.005319,0.010753,0.005376,0.005376,0.010753,0.038043,0.043478,0.022099
Q8TF76_HASP_HUMAN_KD1,0.045455,0.036364,0.036364,0.118182,0.117117,0.097345,0.034483,0.02521,0.066667,0.082645,...,0.0,0.008403,0.0,0.0,0.0,0.034483,0.0,0.00885,0.0,0.00885
O43683_BUB1_HUMAN_KD1,0.13,0.06,0.039604,0.089109,0.079208,0.049505,0.049505,0.019802,0.089109,0.148515,...,0.0,0.020619,0.010309,0.0,0.010417,0.0,0.0,0.0,0.0,0.043956
Q15118_PDK1_HUMAN_KD1,0.020619,0.030928,0.010309,0.030928,0.072165,0.051546,0.072165,0.051546,0.061856,0.092784,...,0.0,0.0,0.021978,0.021978,0.021978,0.010989,0.010989,0.065934,0.0,0.043956
Q15120_PDK3_HUMAN_KD1,0.044776,0.014925,0.059701,0.074627,0.029851,0.044776,0.104478,0.059701,0.059701,0.029851,...,0.0,0.015152,0.015625,0.0,0.0,0.016129,0.0,0.0,0.04918,0.016949
Q9UPE1_SRPK3_HUMAN_KD1,0.140351,0.137931,0.051724,0.034483,0.051724,0.034483,0.12069,0.033898,0.083333,0.033333,...,0.017857,0.0,0.018182,0.0,0.0,0.018182,0.072727,0.0,0.0,0.0
Q96QT4_TRPM7_HUMAN_KD1,0.056604,0.056604,0.037736,0.037736,0.018868,0.056604,0.037736,0.018868,0.075472,0.037736,...,0.0,0.0,0.0,0.0,0.0,0.018868,0.0,0.0,0.0,0.0
P78362_SRPK2_HUMAN_KD1,0.040816,0.12,0.1,0.06,0.04,0.06,0.06,0.117647,0.096154,0.057692,...,0.019231,0.0,0.038462,0.0,0.0,0.0,0.0,0.0,0.038462,0.0


In [70]:
# PSPA rows whose kd_ID is not in the T5 feature index; the index join used to build
# the training tables cannot pair these rows with T5 features.
# NOTE(review): only t5 is checked — presumably all feature tables share one index; confirm.
pspa[~pspa.index.isin(t5.index)]

Unnamed: 0_level_0,-5P,-4P,-3P,-2P,-1P,0P,1P,2P,3P,4P,...,-5pY,-4pY,-3pY,-2pY,-1pY,0pY,1pY,2pY,3pY,4pY
kinase,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Q96QP1_ALPK1_HUMAN_KD1,0.03973,0.04168,0.03652,0.06123,0.02496,0.0,0.01772,0.04255,0.0316,0.04294,...,0.075,0.0755,0.05888,0.01946,0.27908,0.0,0.07937,0.05559,0.07007,0.11014
O14874_BCKD_HUMAN_KD1,0.03595,0.04598,0.0428,0.03844,0.02034,0.0,0.01931,0.02783,0.03869,0.04788,...,0.0393,0.04978,0.05735,0.04417,0.02739,0.0,0.02393,0.03109,0.02469,0.06467
O43683_BUB1_HUMAN_KD1,0.07426,0.07109,0.18419,0.03555,0.0664,0.0,0.01817,0.10522,0.18797,0.04496,...,0.04824,0.01959,0.01247,0.01383,0.01573,0.0,0.01245,0.01418,0.03246,0.02169
Q96QT4_TRPM7_HUMAN_KD1,0.04609,0.04097,0.0466,0.05748,0.04163,0.0,0.00657,0.06653,0.02913,0.03127,...,0.04517,0.0386,0.04748,0.03035,0.0361,0.0,0.03936,0.02287,0.03167,0.02498
Q9BX84_TRPM6_HUMAN_KD1,0.03886,0.05789,0.06523,0.07878,0.04588,0.0,0.01715,0.10699,0.0451,0.05681,...,0.03397,0.04355,0.04162,0.0331,0.03471,0.0,0.05262,0.03272,0.03617,0.03985
O00418_EF2K_HUMAN_KD1,0.04257,0.03937,0.04509,0.06556,0.04301,0.0,0.01077,0.01711,0.01597,0.04253,...,0.06092,0.03335,0.05386,0.08511,0.04491,0.0,0.07859,0.01808,0.04185,0.03273
Q8IXL6_FA20C_HUMAN_KD1,0.03618,0.04512,0.04155,0.04672,0.01342,0.0,0.01157,0.00896,0.01029,0.02706,...,0.05085,0.04692,0.06779,0.08276,0.04771,0.0,0.03191,0.05062,0.08779,0.09533
Q8TF76_HASP_HUMAN_KD1,0.05828,0.04374,0.04727,0.07855,0.02508,0.0,0.04105,0.02664,0.03435,0.03703,...,0.04459,0.03296,0.04591,0.04146,0.02958,0.0,0.01633,0.01709,0.02363,0.02728
Q15118_PDK1_HUMAN_KD1,0.0313,0.04429,0.04092,0.03627,0.03858,0.0,0.02843,0.03993,0.04791,0.047,...,0.04567,0.05228,0.0397,0.0404,0.03094,0.0,0.03391,0.04205,0.0301,0.03992
Q16654_PDK4_HUMAN_KD1,0.03068,0.03645,0.04143,0.04867,0.03913,0.0,0.03689,0.03562,0.03821,0.04236,...,0.04521,0.07603,0.05581,0.04229,0.03139,0.0,0.04003,0.05163,0.0407,0.0423


## target + feature

In [55]:
def combine(target_df, feat_df):
    """Join a target table and a feature table on their indexes.

    Only index labels present in both frames are kept (inner join). The
    target columns come first, followed by the feature columns.
    """
    return pd.merge(target_df, feat_df, how="inner", left_index=True, right_index=True)

In [56]:
# Pair the PSPA targets with every feature table, one merged frame per feature type.
pspa_onehot, pspa_onehot_pca, pspa_esm, pspa_t5 = (
    combine(pspa, feats) for feats in (onehot, onehot_pca, esm, t5)
)

In [57]:
# Pair the CDDM targets with every feature table, one merged frame per feature type.
cddm_onehot, cddm_onehot_pca, cddm_esm, cddm_t5 = (
    combine(cddm, feats) for feats in (onehot, onehot_pca, esm, t5)
)

In [86]:
# Report (rows, columns) of every merged table, one per line: PSPA tables first, then CDDM.
print(
    *(
        merged.shape
        for merged in (
            pspa_onehot, pspa_onehot_pca, pspa_esm, pspa_t5,
            cddm_onehot, cddm_onehot_pca, cddm_esm, cddm_t5,
        )
    ),
    sep='\n',
)

(368, 7079)
(368, 1230)
(368, 1510)
(368, 1254)
(320, 7792)
(320, 1943)
(320, 2223)
(320, 1967)


In [87]:
# Make sure the output folder exists so the notebook also runs end to end on a
# fresh checkout; to_parquet does not create a missing parent directory.
os.makedirs('train', exist_ok=True)

# Persist the PSPA target+feature tables, one parquet file per feature type.
pspa_onehot.to_parquet('train/pspa_onehot.parquet')
pspa_onehot_pca.to_parquet('train/pspa_onehot_pca.parquet')
pspa_esm.to_parquet('train/pspa_esm.parquet')
pspa_t5.to_parquet('train/pspa_t5.parquet')

In [88]:
# Persist the CDDM target+feature tables, one parquet file per feature type.
for _frame, _dest in (
    (cddm_onehot, 'train/cddm_onehot.parquet'),
    (cddm_onehot_pca, 'train/cddm_onehot_pca.parquet'),
    (cddm_esm, 'train/cddm_esm.parquet'),
    (cddm_t5, 'train/cddm_t5.parquet'),
):
    _frame.to_parquet(_dest)
del _frame, _dest  # leave no loop helpers behind in the notebook namespace

In [89]:
# Final look at one merged table: PSPA target columns followed by the PCA feature columns.
pspa_onehot_pca

Unnamed: 0,-5P,-4P,-3P,-2P,-1P,0P,1P,2P,3P,4P,...,PCA991,PCA992,PCA993,PCA994,PCA995,PCA996,PCA997,PCA998,PCA999,PCA1000
Q2M2I8_AAK1_HUMAN_KD1,0.05845,0.04172,0.08610,0.01794,0.09607,0.0,0.04265,0.04048,0.05264,0.05026,...,0.049938,0.057859,0.005693,-0.006881,0.041570,0.039594,0.080002,0.022089,0.017665,0.013385
P27037_AVR2A_HUMAN_KD1,0.02971,0.03341,0.03345,0.01462,0.02657,0.0,0.01055,0.04285,0.04958,0.05559,...,-0.024776,0.001753,-0.056504,0.164427,-0.072278,0.052143,-0.015666,0.010091,0.138182,-0.033171
Q13705_AVR2B_HUMAN_KD1,0.03779,0.03774,0.03044,0.01482,0.02331,0.0,0.01020,0.05871,0.05072,0.05205,...,0.009370,-0.030940,0.050483,-0.100141,0.045942,-0.005657,-0.010641,0.112414,-0.166770,0.025360
P31749_AKT1_HUMAN_KD1,0.04669,0.04161,0.02642,0.01512,0.04884,0.0,0.01689,0.03375,0.03828,0.07361,...,0.081445,0.019706,0.042164,0.192072,0.001691,-0.068809,0.071163,-0.010879,-0.028764,0.042163
P31751_AKT2_HUMAN_KD1,0.04617,0.04437,0.01778,0.01873,0.05226,0.0,0.03177,0.03922,0.04509,0.07404,...,-0.059736,-0.120186,-0.060415,-0.245777,-0.113104,0.116407,0.054151,0.135826,0.001413,-0.007756
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P17948_VGFR1_HUMAN_KD1,0.04074,0.03414,0.02854,0.04313,0.01986,0.0,0.01625,0.02121,0.04472,0.06253,...,-0.031060,-0.010548,0.035789,-0.113515,0.221636,0.034873,-0.093145,-0.005760,0.110737,0.147448
P35968_VGFR2_HUMAN_KD1,0.04816,0.05296,0.04610,0.04524,0.03253,0.0,0.02688,0.02602,0.03470,0.05973,...,0.131558,-0.033205,0.098801,0.054086,0.010220,0.041245,-0.004853,-0.205461,-0.048656,-0.156289
P35916_VGFR3_HUMAN_KD1,0.03554,0.03875,0.03354,0.04111,0.02239,0.0,0.02272,0.01798,0.02467,0.05166,...,-0.067793,0.110357,-0.015003,0.090916,-0.156411,0.001075,-0.101868,0.228029,0.076798,-0.143131
P07947_YES_HUMAN_KD1,0.05216,0.04870,0.04363,0.05690,0.05838,0.0,0.01899,0.03019,0.02228,0.03449,...,0.065050,-0.028076,-0.064543,0.093354,0.083031,0.011155,-0.002236,-0.147592,-0.093897,0.161915
