# Get kinase-specific PSSMs

In [1]:
from pathlib import Path

from katlas.core import *
import pandas as pd

In [2]:
%%time
# Load the kinase-substrate table provided by katlas; the wall time is reported by %%time.
# NOTE(review): appears to be one row per kinase/substrate-site pair (see `kin_sub_site`) — confirm.
df = Data.get_ks_dataset()

CPU times: user 878 ms, sys: 338 ms, total: 1.22 s
Wall time: 5.63 s


In [3]:
# Per-kinase key "<kinase_uniprot>_<first whitespace-separated token of kinase_protein>",
# e.g. "P12931_SRC".
df['kinase_id'] = (
    df['kinase_uniprot']
    + '_'
    + df['kinase_protein'].str.split().str.get(0)
)

In [4]:
# Number of dataset rows per kinase_id (value_counts sorts in descending order by default).
cnt = df['kinase_id'].value_counts()

In [5]:
# Kinases backed by at least 40 rows; only these are kept further down.
idx = cnt.loc[cnt.ge(40)].index
idx

Index(['P12931_SRC', 'P29320_EPHA3', 'P07332_FES', 'Q16288_NTRK3',
       'Q9UM73_ALK', 'P00519_ABL1', 'P36888_FLT3', 'P29322_EPHA8',
       'P29323_EPHB2', 'P54762_EPHB1',
       ...
       'P35626_GRK3', 'Q99640_PKMYT1', 'Q6P2M8_CAMK1B', 'O00311_CDC7',
       'Q9NYV4_CDK12', 'Q15746_SMMLCK', 'Q01973_ROR1', 'P15056_BRAF',
       'Q6P0Q8_MAST2', 'O14976_GAK'],
      dtype='object', name='kinase_id', length=333)

In [6]:
# (rows, columns) before restricting to the kinases in idx.
df.shape

(187066, 22)

We only evaluate kinases on the kinome tree. Here we keep the kinases in `idx`, i.e. those with at least 40 rows in the dataset.

In [7]:
# Keep only rows whose kinase is in idx; copy so the column assignments below act on an independent frame.
df = df.loc[df.kinase_id.isin(idx)].copy()

In [8]:
# (rows, columns) after keeping only the kinases in idx.
df.shape

(185883, 22)

In [9]:
# Upper-cased copy of the site sequence.
# NOTE(review): lowercase s/t/y in site_seq presumably mark phosphorylated residues — confirm.
df['site_seq_upper'] = df['site_seq'].str.upper()

In [10]:
# Number of '|'-separated entries in `source` for each row.
df['source_len'] = df['source'].str.split('|').str.len()

In [11]:
def get_LO_all(pssms, site_type='STY'):
    """Apply `get_pssm_LO_flat` followed by `flatten_pssm` to every row of `pssms`.

    Parameters
    ----------
    pssms : pd.DataFrame
        One flattened PSSM per row.
    site_type : str
        Forwarded unchanged to `get_pssm_LO_flat` (this notebook uses
        'STY' and 'STY_upper').

    Returns
    -------
    pd.DataFrame
        One transformed row per input row, carrying the same index as `pssms`.
    """
    lo_rows = [
        flatten_pssm(get_pssm_LO_flat(row, site_type))
        for _, row in pssms.iterrows()
    ]
    return pd.DataFrame(lo_rows, index=pssms.index)

## Hold out 20% of the PSP rows of each kinase as the test set

In [12]:
# Rows whose `source` string mentions PSP; copied so sampling works on an independent frame.
psp = df.loc[df['source'].str.contains('PSP')].copy()

In [13]:
def sample(group, frac=0.2, random_state=42, key_col='kinase_uniprot'):
    """Draw a reproducible random subset of one groupby group.

    Meant to be passed to `DataFrameGroupBy.apply`, which sets `group.name`
    to the group key. When `include_groups=False` is used the key column is
    missing from `group`, so it is re-attached to the sampled rows.

    Parameters
    ----------
    group : pd.DataFrame
        Rows of a single group; must expose the group key as `group.name`.
    frac : float, default 0.2
        Fraction of the group's rows to draw. The count is truncated to an
        integer and is never less than 1, so small groups still contribute a row.
    random_state : int, default 42
        Seed forwarded to `DataFrame.sample` for reproducibility.
    key_col : str, default 'kinase_uniprot'
        Name of the column that receives the group key.

    Returns
    -------
    pd.DataFrame
        The sampled rows (original index labels preserved) with `key_col` set
        to the group key. `group` itself is not modified.
    """
    n = max(1, int(len(group) * frac))  # at least one row per group
    sampled = group.sample(n=n, replace=False, random_state=random_state)
    # Add back the group key as a column.
    return sampled.assign(**{key_col: group.name})

In [14]:
# Per-kinase hold-out: `sample` is applied to each kinase_uniprot group.
# include_groups=False drops the key column from each group, and `sample` puts it back;
# group_keys=False keeps the original row labels as the index.
test = (
    psp.groupby('kinase_uniprot', group_keys=False)
       .apply(sample, include_groups=False)
)

In [15]:
# Composition of the test set by kinase group.
test.kinase_group.value_counts()

kinase_group
CMGC        777
AGC         511
TK          387
CAMK        357
Atypical    148
Other       108
STE          89
CK1          63
TKL          44
NEK          16
Name: count, dtype: int64

In [16]:
# Display the held-out rows; the index still carries the row labels from df.
test

Unnamed: 0,kin_sub_site,substrate_uniprot,site,source,substrate_genes,substrate_phosphoseq,position,site_seq,sub_site,substrate_sequence,...,kinase_family,kinase_subfamily,kinase_pspa_big,kinase_pspa_small,kinase_coral_ID,num_kin,kinase_id,site_seq_upper,source_len,kinase_uniprot
160,O00141_P46527_T157,P46527,T157,SIGNOR|EPSD|PSP,CDKN1B KIP1 p27,MSNVRVSNGsPsLERMDARQAEHPKPSACRNLFGPVDHEELTRDLE...,157,PsDsQTGLAEQCAGIRKRPAtDDSSTQNKRANRTEENVsDG,P46527_T157,MSNVRVSNGSPSLERMDARQAEHPKPSACRNLFGPVDHEELTRDLE...,...,SGK,SGK,Basophilic,Akt/rock,SGK1,6,O00141_SGK1,PSDSQTGLAEQCAGIRKRPATDDSSTQNKRANRTEENVSDG,3,O00141
309,O00141_Q96J92_S1217,Q96J92,S1217,EPSD|PSP,WNK4 PRKWNK4,MLASPATETTVLMSQTEADLALRPPPPLGTAGQPRLGPPPRRARRF...,1217,SRRNsLQRSEPPGPGIMRRNsLsGsSTGSQEQRASKGVTFA,Q96J92_S1217,MLASPATETTVLMSQTEADLALRPPPPLGTAGQPRLGPPPRRARRF...,...,SGK,SGK,Basophilic,Akt/rock,SGK1,1,O00141_SGK1,SRRNSLQRSEPPGPGIMRRNSLSGSSTGSQEQRASKGVTFA,2,O00141
369,O00141_Q9UN36_S346,Q9UN36,S346,EPSD|PSP,NDRG2 KIAA1248 SYLD,MAELQEVQITEEKPLLPGQTPEAAKEAELAARILLDQGQTHSVETP...,346,sRsRtAsLtsAAsVDGNRsRsRtLsQssEsGtLsSGPPGHT,Q9UN36_S346,MAELQEVQITEEKPLLPGQTPEAAKEAELAARILLDQGQTHSVETP...,...,SGK,SGK,Basophilic,Akt/rock,SGK1,1,O00141_SGK1,SRSRTASLTSAASVDGNRSRSRTLSQSSESGTLSSGPPGHT,2,O00141
39,O00141_O60343_T642,O60343,T642,GPS6|PSP,TBC1D4 AS160 KIAA0603,MEPPSCIQDEPFPHPLEPEPGVSAQPGPGKPSDKRFRLWYVGGSCL...,642,AWQTFPEEDSDSPQFRRRAHtFsHPPsstKRKLNLQDGRAQ,O60343_T642,MEPPSCIQDEPFPHPLEPEPGVSAQPGPGKPSDKRFRLWYVGGSCL...,...,SGK,SGK,Basophilic,Akt/rock,SGK1,4,O00141_SGK1,AWQTFPEEDSDSPQFRRRAHTFSHPPSSTKRKLNLQDGRAQ,2,O00141
314,O00141_Q96PU5_S448,Q96PU5,S448,GPS6|SIGNOR|ELM|EPSD|PSP,NEDD4L KIAA0439 NEDL3,MATGLGEPVYGLsEDEGESRILRVKVVSGIDLAKKDIFGASDPYVK...,448,sATNSNNHLIEPQIRRPRsLssPtVTLSAPLEGAKDsPVRR,Q96PU5_S448,MATGLGEPVYGLSEDEGESRILRVKVVSGIDLAKKDIFGASDPYVK...,...,SGK,SGK,Basophilic,Akt/rock,SGK1,4,O00141_SGK1,SATNSNNHLIEPQIRRPRSLSSPTVTLSAPLEGAKDSPVRR,5,O00141
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185492,Q9Y4K4_P49841_S9,P49841,S9,SIGNOR|EPSD|PSP,GSK3B,MSGRPRttsFAEsCKPVQQPsAFGsMKVSRDKDGSKVTTVVAtPGQ...,9,____________MSGRPRttsFAEsCKPVQQPsAFGsMKVS,P49841_S9,MSGRPRTTSFAESCKPVQQPSAFGSMKVSRDKDGSKVTTVVATPGQ...,...,STE20,KHS,Map4k,Map4k,KHS1,25,Q9Y4K4_KHS1,____________MSGRPRTTSFAESCKPVQQPSAFGSMKVS,3,Q9Y4K4
185874,Q9Y5S2_O43255_S6,O43255,S6,PSP,SIAH2,MSRPsstGPsANKPCsKQPPPQPQHtPsPAAPPAAATISAAGPGSS...,6,_______________MSRPsstGPsANKPCsKQPPPQPQHt,O43255_S6,MSRPSSTGPSANKPCSKQPPPQPQHTPSPAAPPAAATISAAGPGSS...,...,DMPK,GEK,Basophilic,Akt/rock,MRCKb,1,Q9Y5S2_MRCKB,_______________MSRPSSTGPSANKPCSKQPPPQPQHT,1,Q9Y5S2
186807,Q9Y6E0_Q9Y2K2_T221,Q9Y2K2,T221,PSP,SIK3 KIAA0999 QSK L19,MAAAAASGAGGAAGAGTGGAGPAGRLLPPPAPGSPAAPAAVSPAAG...,221,NIKIADFGFSNLFTPGQLLKtWCGSPPYAAPELFEGKEYDG,Q9Y2K2_T221,MAAAAASGAGGAAGAGTGGAGPAGRLLPPPAPGSPAAPAAVSPAAG...,...,STE20,YSK,Map4k,Map4k,MST3,5,Q9Y6E0_MST3,NIKIADFGFSNLFTPGQLLKTWCGSPPYAAPELFEGKEYDG,1,Q9Y6E0
186392,Q9Y6E0_P57059_T182,P57059,T182,PSP,SIK1 SIK SNF1LK,MVIMSEFSADPAGQGQGQQKPLRVGFYDIERTLGKGNFAVVKLARH...,182,DIKLADFGFGNFYKSGEPLStWCGsPPYAAPEVFEGKEYEG,P57059_T182,MVIMSEFSADPAGQGQGQQKPLRVGFYDIERTLGKGNFAVVKLARH...,...,STE20,YSK,Map4k,Map4k,MST3,3,Q9Y6E0_MST3,DIKLADFGFGNFYKSGEPLSTWCGSPPYAAPEVFEGKEYEG,1,Q9Y6E0


In [17]:
# Persist the held-out test rows. The output folder is created first because
# DataFrame.to_parquet does not create missing parent directories.
Path('out').mkdir(parents=True, exist_ok=True)
test.to_parquet('out/CDDM_test_set.parquet')

In [18]:
# Preview the first rows of the test set.
test.head()

Unnamed: 0,kin_sub_site,substrate_uniprot,site,source,substrate_genes,substrate_phosphoseq,position,site_seq,sub_site,substrate_sequence,...,kinase_family,kinase_subfamily,kinase_pspa_big,kinase_pspa_small,kinase_coral_ID,num_kin,kinase_id,site_seq_upper,source_len,kinase_uniprot
160,O00141_P46527_T157,P46527,T157,SIGNOR|EPSD|PSP,CDKN1B KIP1 p27,MSNVRVSNGsPsLERMDARQAEHPKPSACRNLFGPVDHEELTRDLE...,157,PsDsQTGLAEQCAGIRKRPAtDDSSTQNKRANRTEENVsDG,P46527_T157,MSNVRVSNGSPSLERMDARQAEHPKPSACRNLFGPVDHEELTRDLE...,...,SGK,SGK,Basophilic,Akt/rock,SGK1,6,O00141_SGK1,PSDSQTGLAEQCAGIRKRPATDDSSTQNKRANRTEENVSDG,3,O00141
309,O00141_Q96J92_S1217,Q96J92,S1217,EPSD|PSP,WNK4 PRKWNK4,MLASPATETTVLMSQTEADLALRPPPPLGTAGQPRLGPPPRRARRF...,1217,SRRNsLQRSEPPGPGIMRRNsLsGsSTGSQEQRASKGVTFA,Q96J92_S1217,MLASPATETTVLMSQTEADLALRPPPPLGTAGQPRLGPPPRRARRF...,...,SGK,SGK,Basophilic,Akt/rock,SGK1,1,O00141_SGK1,SRRNSLQRSEPPGPGIMRRNSLSGSSTGSQEQRASKGVTFA,2,O00141
369,O00141_Q9UN36_S346,Q9UN36,S346,EPSD|PSP,NDRG2 KIAA1248 SYLD,MAELQEVQITEEKPLLPGQTPEAAKEAELAARILLDQGQTHSVETP...,346,sRsRtAsLtsAAsVDGNRsRsRtLsQssEsGtLsSGPPGHT,Q9UN36_S346,MAELQEVQITEEKPLLPGQTPEAAKEAELAARILLDQGQTHSVETP...,...,SGK,SGK,Basophilic,Akt/rock,SGK1,1,O00141_SGK1,SRSRTASLTSAASVDGNRSRSRTLSQSSESGTLSSGPPGHT,2,O00141
39,O00141_O60343_T642,O60343,T642,GPS6|PSP,TBC1D4 AS160 KIAA0603,MEPPSCIQDEPFPHPLEPEPGVSAQPGPGKPSDKRFRLWYVGGSCL...,642,AWQTFPEEDSDSPQFRRRAHtFsHPPsstKRKLNLQDGRAQ,O60343_T642,MEPPSCIQDEPFPHPLEPEPGVSAQPGPGKPSDKRFRLWYVGGSCL...,...,SGK,SGK,Basophilic,Akt/rock,SGK1,4,O00141_SGK1,AWQTFPEEDSDSPQFRRRAHTFSHPPSSTKRKLNLQDGRAQ,2,O00141
314,O00141_Q96PU5_S448,Q96PU5,S448,GPS6|SIGNOR|ELM|EPSD|PSP,NEDD4L KIAA0439 NEDL3,MATGLGEPVYGLsEDEGESRILRVKVVSGIDLAKKDIFGASDPYVK...,448,sATNSNNHLIEPQIRRPRsLssPtVTLSAPLEGAKDsPVRR,Q96PU5_S448,MATGLGEPVYGLSEDEGESRILRVKVVSGIDLAKKDIFGASDPYVK...,...,SGK,SGK,Basophilic,Akt/rock,SGK1,4,O00141_SGK1,SATNSNNHLIEPQIRRPRSLSSPTVTLSAPLEGAKDSPVRR,5,O00141


In [19]:
# Filtered dataset size; the column count now includes site_seq_upper and source_len.
df.shape

(185883, 24)

In [20]:
# Evaluation rows: everything in df whose index label is not part of the test set.
df_eval = df.loc[~df.index.isin(test.index)]

In [21]:
# Rows remaining for building PSSMs once the test rows are excluded.
df_eval.shape

(183383, 24)

### Get eval PSSMs

In [22]:
# PSSMs per kinase_id, built from the evaluation rows only.
# Both thresholds are None because the kinases were already filtered by count above.
pssms = get_cluster_pssms(
    df_eval,
    cluster_col='kinase_id',
    count_thr=None,
    valid_thr=None,
)

100%|███████████████████████████████████████████████████████████████████████████| 333/333 [00:11<00:00, 30.25it/s]


In [23]:
# (rows, columns) of the PSSM table; presumably one row per kinase_id — confirm against get_cluster_pssms.
pssms.shape

(333, 943)

In [24]:
# Same as `pssms`, but computed from the upper-cased site sequences.
# Both thresholds are None because the kinases were already filtered by count above.
pssms_upper = get_cluster_pssms(
    df_eval,
    seq_col='site_seq_upper',
    cluster_col='kinase_id',
    count_thr=None,
    valid_thr=None,
)

100%|███████████████████████████████████████████████████████████████████████████| 333/333 [00:11<00:00, 29.50it/s]


### LO of eval PSSMs

In [25]:
# LO transform of the PSSMs, using the default 'STY' site type.
LO = get_LO_all(pssms, site_type='STY')

In [26]:
# LO transform of the upper-case PSSMs, using the 'STY_upper' site type.
LO_upper = get_LO_all(pssms_upper, site_type='STY_upper')

In [27]:
# get_LO_all returns one row per input row, so the row count should match pssms.
LO.shape

(333, 943)

## Remove isoforms and pseudogenes

In [28]:
# Kinase annotation table from katlas; the cells below use its `pseudo`, `uniprot` and `kinase` columns.
info= Data.get_kinase_info()

In [29]:
# Keep entries whose `pseudo` flag is the string '0'.
info = info.loc[info['pseudo'] == '0'].copy()

In [30]:
# Key in the same "<uniprot>_<kinase>" form as the PSSM index, so the two can be matched.
info['id'] = info['uniprot'] + '_' + info['kinase']

In [31]:
# Show the LO rows whose kinase key has no match in `info` (these are dropped in the next step).
LO.loc[~LO.index.isin(info['id'])]

Unnamed: 0,-20P,-20G,-20A,-20C,-20S,-20T,-20V,-20I,-20L,-20M,...,20H,20K,20R,20Q,20N,20D,20E,20pS,20pT,20pY
P07948-2_LYN,-0.695061,0.248942,-0.181606,-0.523522,-0.24188,-0.087184,0.270835,0.248626,-0.177055,0.151905,...,-1.002663,0.18709,-0.131176,-0.477873,0.089392,-0.06989,0.272381,-0.624875,-0.293957,0.686397
O60566_BUB1B,-0.098063,0.177522,0.006417,-2.364642,-0.336759,-0.656228,0.622357,0.285512,-1.214575,-1.313708,...,0.538365,0.313082,-0.184508,-1.521805,0.31492,0.267606,0.993983,-0.586346,-0.015962,-0.647041
P05771-2_PKCB,-0.513296,0.082197,-0.419347,0.153007,0.387344,0.446387,1.033094,1.359558,-0.959957,-0.05909,...,1.086193,-0.296631,-0.264711,-1.710942,0.415289,0.443465,0.117313,0.28341,-0.205099,-0.099214
Q13976-2_PKG1,0.26716,-0.880855,0.326552,-20.552054,-0.980098,-0.599127,0.959565,-0.422922,-0.420509,-0.84157,...,0.565165,0.388792,0.076757,0.183066,-0.243242,-0.843097,-0.192211,0.440454,-1.311089,1.379757


In [32]:
# Keep only the LO rows whose kinase key is present in `info`.
LO = LO.loc[LO.index.isin(info['id'])]
LO_upper = LO_upper.loc[LO_upper.index.isin(info['id'])]

In [33]:
# Apply the same `info`-based filter to the PSSM tables.
pssms = pssms.loc[pssms.index.isin(info['id'])]
pssms_upper = pssms_upper.loc[pssms_upper.index.isin(info['id'])]

In [34]:
# Both tables were filtered against the same ids, so their shapes should agree.
LO.shape,pssms.shape

((329, 943), (329, 943))

In [35]:
# Persist the evaluation PSSMs and their LO transforms. The output folder is
# created first because DataFrame.to_parquet does not create missing parent directories.
Path('out').mkdir(parents=True, exist_ok=True)

pssms.to_parquet('out/CDDM_pssms_eval_psp_02.parquet')
pssms_upper.to_parquet('out/CDDM_pssms_eval_upper_psp_02.parquet')

LO.to_parquet('out/CDDM_pssms_LO_eval_psp_02.parquet')
LO_upper.to_parquet('out/CDDM_pssms_LO_eval_upper_psp_02.parquet')