# Get kinase-specific PSSMs

In [32]:
from katlas.core import *
import pandas as pd

In [33]:
%%time
# Load the dataset via katlas' Data helper (presumably the kinase-substrate table — confirm).
# The columns used below are kinase_uniprot, kinase_protein and site_seq.
df = Data.get_ks_dataset()

CPU times: user 594 ms, sys: 607 ms, total: 1.2 s
Wall time: 17.8 s


In [34]:
# Kinase identifier: "<kinase_uniprot>_<first whitespace-separated token of kinase_protein>".
df['kinase_id'] = df.kinase_uniprot + '_' + df.kinase_protein.str.split().str.get(0)

In [35]:
# Number of dataset rows recorded for each kinase_id.
cnt = df['kinase_id'].value_counts()

In [36]:
# Keep only the kinase ids that occur in at least 40 rows.
idx = cnt.loc[cnt.ge(40)].index
idx

Index(['P12931_SRC', 'P29320_EPHA3', 'P07332_FES', 'Q16288_NTRK3',
       'Q9UM73_ALK', 'P00519_ABL1', 'P36888_FLT3', 'P29322_EPHA8',
       'P29323_EPHB2', 'P54762_EPHB1',
       ...
       'P35626_GRK3', 'Q99640_PKMYT1', 'O00311_CDC7', 'Q6P2M8_CAMK1B',
       'Q9NYV4_CDK12', 'Q15746_SMMLCK', 'Q01973_ROR1', 'O14976_GAK',
       'Q6P0Q8_MAST2', 'P15056_BRAF'],
      dtype='object', name='kinase_id', length=333)

In [37]:
# Size of the dataset before filtering by kinase_id.
df.shape

(187066, 22)

## All PSSMs

In [38]:
# Restrict the dataset to the selected kinases; .copy() gives an independent frame
# so that new columns can be assigned to it later.
df = df.loc[df['kinase_id'].isin(idx)].copy()

In [45]:
# Compute one PSSM row per kinase_id from the filtered dataset.
# NOTE(review): 333 kinase ids go in but the result has 332 rows, so one is
# presumably dropped by valid_thr — confirm.
pssms_all = get_cluster_pssms(
    df,
    cluster_col='kinase_id',
    count_thr=None,  # df was already filtered by count above, so no threshold is applied here
    valid_thr=0.6,
)

100%|███████████████████████████████████████████████████████████| 333/333 [00:05<00:00, 55.56it/s]


In [46]:
# Sanity check: (number of kinases, number of flattened PSSM columns).
pssms_all.shape

(332, 943)

In [47]:
# Persist the PSSMs; the relative out/ directory is assumed to exist already.
pssms_all.to_parquet('out/CDDM_pssms.parquet')

### Uppercase site sequences

In [48]:
# Uppercased copy of every site sequence, stored in its own column.
df['site_seq_upper'] = df['site_seq'].str.upper()

In [49]:
# Same per-kinase PSSM computation, but read from the uppercased sequence column.
pssms_all_upper = get_cluster_pssms(
    df,
    seq_col='site_seq_upper',
    cluster_col='kinase_id',
    count_thr=None,  # df was already filtered by count above, so no threshold is applied here
    valid_thr=0.6,
)

100%|███████████████████████████████████████████████████████████| 333/333 [00:06<00:00, 51.21it/s]


In [50]:
# Persist the uppercase-sequence PSSMs; the relative out/ directory is assumed to exist already.
pssms_all_upper.to_parquet('out/CDDM_pssms_upper.parquet')

## All log-odds

In [51]:
def get_LO_all(pssms,site_type='STY'):
    """Apply the log-odds transform to every row of a PSSM table.

    Each row of `pssms` is passed to `get_pssm_LO_flat` together with
    `site_type`, and the result is flattened with `flatten_pssm`.

    Returns a DataFrame with one row per input row, carrying the same index
    as `pssms`.
    """
    lo_rows = [
        flatten_pssm(get_pssm_LO_flat(row, site_type))
        for _, row in pssms.iterrows()
    ]
    return pd.DataFrame(lo_rows, index=pssms.index)

In [52]:
# Log-odds version of every PSSM row, using the default site_type ('STY').
LO_all = get_LO_all(pssms_all)

In [53]:
# Persist the log-odds table; the relative out/ directory is assumed to exist already.
LO_all.to_parquet('out/CDDM_pssms_LO.parquet')

In [54]:
# Log-odds version of the uppercase-sequence PSSMs, using site_type 'STY_upper'.
LO_all_upper = get_LO_all(pssms_all_upper,site_type='STY_upper')

In [55]:
# Persist the uppercase log-odds table; the relative out/ directory is assumed to exist already.
LO_all_upper.to_parquet('out/CDDM_pssms_LO_upper.parquet')

## Remove isoforms and pseudogenes

In [56]:
# Reload the two log-odds tables that were written to out/ earlier in this notebook.
LO = pd.read_parquet('out/CDDM_pssms_LO.parquet')
LO_upper = pd.read_parquet('out/CDDM_pssms_LO_upper.parquet')

In [57]:
# Reload the two PSSM tables that were written to out/ earlier in this notebook.
pssms = pd.read_parquet('out/CDDM_pssms.parquet')
pssms_upper = pd.read_parquet('out/CDDM_pssms_upper.parquet')

In [58]:
# Shapes before filtering; the log-odds and PSSM tables should match.
LO.shape,pssms.shape

((332, 943), (332, 943))

In [59]:
# Kinase annotation table; the columns used below are pseudo, uniprot and kinase.
info = Data.get_kinase_info()

In [60]:
# Keep only rows whose `pseudo` value equals the string '0'
# (presumably the non-pseudogene entries — confirm the encoding of this column).
info = info.loc[info['pseudo'] == '0'].copy()

In [61]:
# Identifier in the same "<uniprot>_<name>" layout as the PSSM index, used for matching below.
info['id'] = info['uniprot'] + '_' + info['kinase']

In [62]:
# Inspect the log-odds rows whose index has no match in info['id'];
# these are the entries removed by the filtering cells that follow.
LO.loc[~LO.index.isin(info['id'])]

Unnamed: 0_level_0,-20P,-20G,-20A,-20C,-20S,-20T,-20V,-20I,-20L,-20M,...,20H,20K,20R,20Q,20N,20D,20E,20pS,20pT,20pY
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P07948-2_LYN,-0.695061,0.248942,-0.181606,-0.523522,-0.24188,-0.087184,0.270835,0.248626,-0.177055,0.151905,...,-1.002663,0.18709,-0.131176,-0.477873,0.089392,-0.06989,0.272381,-0.624875,-0.293957,0.686397
O60566_BUB1B,-0.098063,0.177522,0.006417,-2.364642,-0.336759,-0.656228,0.622357,0.285512,-1.214575,-1.313708,...,0.538365,0.313082,-0.184508,-1.521805,0.31492,0.267606,0.993983,-0.586346,-0.015962,-0.647041
P05771-2_PKCB,-0.513296,0.082197,-0.419347,0.153007,0.387344,0.446387,1.033094,1.359558,-0.959957,-0.05909,...,1.086193,-0.296631,-0.264711,-1.710942,0.415289,0.443465,0.117313,0.28341,-0.205099,-0.099214
Q13976-2_PKG1,0.26716,-0.880855,0.326552,-20.552054,-0.980098,-0.599127,0.959565,-0.422922,-0.420509,-0.84157,...,0.565165,0.388792,0.076757,0.183066,-0.243242,-0.843097,-0.192211,0.440454,-1.311089,1.379757


In [63]:
# Keep only the log-odds rows whose index appears in info['id'].
LO = LO.loc[LO.index.isin(info['id'])]
LO_upper = LO_upper.loc[LO_upper.index.isin(info['id'])]

In [64]:
# Apply the same index filter to the PSSM tables.
pssms = pssms.loc[pssms.index.isin(info['id'])]
pssms_upper = pssms_upper.loc[pssms_upper.index.isin(info['id'])]

In [65]:
# Shapes after filtering; both tables should have lost the same rows.
LO.shape,pssms.shape

((328, 943), (328, 943))

In [66]:
# Overwrite the log-odds files written earlier in this notebook with the filtered tables.
LO.to_parquet('out/CDDM_pssms_LO.parquet')
LO_upper.to_parquet('out/CDDM_pssms_LO_upper.parquet')

In [67]:
# Overwrite the PSSM files written earlier in this notebook with the filtered tables.
pssms.to_parquet('out/CDDM_pssms.parquet')
pssms_upper.to_parquet('out/CDDM_pssms_upper.parquet')

In [68]:
# Display the final filtered PSSM table (pandas truncates the rendering).
pssms

Unnamed: 0_level_0,-20P,-20G,-20A,-20C,-20S,-20T,-20V,-20I,-20L,-20M,...,20H,20K,20R,20Q,20N,20D,20E,20pS,20pT,20pY
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P12931_SRC,0.054538,0.081380,0.060077,0.012356,0.036216,0.032382,0.055816,0.052407,0.083511,0.023434,...,0.013351,0.076992,0.060970,0.037383,0.036938,0.052960,0.086337,0.025367,0.015576,0.023142
P29320_EPHA3,0.044276,0.088013,0.065335,0.008639,0.037797,0.036717,0.072354,0.048596,0.075594,0.026998,...,0.014132,0.083098,0.059356,0.031091,0.033917,0.056529,0.100622,0.025438,0.013567,0.015828
P07332_FES,0.047231,0.082519,0.070575,0.011401,0.034745,0.039088,0.061889,0.053203,0.088491,0.026059,...,0.013053,0.086266,0.055619,0.040863,0.038025,0.059024,0.085698,0.026674,0.013621,0.019296
Q16288_NTRK3,0.044444,0.074644,0.074074,0.017094,0.033048,0.035328,0.060969,0.058120,0.084330,0.026781,...,0.015682,0.091677,0.052473,0.030760,0.044029,0.057298,0.088661,0.021713,0.015682,0.018094
Q9UM73_ALK,0.045748,0.079765,0.073314,0.018182,0.032845,0.035191,0.067449,0.051026,0.076246,0.027566,...,0.015634,0.090198,0.069152,0.030066,0.043897,0.051112,0.096212,0.026458,0.013229,0.017438
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q15746_SMMLCK,0.078947,0.052632,0.078947,0.026316,0.052632,0.026316,0.052632,0.052632,0.052632,0.000000,...,0.000000,0.066667,0.022222,0.044444,0.133333,0.044444,0.000000,0.044444,0.022222,0.000000
Q01973_ROR1,0.097561,0.097561,0.024390,0.000000,0.024390,0.146341,0.170732,0.024390,0.048780,0.000000,...,0.000000,0.075000,0.050000,0.000000,0.050000,0.025000,0.125000,0.125000,0.000000,0.025000
P15056_BRAF,0.095238,0.071429,0.047619,0.000000,0.095238,0.047619,0.071429,0.000000,0.095238,0.047619,...,0.000000,0.075000,0.075000,0.025000,0.025000,0.075000,0.075000,0.050000,0.000000,0.000000
O14976_GAK,0.075000,0.075000,0.175000,0.000000,0.025000,0.025000,0.100000,0.025000,0.050000,0.175000,...,0.000000,0.024390,0.024390,0.000000,0.170732,0.195122,0.024390,0.048780,0.024390,0.000000
