# Query phosphosites for a specific protein/gene

## Setup

In [None]:
!pip install git+https://github.com/sky1ove/katlas.git -Uqq

In [40]:
from katlas.imports import *
import pandas as pd

## Query a specific gene

Download the human phosphoproteomics dataset, which is a combination of the Ochoa et al. dataset and the PhosphoSitePlus dataset (low-throughput >= 1).

In [2]:
# Load the combined Ochoa et al. + PhosphoSitePlus human phosphosite table.
sites = Data.get_combine_site_pplus_ochoa()

In [4]:
# Preview the first rows of the combined table.
sites.head()

Unnamed: 0,site_seq,gene_site,gene,source,num_site,acceptor,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7
0,AAAAAAASGGAGSDN,PBX1_S136,PBX1,ochoa,1,S,A,A,A,A,A,A,A,S,G,G,A,G,S,D,N
1,AAAAAAASGGGVSPD,PBX2_S146,PBX2,ochoa,1,S,A,A,A,A,A,A,A,S,G,G,G,V,S,P,D
2,AAAAAAASGVTTGKP,CLASR_S349,CLASR,ochoa,1,S,A,A,A,A,A,A,A,S,G,V,T,T,G,K,P
3,AAAAAAASQQGSAKN,TBL1R_S119,TBL1R,ochoa,1,S,A,A,A,A,A,A,A,S,Q,Q,G,S,A,K,N
4,AAAAAAASSPVGVGQ,SOX3_S249,SOX3,ochoa,1,S,A,A,A,A,A,A,A,S,S,P,V,G,V,G,Q


Query a specific gene:

In [5]:
# Keep only the sites that belong to TP53 (the output below also lists rows whose gene field is 'TP53|P53').
df = query_gene(sites,'TP53')

In [6]:
# Show the sites returned for TP53.
df

Unnamed: 0,site_seq,gene_site,gene,source,num_site,acceptor,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7
115584,__MEEPQSDPSVEPP,TP53_S6|P53_S6,TP53|P53,phosphoplus|ochoa,2,S,_,_,M,E,E,P,Q,S,D,P,S,V,E,P,P
115282,_MDDLMLSPDDIEQW,TP53_S7,TP53,phosphoplus,1,S,_,M,D,D,L,M,L,S,P,D,D,I,E,Q,W
17936,EEPQSDPSVEPPLSQ,TP53_S9|P53_S9,TP53|P53,phosphoplus|ochoa,2,S,E,E,P,Q,S,D,P,S,V,E,P,P,L,S,Q
71937,PSVEPPLSQETFSDL,TP53_S15|P53_S15,TP53|P53,phosphoplus|ochoa,2,S,P,S,V,E,P,P,L,S,Q,E,T,F,S,D,L
21635,EPPLSQETFSDLWKL,TP53_T18,TP53,phosphoplus,1,T,E,P,P,L,S,Q,E,T,F,S,D,L,W,K,L
67813,PLSQETFSDLWKLLP,TP53_S20,TP53,phosphoplus,1,S,P,L,S,Q,E,T,F,S,D,L,W,K,L,L,P
53375,LPENNVLSPLPSQAM,TP53_S33,TP53,phosphoplus,1,S,L,P,E,N,N,V,L,S,P,L,P,S,Q,A,M
62644,NVLSPLPSQAMDDLM,TP53_S37,TP53,phosphoplus,1,S,N,V,L,S,P,L,P,S,Q,A,M,D,D,L,M
4244,AMDDLMLSPDDIEQW,TP53_S46,TP53,phosphoplus,1,S,A,M,D,D,L,M,L,S,P,D,D,I,E,Q,W
10328,DDIEQWFTEDPGPDE,TP53_T55,TP53,phosphoplus,1,T,D,D,I,E,Q,W,F,T,E,D,P,G,P,D,E


In [12]:
# Inspect the site sequences that are passed to the scoring functions below.
df.site_seq.head()

115584    __MEEPQSDPSVEPP
115282    _MDDLMLSPDDIEQW
17936     EEPQSDPSVEPPLSQ
71937     PSVEPPLSQETFSDL
21635     EPPLSQETFSDLWKL
Name: site_seq, dtype: object

Notice that the sequences are all in uppercase.

## Predict kinase on the site sequence

### PSPA scoring

In [10]:
# PSPA scoring of the site_seq column.
# NOTE(review): param1 is not defined in this notebook — presumably provided by the katlas star import; confirm.
pspa = predict_kinase_df(df,'site_seq',**param1)

input dataframe has a length 35
Preprocessing
Finish preprocessing
Calculating position: [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4]


  log_sum = np.sum(np.log2(values)) + (len(values) - 1) * np.log2(divide)
100%|██████████| 303/303 [00:00<00:00, 2452.33it/s]


As all of the sequences are in uppercase, we could consider converting all STY to sty; however, this is not recommended, as not all S/T/Y residues are phosphorylation sites.

In [13]:
# Left disabled on purpose: converting every STY to sty is not recommended (see the caveat above).
# NOTE(review): param2 is presumably the parameter set for that lowercase-sty variant — confirm.
# predict_kinase_df(df,'site_seq',**param2)

Another option for taking phospho-priming into account is to convert the STY based on the currently known phosphosites (**TODO**).

### CDDM scoring

As the site sequences are all in uppercase, we use param4 (ks_upper) to calculate the CDDM scores.

In [14]:
# CDDM scoring of the same site_seq column, using param4 (the ks_upper setting mentioned above).
cddm = predict_kinase_df(df,'site_seq',**param4)

input dataframe has a length 35
Preprocessing
Finish preprocessing
Calculating position: [-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7]


100%|██████████| 289/289 [00:00<00:00, 3894.43it/s]


### Get kinase rank

In [22]:
def get_top(r,n):
    """Return the labels of the `n` largest values of `r` as one comma-separated string.

    Parameters
    ----------
    r : pandas.Series
        Scores indexed by label (e.g. one row of a score table, indexed by
        its column names).
    n : int
        Number of top-scoring labels to keep.

    Returns
    -------
    str
        The labels ordered from highest to lowest score, joined with ','.
    """
    # .iloc makes the "first n after sorting" slice explicitly positional;
    # a bare r[:n] can be interpreted as a label slice on some index types.
    top = r.sort_values(ascending=False).iloc[:n].index
    # str() allows non-string labels (e.g. integer column names) to be joined.
    return ','.join(map(str, top))

def get_top_df(df,n=10):
    """Apply `get_top` to every row of `df` and return the result.

    Each row is passed to `get_top` together with `n` (default 10), so every
    row is reduced to a comma-separated string of its `n` highest-scoring
    column labels.
    """
    return df.apply(lambda row: get_top(row, n), axis=1)

In [37]:
# Reduce each score table to one string per site listing its top-scoring kinases (get_top_df default: n=10).
pspa_rnk = get_top_df(pspa)
cddm_rnk = get_top_df(cddm)

In [42]:
# Take an independent copy of the first two columns: new columns are assigned
# to `out` afterwards, and assigning into a bare slice of `df` triggers
# pandas' SettingWithCopyWarning.
out = df.iloc[:, :2].copy()

In [46]:
# Attach the top-kinase strings from each scoring method as new columns.
out['pspa'] = pspa_rnk
out['cddm'] = cddm_rnk

In [47]:
# Final table: site sequence, gene_site, and the top kinases by PSPA and by CDDM.
out

Unnamed: 0,site_seq,gene_site,pspa,cddm
115584,__MEEPQSDPSVEPP,TP53_S6|P53_S6,"CK1A,CDC7,MOS,BMPR1A,BMPR1B,CK2A2,GRK7,CK1E,CK2A1,CK1D","CK2A2,CK2A1,CLK1,HIPK4,GRK7,CK1A,ERK2,MTOR,ATR,ERK1"
115282,_MDDLMLSPDDIEQW,TP53_S7,"JNK1,JNK2,JNK3,P38B,P38D,PINK1,ERK2,ERK1,CDK8,DYRK4","CDK4,CDK3,CDK5,JNK2,P38D,DYRK4,ERK2,ERK1,CDK2,JNK1"
17936,EEPQSDPSVEPPLSQ,TP53_S9|P53_S9,"GRK1,CAMK2B,LATS1,CAMK2A,CAMK2D,CLK3,KIS,TGFBR2,COT,PLK2","HIPK4,HIPK1,CK1A,CLK3,HIPK3,ATM,GRK7,ATR,TBK1,ALK2"
71937,PSVEPPLSQETFSDL,TP53_S15|P53_S15,"ATM,SMG1,DNAPK,ATR,FAM20C,GRK1,SKMLCK,P38B,CK1E,TSSK2","ATR,ATM,DNAPK,CK2A1,CDK8,MTOR,CLK3,ERK1,ERK2,MAPKAPK5"
21635,EPPLSQETFSDLWKL,TP53_T18,"PLK3,PLK2,PLK1,HUNK,TGFBR1,TSSK2,CAMK1A,ALPHAK3,CAMK1D,CAMK1B","OSR1,CAMKK1,NUAK2,CAMKK2,MST1,AMPKA1,LKB1,PBK,MEK5,NEK11"
67813,PLSQETFSDLWKLLP,TP53_S20,"CDC7,MOS,GRK2,CK1E,CK1D,BMPR1B,CK1G3,BMPR1A,CK1A2,IRE2","CK2A2,CK2A1,GRK7,CK1G1,CAMK4,TSSK1,TSSK2,MTOR,CK1G2,CK1A"
53375,LPENNVLSPLPSQAM,TP53_S33,"P38D,CDK19,CDK8,KIS,ERK1,P38G,ERK2,P38B,JNK1,CDK17","CDK4,ERK1,ERK2,HIPK3,CDK1,CDK2,CDK9,CDK5,HIPK2,DYRK4"
62644,NVLSPLPSQAMDDLM,TP53_S37,"PRKD1,DNAPK,CHK1,ATR,GRK1,P90RSK,SMG1,BUB1,PRKD2,TSSK2","ATR,ATM,DNAPK,PRKD3,PRKD1,P90RSK,MAPKAPK3,RSK2,PRKD2,RSK4"
4244,AMDDLMLSPDDIEQW,TP53_S46,"JNK1,JNK2,JNK3,P38B,P38D,PINK1,ERK2,ERK1,CDK8,DYRK4","CDK4,CDK3,P38D,DYRK4,ERK1,CDK5,JNK2,ERK2,JNK1,CDK1"
10328,DDIEQWFTEDPGPDE,TP53_T55,"BMPR1A,ACVR2A,ACVR2B,ALK2,BMPR1B,CAMK2B,CHK1,CK2A2,CK2A1,TGFBR1","LKB1,ASK1,MEK2,GRK5,ALK2,CAMKK2,PBK,OSR1,TNIK,TAK1"
