<a target="_blank" href="https://colab.research.google.com/github/sky1ove/katlas/blob/main/nbs/tutorial_02_query_gene.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

# Query phosphosites for a specific protein/gene

In this session, we will show you:
1. how to query all of the phosphorylation sites for a single protein;
2. transform the site sequence to phosphorylated status;
3. conduct substrate scoring on all of them;
4. optional: rank the site based on additional information

## Setup

In [None]:
!pip install git+https://github.com/sky1ove/katlas.git -Uqq

In [None]:
import os

# star import stays ahead of the explicit imports so the explicit names keep precedence
from katlas.core import *
import requests, pandas as pd, numpy as np

## Query a protein based on its gene name

Download the human phosphoproteome, which is a combination of the Ochoa et al. dataset and the PhosphoSitePlus dataset (low-throughput>=1).

In [None]:
sites = Data.get_combine_site_psp_ochoa()

In [None]:
sites.head()

Unnamed: 0,site_seq,gene_site,gene,source,num_site,acceptor,-7,-6,-5,-4,...,-2,-1,0,1,2,3,4,5,6,7
0,AAAAAAASGGAGSDN,PBX1_S136,PBX1,ochoa,1,S,A,A,A,A,...,A,A,S,G,G,A,G,S,D,N
1,AAAAAAASGGGVSPD,PBX2_S146,PBX2,ochoa,1,S,A,A,A,A,...,A,A,S,G,G,G,V,S,P,D
2,AAAAAAASGVTTGKP,CLASR_S349,CLASR,ochoa,1,S,A,A,A,A,...,A,A,S,G,V,T,T,G,K,P
3,AAAAAAASQQGSAKN,TBL1R_S119,TBL1R,ochoa,1,S,A,A,A,A,...,A,A,S,Q,Q,G,S,A,K,N
4,AAAAAAASSPVGVGQ,SOX3_S249,SOX3,ochoa,1,S,A,A,A,A,...,A,A,S,S,P,V,G,V,G,Q


Query a specific gene:

In [None]:
df = query_gene(sites,'CTNNB1')

In [None]:
df

Unnamed: 0,site_seq,gene_site,gene,source,num_site,acceptor,-7,-6,-5,-4,...,-2,-1,0,1,2,3,4,5,6,7
64201,PDRKAAVSHWQQQSY,CTNNB1_S23,CTNNB1,phosphoplus,1,S,P,D,R,K,...,A,V,S,H,W,Q,Q,Q,S,Y
111457,VSHWQQQSYLDSGIH,CTNNB1_S29|CTNB1_S29,CTNNB1|CTNB1,phosphoplus|ochoa,2,S,V,S,H,W,...,Q,Q,S,Y,L,D,S,G,I,H
91015,SHWQQQSYLDSGIHS,CTNNB1_Y30|CTNB1_Y30,CTNNB1|CTNB1,phosphoplus|ochoa,2,Y,S,H,W,Q,...,Q,S,Y,L,D,S,G,I,H,S
77044,QQQSYLDSGIHSGAT,CTNNB1_S33,CTNNB1,phosphoplus,1,S,Q,Q,Q,S,...,L,D,S,G,I,H,S,G,A,T
114100,YLDSGIHSGATTTAP,CTNNB1_S37|CTNB1_S37,CTNNB1|CTNB1,phosphoplus|ochoa,2,S,Y,L,D,S,...,I,H,S,G,A,T,T,T,A,P
29849,GIHSGATTTAPSLSG,CTNNB1_T41,CTNNB1,phosphoplus,1,T,G,I,H,S,...,A,T,T,T,A,P,S,L,S,G
27548,GATTTAPSLSGKGNP,CTNNB1_S45|CTNB1_S45,CTNNB1|CTNB1,phosphoplus|ochoa,2,S,G,A,T,T,...,A,P,S,L,S,G,K,G,N,P
17342,EEEDVDTSQVLYEWE,CTNNB1_S60,CTNNB1,phosphoplus,1,S,E,E,E,D,...,D,T,S,Q,V,L,Y,E,W,E
107937,VDTSQVLYEWEQGFS,CTNNB1_Y64,CTNNB1,phosphoplus,1,Y,V,D,T,S,...,V,L,Y,E,W,E,Q,G,F,S
112759,WEQGFSQSFTQEQVA,CTNNB1_S73,CTNNB1,phosphoplus,1,S,W,E,Q,G,...,S,Q,S,F,T,Q,E,Q,V,A


In [None]:
df.site_seq.head()

64201     PDRKAAVSHWQQQSY
111457    VSHWQQQSYLDSGIH
91015     SHWQQQSYLDSGIHS
77044     QQQSYLDSGIHSGAT
114100    YLDSGIHSGATTTAP
Name: site_seq, dtype: object

Note that the sequences are all in capital letters.

## Get protein sequence

In [None]:
def get_protein_sequence(uniprot_id):
    """
    Fetches the protein sequence of a UniProt entry from the UniProt REST API.

    :param uniprot_id: The UniProt accession to look up (e.g. "P35222").
    :return: A string containing the protein sequence, or the message
             "Protein sequence not found or error in fetching." when the
             server does not answer with HTTP 200.
    """

    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"
    # A timeout keeps the notebook from hanging forever on a stalled connection
    response = requests.get(url, timeout=30)

    if response.status_code == 200:
        # Drop the FASTA header line (starts with '>') and join the sequence lines;
        # splitlines() also copes with '\r\n' line endings
        sequence = ''.join(line.strip() for line in response.text.splitlines() if not line.startswith('>'))
        return sequence
    else:
        return "Protein sequence not found or error in fetching."

To query the sequence of a specific protein, get its UniProt ID first, then call the function:

In [None]:
# get protein sequence through uniprot id
uniprot_idx = "P35222"
sequence = get_protein_sequence(uniprot_idx)
sequence

'MATQADLMELDMAMEPDRKAAVSHWQQQSYLDSGIHSGATTTAPSLSGKGNPEEEDVDTSQVLYEWEQGFSQSFTQEQVADIDGQYAMTRAQRVRAAMFPETLDEGMQIPSTQFDAAHPTNVQRLAEPSQMLKHAVVNLINYQDDAELATRAIPELTKLLNDEDQVVVNKAAVMVHQLSKKEASRHAIMRSPQMVSAIVRTMQNTNDVETARCTAGTLHNLSHHREGLLAIFKSGGIPALVKMLGSPVDSVLFYAITTLHNLLLHQEGAKMAVRLAGGLQKMVALLNKTNVKFLAITTDCLQILAYGNQESKLIILASGGPQALVNIMRTYTYEKLLWTTSRVLKVLSVCSSNKPAIVEAGGMQALGLHLTDPSQRLVQNCLWTLRNLSDAATKQEGMEGLLGTLVQLLGSDDINVVTCAAGILSNLTCNNYKNKMMVCQVGGIEALVRTVLRAGDREDITEPAICALRHLTSRHQEAEMAQNAVRLHYGLPVVVKLLHPPSHWPLIKATVGLIRNLALCPANHAPLREQGAIPRLVQLLVRAHQDTQRRTSMGGTQQQFVEGVRMEEIVEGCTGALHILARDVHNRIVIRGLNTIPLFVQLLYSPIENIQRVAAGVLCELAQDKEAAEAIEAEGATAPLTELLHSRNEGVATYAAAVLFRMSEDKPQDYKKRLSVELTSSLFRTEPMAWNETADLGLDIGAQGEPLGYRQDDPSYRSFHSGGYGQDALGMDPMMEHEMGGHHPGADYPVDGLPDLGHAQDLMDGLPPGDSNQLAWFDTDL'

In [None]:
# get site and position information
# gene_site may list several aliases separated by '|' (e.g. 'CTNNB1_S29|CTNB1_S29');
# keep the first alias and take the token after the last '_' (e.g. 'S29')
df['site'] = df.gene_site.str.split('|').str[0].str.split('_').str[-1]
# drop the leading residue letter to get the numeric position (e.g. 'S29' -> 29)
df['position'] = df.site.str[1:].astype(int)

## Phosphorylate the sequence based on the site info

In [None]:
def phosphorylate_seq(seq:str, # protein sequence
                sites, # iterable item contains aa+position
                ):
    """Phosphorylate protein sequence based on site information.

    Each item of `sites` is a string made of a residue letter followed by a
    1-based position (e.g. 'S33'). When the residue at that position in `seq`
    equals the letter, it is converted to lowercase to mark it as phosphorylated.
    Sites that do not match, or whose position lies outside the sequence, are
    reported with a printed message and counted as failed.

    Returns the sequence with the matched residues in lowercase.
    """

    seq_list = list(seq)
    success = 0
    fail = 0
    for s in sites:
        aa = s[0]
        position = int(s[1:])
        # Positions are 1-based: reject anything outside 1..len so that 0 or negative
        # values do not wrap around to the end and large values do not raise IndexError
        if not 1 <= position <= len(seq_list):
            print(f'position {position} is outside the sequence (length {len(seq_list)})')
            fail += 1
        elif seq_list[position-1] == aa:
            seq_list[position-1] = seq_list[position-1].lower()
            success += 1
        else:
            print(f'position {position} in the sequence is {seq_list[position-1]}')
            fail += 1

    seq2 = ''.join(seq_list)
    print(f'Successfully phosphorylated {success} positions, with {fail} failed.')

    return seq2

In [None]:
seq = phosphorylate_seq(sequence,df.site)
seq

Successfully phosphorylated 41 positions, with 0 failed.


'MATQADLMELDMAMEPDRKAAVsHWQQQsyLDsGIHsGATtTAPsLSGKGNPEEEDVDTsQVLyEWEQGFSQsFTQEQVADIDGQyAMTRAQRVRAAMFPEtLDEGMQIPStQFDAAHPtNVQRLAEPSQMLKHAVVNLINyQDDAELATRAIPELTKLLNDEDQVVVNKAAVMVHQLSKKEAsRHAIMRsPQMVSAIVRTMQNTNDVETARCTAGTLHNLSHHREGLLAIFKSGGIPALVKMLGsPVDSVLFYAITTLHNLLLHQEGAKMAVRLAGGLQKMVALLNKTNVKFLAITtDCLQILAYGNQEsKLIILASGGPQALVNIMRTytyEKLLWTTSRVLKVLSVCSsNKPAIVEAGGMQALGLHLtDPsQRLVQNCLWtLRNLSDAAtKQEGMEGLLGTLVQLLGSDDINVVTCAAGILSNLTCNNYKNKMMVCQVGGIEALVRTVLRAGDREDITEPAICALRHLtSRHQEAEMAQNAVRLHyGLPVVVKLLHPPSHWPLIKATVGLIRNLALCPANHAPLREQGAIPRLVQLLVRAHQDTQRRtsMGGtQQQFVEGVRMEEIVEGCTGALHILARDVHNRIVIRGLNTIPLFVQLLYsPIENIQRVAAGVLCELAQDKEAAEAIEAEGATAPLTELLHsRNEGVAtyAAAVLFRMSEDKPQDyKKRLsVELTSSLFRTEPMAWNETADLGLDIGAQGEPLGYRQDDPsYRsFHSGGYGQDALGMDPMMEHEMGGHHPGADYPVDGLPDLGHAQDLMDGLPPGDSNQLAWFDTDL'

## Extract the site sequence

In [None]:
df['seq'] = seq
df['site_seq2'] = extract_site_seq(df,'seq','position')

100%|██████████| 41/41 [00:00<00:00, 10386.33it/s]


## PSPA Scoring

If the substrate sequences are all in capital letters:

In [None]:
pspa = predict_kinase_df(df,'site_seq',**param_PSPA)

input dataframe has a length 41
Preprocessing
Finish preprocessing
Calculating position: [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5]


100%|██████████| 396/396 [00:05<00:00, 78.03it/s] 


If the substrate sequences carry phosphorylation status (phosphorylated residues in lowercase):

In [None]:
pspa = predict_kinase_df(df,'site_seq2',**param_PSPA)

input dataframe has a length 41
Preprocessing
Finish preprocessing
Calculating position: [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5]


100%|██████████| 396/396 [00:00<00:00, 2662.29it/s]


## CDDM scoring

If the site sequences are all in capital, we use param_CDDM_upper to calculate:

In [None]:
cddm = predict_kinase_df(df,'site_seq',**param_CDDM_upper)

input dataframe has a length 41
Preprocessing
Finish preprocessing
Calculating position: [-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7]


100%|██████████| 289/289 [00:00<00:00, 2627.94it/s]


With the converted sequences, we can use param_CDDM (for sequences that carry phosphorylation status):

In [None]:
cddm = predict_kinase_df(df,'site_seq2',**param_CDDM)

input dataframe has a length 41
Preprocessing
Finish preprocessing
Calculating position: [-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7]


100%|██████████| 289/289 [00:00<00:00, 1716.23it/s]


## Rank kinase for each site

In [None]:
def get_top(r, n):
    "Return the n highest-scoring kinase names of a row as a comma-separated string"
    has_inf = np.isinf(r).any()
    if not has_inf:
        # Highest score first, then keep the labels of the first n entries
        ranked = r.sort_values(ascending=False)
        return ','.join(ranked[:n].index)
    # Rows containing 'inf' are not ranked
    return "unavailable due to Y site"

def get_top_df(df, n=10):
    "Run get_top on every row of the dataframe and return the per-row results"
    return df.apply(lambda row: get_top(row, n), axis=1)

In [None]:
pspa_rnk = get_top_df(pspa)
cddm_rnk = get_top_df(cddm)

In [None]:
df.columns

Index([ 'site_seq', 'gene_site',      'gene',    'source',  'num_site',
        'acceptor',          -7,          -6,          -5,          -4,
                -3,          -2,          -1,           0,           1,
                 2,           3,           4,           5,           6,
                 7,      'site',  'position',       'seq', 'site_seq2'],
      dtype='object')

In [None]:
out = df[['gene_site','site_seq2','site','position']].copy() # get site information

In [None]:
out['pspa'] = pspa_rnk
out['cddm'] = cddm_rnk

In [None]:
out

Unnamed: 0,gene_site,site_seq2,site,position,pspa,cddm
64201,CTNNB1_S23,PDRKAAVsHWQQQsy,S23,23,"SSTK,BRSK1,PRKD3,BRSK2,P70S6K,SNRK,MARK3,MAPKA...","P90RSK,RSK4,MARK1,RSK2,AKT1,TSSK2,SGK1,P70S6K,..."
111457,CTNNB1_S29|CTNB1_S29,VsHWQQQsyLDsGIH,S29,29,"GSK3A,GSK3B,LATS2,MAPKAPK2,CAMK2A,CAMK2B,LATS1...","PAK4,CAMK1D,NIM1,LATS2,PAK5,TBK1,TSSK1,GRK7,NU..."
91015,CTNNB1_Y30|CTNB1_Y30,sHWQQQsyLDsGIHs,Y30,30,"BMPR2_TYR,PTK2,SYK,ERBB4,PDHK1_TYR,EPHA3,PDHK4...","ERBB4,FGFR4,TNK1,JAK3,CSK,KIT,EPHA5,EGFR,JAK2,..."
77044,CTNNB1_S33,QQQsyLDsGIHsGAT,S33,33,"CK1G2,CK1A,CK1G3,GSK3A,GSK3B,GRK3,CK1A2,CK1D,J...","GSK3B,IKKB,IKKA,GSK3A,TBK1,PAK4,GRK1,IKKE,P90R..."
114100,CTNNB1_S37|CTNB1_S37,yLDsGIHsGATtTAP,S37,37,"GSK3A,GSK3B,CK1A,CK1G2,GRK7,IKKA,GRK4,GRK5,IKK...","GSK3A,PAK6,PAK5,GSK3B,TBK1,PAK4,PRKX,IKKB,ULK3..."
29849,CTNNB1_T41,GIHsGATtTAPsLSG,T41,41,"GSK3A,GSK3B,PRP4,PASK,CK1G2,CK1A,GRK7,CK1D,CK1...","MPSK1,GSK3B,GSK3A,ASK1,PBK,MEK2,TNIK,MEKK2,LKB..."
27548,CTNNB1_S45|CTNB1_S45,GATtTAPsLSGKGNP,S45,45,"CK1A,CK1G1,CK1D,CK1E,CK1A2,CK1G2,CK1G3,IKKB,MT...","TBK1,IKKE,PAK6,PKACA,ULK3,CK1A,CK1G2,MTOR,PKAC..."
17342,CTNNB1_S60,EEEDVDTsQVLyEWE,S60,60,"ATM,ACVR2B,ACVR2A,GRK4,GRK7,CK1G2,PLK1,TLK2,PL...","ATM,ATR,GRK7,GRK1,BUB1B,CK2A2,GRK5,DNAPK,CK2A1..."
107937,CTNNB1_Y64,VDTsQVLyEWEQGFS,Y64,64,"SYK,MERTK,EPHA5,SRMS,EPHA4,FER,EPHA3,PTK6,FES,TEK","TEC,BLK,EPHA4,BTK,SRMS,SYK,FES,EPHA7,EPHA6,FYN"
112759,CTNNB1_S73,WEQGFSQsFTQEQVA,S73,73,"CAMK2G,ULK1,PLK1,PLK3,IKKA,CAMK2B,DSTYK,NEK2,I...","BRSK2,IKKB,ULK3,TSSK1,NUAK1,PKACB,PRKX,NDR2,TB..."


## Rank with AlphaMissense

AlphaMissense is a deep learning model developed by DeepMind  (paper [Accurate proteome-wide missense variant effect prediction with AlphaMissense](https://www.science.org/doi/10.1126/science.adg7492)). It incorporates structural context by using an AlphaFold-derived system, and fine-tuning on weak labels from population frequency data, which avoids bias from human-curated annotations. 

According to the paper, the pathogenicity score is highly correlated with the functional significance of the position (e.g., the score is high in regions that interact with other proteins). Therefore, the score can also be interpreted as a functional score for the protein: the higher the score, the more important the region is.

Download the AlphaMissense amino acid substitution data. Optionally, uncomment the code below to convert it to parquet for faster loading.

In [None]:
!wget https://storage.googleapis.com/dm_alphamissense/AlphaMissense_aa_substitutions.tsv.gz

# df = pd.read_csv('AlphaMissense_aa_substitutions.tsv.gz', compression='gzip',  header=3, sep='\t', quotechar='"')

# df.to_parquet('AlphaMissense_aa_substitutions.gzip.parquet', compression='gzip')

--2024-02-25 18:32:33--  https://storage.googleapis.com/dm_alphamissense/AlphaMissense_aa_substitutions.tsv.gz
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.80.27, 142.250.65.219, 142.251.40.187, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.80.27|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1207278510 (1.1G) [application/x-gzip]
Saving to: ‘AlphaMissense_aa_substitutions.tsv.gz’


2024-02-25 18:32:52 (84.2 MB/s) - ‘AlphaMissense_aa_substitutions.tsv.gz’ saved [1207278510/1207278510]



In [None]:
%%time
# NOTE: this rebinds `df`, which held the CTNNB1 site table until here, to the AlphaMissense table;
# re-running earlier cells that use `df` after this point will operate on the wrong dataframe
df = pd.read_csv('AlphaMissense_aa_substitutions.tsv.gz', compression='gzip',  header=3, sep='\t', quotechar='"')

Load using parquet:

In [None]:
# %%time
# df = pd.read_parquet('AlphaMissense_aa_substitutions.gzip.parquet')

Load with datatable (datatable reads much faster than parquet, but its file is roughly 6–7x larger than the parquet file, i.e. 7.9G vs. 1.3G):

In [None]:
# !pip install datatable

# dt.Frame(df).to_jay("AlphaMissense_aa_substitutions.jay")

# DT = dt.fread("AlphaMissense_aa_substitutions.jay")

In [None]:
def query_uniprot(df_or_dt, # dataframe or datatable
                  uniprot_id: str, # uniprot id
                 ):

    """Query data by uniprot_id, extract relevant parts from the 'protein_variant' column, and pivot the data.

    Returns a tuple `(data, pivot_data)`:
    - `data`: the rows whose `uniprot_id` equals the given id, with the added string
      columns `aa1`, `position`, `aa2` (parsed from `protein_variant`) and
      `position_aa` (`aa1` + `position`);
    - `pivot_data`: `am_pathogenicity` pivoted with `aa2` as index and `position_aa`
      as columns, the columns sorted numerically by position.

    Raises ValueError when the input is neither a pandas DataFrame nor a datatable Frame.
    """

    # Check if input is pandas DataFrame
    if isinstance(df_or_dt, pd.DataFrame):
        data = df_or_dt[df_or_dt['uniprot_id'] == uniprot_id].reset_index(drop=True)
    else:
        # datatable is optional: import it only when the input is not a DataFrame,
        # so the function does not depend on `dt`/`f` existing as notebook globals
        try:
            import datatable as dt
        except ImportError:
            dt = None

        # Check if input is datatable Frame
        if dt is not None and isinstance(df_or_dt, dt.Frame):
            data = df_or_dt[dt.f.uniprot_id == uniprot_id, :].to_pandas()
        else:
            raise ValueError("Input must be a pandas DataFrame or a datatable Frame")

    # Extract parts from 'protein_variant' column: non-digit, digits, non-digit (e.g. 'M1A')
    data[['aa1', 'position', 'aa2']] = data['protein_variant'].str.extract(r'(\D)(\d+)(\D)')
    data['position_aa'] = data['aa1'] + data['position']

    # Pivot the data
    pivot_data = data.pivot_table(index='aa2', columns='position_aa', values='am_pathogenicity', aggfunc='first')

    # Ensure columns are sorted numerically by position (labels without a numeric tail go last)
    columns = sorted(pivot_data.columns, key=lambda x: int(x[1:]) if x[1:].isdigit() else float('inf'))
    pivot_data = pivot_data[columns]

    return data, pivot_data

In [None]:
uniprot_id = "P35222" # beta-catenin
data, pivot = query_uniprot(df, uniprot_id) # Only human protein uniprot id is available

In [None]:
# This is a pivot table
pivot.head()

In [None]:
data['position'] = data['position'].astype(int)

In [None]:
# mean AlphaMissense score over all substitutions at each position
ax = data.groupby('position').am_pathogenicity.mean().plot.line(figsize=(20,5))
ax.set(xlabel='position', ylabel='mean am_pathogenicity', title=f'AlphaMissense score along {uniprot_id}');

In [None]:
am = data.groupby('position').agg({'am_pathogenicity':'mean','aa1':'first'}).reset_index()
am

In [None]:
# create the output folder first so the cell also works on a fresh checkout / Colab session
os.makedirs('raw', exist_ok=True)
am.to_csv('raw/am.csv',index=False)

In [None]:
am = pd.read_csv('raw/am.csv')

In [None]:
# get the -7 to +7 mean of AM score surrounding each position
# window=15 rows centered on the current row; min_periods=1 lets the windows at both
# ends of the table average over the rows that exist instead of yielding NaN
# NOTE(review): the window counts rows, not residues — this assumes `am` holds one row per
# consecutive position, sorted by position; confirm before reusing on other tables
am['am_pathogenicity_-7to+7'] = am['am_pathogenicity'].rolling(window=15, min_periods=1, center=True).mean()

In [None]:
am['site'] =am['aa1']+am['position'].astype(str)

Get AM score at the phosphorylation site

In [None]:
# Join on explicit keys instead of "every shared column", so a column added to either
# table later cannot silently change the join; residue+position must both agree
out2 = out.merge(am, on=['site', 'position'])
# an inner join drops sites without an AM score, so compare the row counts
print('before merge:',len(out))
print('after merge:',len(out2))

before merge: 41
after merge: 41


In [None]:
out2.sort_values('am_pathogenicity',ascending=False)

Unnamed: 0,gene_site,site_seq2,site,position,pspa,cddm,am_pathogenicity,aa1,am_pathogenicity_-7to+7
36,CTNNB1_Y654|CTNB1_Y654,RNEGVAtyAAAVLFR,Y654,654,unavailable due to Y site,"EPHA7,EPHA4,TXK,CSK,LTK,BTK,EPHB3,LYN,EPHA1,BLK",0.993016,Y,0.964376
28,CTNNB1_T472,ICALRHLtSRHQEAE,T472,472,"PIM2,PKN1,ROCK1,MRCKB,PKCE,AKT3,LOK,AKT1,SGK1,MST3","AKT1,ROCK1,ROCK2,SGK1,ERK7,MRCKB,PBK,AKT3,MST1,SGK2",0.988368,T,0.881822
35,CTNNB1_T653|CTNB1_T653,sRNEGVAtyAAAVLF,T653,653,"GRK7,PRP4,TGFBR1,ALK4,JNK1,P38G,P38B,ACVR2B,BMPR1B,JNK3","TNIK,PBK,OSR1,TAO2,MINK,MST1,KHS1,MST2,GCK,TAO1",0.986426,T,0.962545
22,CTNNB1_Y333,NIMRTytyEKLLWTT,Y333,333,unavailable due to Y site,"EPHA1,EPHA7,EPHA2,BLK,TNK2,SRMS,EPHA4,LTK,EPHA5,LYN",0.982195,Y,0.827118
14,CTNNB1_Y142,AVVNLINyQDDAELA,Y142,142,unavailable due to Y site,"EPHA4,ATR,EPHA6,EPHA7,EPHB1,EPHA3,EPHB2,EPHA8,FGFR2,EPHA2",0.980268,Y,0.954522
38,CTNNB1_S675|CTNB1_S675,QDyKKRLsVELTSSL,S675,675,"MYLK4,AURA,PKACA,CLK4,SKMLCK,DAPK1,PKG2,DAPK3,MSK1,PAK4","PAK4,PAK6,PAK5,PKACA,PKG2,PRKX,AURB,PAK1,PAK2,PKACB",0.980063,S,0.91069
3,CTNNB1_S33,QQQsyLDsGIHsGAT,S33,33,"CK1G2,CK1A,CK1G3,GSK3A,GSK3B,GRK3,CK1A2,CK1D,JNK1,GRK2","GSK3B,IKKB,IKKA,GSK3A,TBK1,PAK4,GRK1,IKKE,P90RSK,ERK5",0.978753,S,0.74051
18,CTNNB1_T298,VKFLAITtDCLQILA,T298,298,"MEK1,GSK3B,MOS,DAPK2,ALK4,ACVR2B,ACVR2A,MEK2,BRAF,GRK2","PBK,LKB1,TNIK,NEK4,MST2,MINK,CAMKK1,MST1,NEK1,HGK",0.972658,T,0.962921
19,CTNNB1_S311,LAYGNQEsKLIILAS,S311,311,"IRE2,SKMLCK,TSSK1,SSTK,SMMLCK,TLK1,TSSK2,MELK,CAMK1G,RIPK3","TSSK1,NIM1,QIK,TSSK2,MARK2,MARK1,CAMK1D,MARK3,PRKD1,BRSK2",0.9725,S,0.968567
25,CTNNB1_S374,GLHLtDPsQRLVQNC,S374,374,"CK1G3,CK1D,CK1A,CK1E,CK1A2,CK1G2,CK1G1,KIS,TGFBR2,ATM","ATR,ATM,DNAPK,ERK7,NIM1,TSSK2,LATS1,PKCA,DSTYK,NUAK1",0.960511,S,0.747484


In [None]:
out2.sort_values('am_pathogenicity_-7to+7',ascending=False)

Unnamed: 0,gene_site,site_seq2,site,position,pspa,cddm,am_pathogenicity,aa1,am_pathogenicity_-7to+7
19,CTNNB1_S311,LAYGNQEsKLIILAS,S311,311,"IRE2,SKMLCK,TSSK1,SSTK,SMMLCK,TLK1,TSSK2,MELK,CAMK1G,RIPK3","TSSK1,NIM1,QIK,TSSK2,MARK2,MARK1,CAMK1D,MARK3,PRKD1,BRSK2",0.9725,S,0.968567
36,CTNNB1_Y654|CTNB1_Y654,RNEGVAtyAAAVLFR,Y654,654,unavailable due to Y site,"EPHA7,EPHA4,TXK,CSK,LTK,BTK,EPHB3,LYN,EPHA1,BLK",0.993016,Y,0.964376
18,CTNNB1_T298,VKFLAITtDCLQILA,T298,298,"MEK1,GSK3B,MOS,DAPK2,ALK4,ACVR2B,ACVR2A,MEK2,BRAF,GRK2","PBK,LKB1,TNIK,NEK4,MST2,MINK,CAMKK1,MST1,NEK1,HGK",0.972658,T,0.962921
35,CTNNB1_T653|CTNB1_T653,sRNEGVAtyAAAVLF,T653,653,"GRK7,PRP4,TGFBR1,ALK4,JNK1,P38G,P38B,ACVR2B,BMPR1B,JNK3","TNIK,PBK,OSR1,TAO2,MINK,MST1,KHS1,MST2,GCK,TAO1",0.986426,T,0.962545
34,CTNNB1_S646,PLTELLHsRNEGVAt,S646,646,"GRK6,PINK1,NEK9,PLK2,CAMKK1,NEK7,PLK3,GRK5,COT,CK2A2","CK2A2,CK2A1,IKKB,GRK1,GRK7,PAK4,IKKA,ULK3,P90RSK,MAPKAPK3",0.956311,S,0.959204
14,CTNNB1_Y142,AVVNLINyQDDAELA,Y142,142,unavailable due to Y site,"EPHA4,ATR,EPHA6,EPHA7,EPHB1,EPHA3,EPHB2,EPHA8,FGFR2,EPHA2",0.980268,Y,0.954522
26,CTNNB1_T384,LVQNCLWtLRNLSDA,T384,384,"ERK7,TAK1,NEK4,MST1,HPK1,NEK8,CAMKK1,IRAK1,NEK1,MST2","CAMKK1,MINK,TNIK,CAMKK2,MST1,ERK7,NEK4,HGK,LKB1,MST2",0.9454,T,0.935751
37,CTNNB1_Y670,SEDKPQDyKKRLsVE,Y670,670,unavailable due to Y site,"FLT1,SYK,PTK2,KIT,CSF1R,EPHA4,EPHA6,JAK2,EPHA2,FGFR4",0.954805,Y,0.923685
23,CTNNB1_S352,KVLSVCSsNKPAIVE,S352,352,"BUB1,HIPK4,PRKD1,PRKD2,PRKD3,CDK18,KIS,PERK,MLK2,CK1E","PRKD2,PRKD3,HIPK4,PRKD1,MAPKAPK3,TSSK2,CAMK4,MAPKAPK5,CAMK1D,MAPKAPK2",0.839116,S,0.9152
38,CTNNB1_S675|CTNB1_S675,QDyKKRLsVELTSSL,S675,675,"MYLK4,AURA,PKACA,CLK4,SKMLCK,DAPK1,PKG2,DAPK3,MSK1,PAK4","PAK4,PAK6,PAK5,PKACA,PKG2,PRKX,AURB,PAK1,PAK2,PKACB",0.980063,S,0.91069
