# Build human phosphoproteome dataset with phosphorylated status

## Setup

In [1]:
import pandas as pd
from matplotlib import pyplot as plt
from katlas.core import *
import seaborn as sns
from tqdm import tqdm
import numpy as np

tqdm.pandas()

## Combine PhosphoSitePlus and Ochoa et al.

In [20]:
# Load the two phosphosite tables through the katlas `Data` helper:
# the Ochoa et al. site table and the human PhosphoSitePlus (PSP) site table.
ochoa = Data.get_ochoa_site()
psp = Data.get_psp_human_site()

In [6]:
ochoa

Unnamed: 0,uniprot,position,residue,is_disopred,disopred_score,log10_hotspot_pval_min,isHotspot,uniprot_position,functional_score,current_uniprot,name,gene,Sequence,is_valid,site_seq,gene_site
0,A0A075B6Q4,24,S,True,0.91,6.839384,True,A0A075B6Q4_24,0.149257,A0A075B6Q4,A0A075B6Q4_HUMAN,,MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT...,True,VDDEKGDSNDDYDSA,A0A075B6Q4_S24
1,A0A075B6Q4,35,S,True,0.87,9.192622,False,A0A075B6Q4_35,0.136966,A0A075B6Q4,A0A075B6Q4_HUMAN,,MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT...,True,YDSAGLLSDEDCMSV,A0A075B6Q4_S35
2,A0A075B6Q4,57,S,False,0.28,0.818834,False,A0A075B6Q4_57,0.125364,A0A075B6Q4,A0A075B6Q4_HUMAN,,MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT...,True,IADHLFWSEETKSRF,A0A075B6Q4_S57
3,A0A075B6Q4,68,S,False,0.03,0.375986,False,A0A075B6Q4_68,0.119811,A0A075B6Q4,A0A075B6Q4_HUMAN,,MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT...,True,KSRFTEYSMTSSVMR,A0A075B6Q4_S68
4,A0A075B6Q4,71,S,False,0.05,0.000000,False,A0A075B6Q4_71,0.095193,A0A075B6Q4,A0A075B6Q4_HUMAN,,MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT...,True,FTEYSMTSSVMRRNE,A0A075B6Q4_S71
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112276,V9GYY5,127,S,True,0.97,3.193174,False,V9GYY5_127,0.292446,V9GYY5,V9GYY5_HUMAN,,KRDGDDRRPRLVLSFDEEKRREYLTGFHKRKVERKKAAIEEIKQRL...,True,EGGAGDRSEEEASST,V9GYY5_S127
112277,V9GYY5,132,S,True,0.93,2.055830,False,V9GYY5_132,0.219329,V9GYY5,V9GYY5_HUMAN,,KRDGDDRRPRLVLSFDEEKRREYLTGFHKRKVERKKAAIEEIKQRL...,True,DRSEEEASSTEKPTK,V9GYY5_S132
112278,V9GYY5,133,S,True,0.89,2.055830,False,V9GYY5_133,0.202808,V9GYY5,V9GYY5_HUMAN,,KRDGDDRRPRLVLSFDEEKRREYLTGFHKRKVERKKAAIEEIKQRL...,True,RSEEEASSTEKPTKA,V9GYY5_S133
112279,V9GYY5,134,T,True,0.83,2.055830,False,V9GYY5_134,0.187417,V9GYY5,V9GYY5_HUMAN,,KRDGDDRRPRLVLSFDEEKRREYLTGFHKRKVERKKAAIEEIKQRL...,True,SEEEASSTEKPTKAL,V9GYY5_T134


In [7]:
# ochoa['gene'] = ochoa.gene.fillna(ochoa.current_uniprot)

# ochoa['gene_site'] = ochoa.gene + '_' + ochoa.residue+ochoa.position.astype(str)

# ochoa.to_parquet('ochoa_site.parquet')

PSP labels sites by gene name, whereas Ochoa labels them by protein name.

## Process PSP

In [21]:
# Keep only sites whose acceptor residue (first character of `site`) is S, T or Y
sty_mask = psp['site'].str[0].isin(['S', 'T', 'Y'])
psp = psp[sty_mask]

In [22]:
# Drop sites with no LT_LIT value and renumber the rows
# (LT_LIT is presumably the low-throughput literature count — confirm against the PSP schema)
psp = psp.dropna(subset=['LT_LIT']).reset_index(drop=True)

In [23]:
# Only the UniProt accession and the site label are needed for the merge
# ('protein' and 'gene' were considered but are not kept)
psp = psp.loc[:, ['uniprot', 'site']].copy()

## Process Ochoa

In [24]:
# Build a PSP-style site label such as 'S24' from the residue letter and the position
ochoa['site'] = ochoa['residue'].str.cat(ochoa['position'].astype(str))

In [25]:
# Reduce Ochoa to the same two key columns as PSP
ochoa = ochoa.loc[:, ['uniprot', 'site']].copy()

## Merge the two

In [26]:
# Tag each table with its origin before merging so provenance survives the join
psp['source'] = 'psp'
ochoa['source'] = 'ochoa'

In [29]:
# Outer join keeps sites reported by either dataset; the per-table `source` columns
# come out as `source_1` (PSP) and `source_2` (Ochoa), NaN where a side is missing
comb = pd.merge(psp, ochoa, on=['uniprot', 'site'], how='outer', suffixes=('_1', '_2'))

In [30]:
# One provenance label per site: 'psp', 'ochoa', or 'psp|ochoa' when both report it.
# After the outer merge a missing side is NaN: treat it as an empty string, join with '|',
# then strip the dangling separator. Vectorized string ops replace a Python call per row.
comb['source'] = (
    comb['source_1'].fillna('')
    .str.cat(comb['source_2'].fillna(''), sep='|')
    .str.strip('|')
)

In [31]:
# The per-table source columns are now folded into `source`
comb = comb.drop(['source_1', 'source_2'], axis=1)

In [32]:
comb.source.value_counts()

source
ochoa        106327
psp            9138
psp|ochoa      5954
Name: count, dtype: int64

## Query sequences from UniProt and map them to the sites

Uncomment the cell below to export the unique UniProt IDs to a CSV for UniProt ID mapping.

In [102]:
# comb.drop_duplicates(subset='uniprot').to_csv('uniprot.csv',index=False)

In [103]:
# unmapped = pd.Series(['AAC50053',
# 'P18433-2',
# 'AAA58698',
# 'NP_001184222',
# 'AAA60149'])

In [34]:
# Load the UniProt ID-mapping export and keep one row per accession.
# A few accessions occur more than once (historical UniProt entries); the first row wins.
sequence = (
    pd.read_excel('idmapping_2024_06_17.xlsx')
    .drop_duplicates(subset='uniprot')
)

In [105]:
# Accession -> protein sequence lookup table
seq = sequence.loc[:, ['uniprot', 'sequence']].copy()

In [106]:
# Attach the protein sequence to every site; the inner join drops sites whose accession
# has no entry in the mapping table. `validate` asserts that each accession occurs once
# in `seq`, so a duplicated accession fails loudly instead of silently multiplying site rows.
comb = comb.merge(seq, how='inner', on='uniprot', validate='many_to_one')

In [107]:
comb.shape

(121342, 4)

## Validate position

In [108]:
# Numeric, 1-based position: everything after the leading residue letter of the site label
comb['position'] = comb['site'].str.slice(start=1).astype(int)

In [109]:
# Acceptor residue letter: first character of the site label
comb['acceptor'] = comb['site'].str.get(0)

In [110]:
def validate_position(row):
    """Check that a site's acceptor residue is really found at its position.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'acceptor' (one residue letter), 'position' (1-based
        index into the sequence) and 'sequence' (the protein sequence).

    Returns
    -------
    int
        1 if `sequence[position - 1]` equals the acceptor, otherwise 0.
        0 is also returned when the sequence is missing or the position
        lies outside the sequence.
    """
    residue = row['acceptor']
    position = int(row['position'])
    sequence = row['sequence']

    # A missing sequence (e.g. NaN) cannot be validated
    if not isinstance(sequence, str):
        return 0

    # Explicit bounds check: a position of 0 or below would otherwise wrap around
    # through Python's negative indexing and could match a residue by accident
    if not 1 <= position <= len(sequence):
        return 0

    return 1 if sequence[position - 1] == residue else 0

In [111]:
# Flag each site: 1 when the sequence carries the acceptor residue at the stated position, else 0
comb = comb.assign(is_valid=comb.apply(validate_position, axis=1))

In [112]:
comb.is_valid.value_counts()

is_valid
1    120174
0      1168
Name: count, dtype: int64

In [113]:
# Discard sites whose residue/position does not agree with the mapped sequence
comb = comb.loc[comb['is_valid'].eq(1)]

In [114]:
comb.source.value_counts()

source
ochoa        105845
psp            8382
psp|ochoa      5947
Name: count, dtype: int64

## Phosphorylate sequence

In [115]:
# Collect, per protein, the distinct site labels that will be marked on its sequence
modify = comb.groupby('uniprot')['site'].unique().reset_index()

In [116]:
# Bring each protein's sequence next to its site list (joined on the shared column(s),
# i.e. `uniprot`). Both tables hold one row per accession; `validate` enforces that so
# a duplicate cannot silently duplicate proteins.
modify = modify.merge(seq, validate='one_to_one')

In [117]:
def phosphorylate_seq(row):
    """Return `row['sequence']` with every residue listed in `row['site']` lower-cased.

    Each entry of `row['site']` is a label such as 'S23': one residue letter
    followed by a 1-based position. Lower case marks the listed sites.
    """
    residues = list(row['sequence'])

    # Strip the leading residue letter and shift to Python's 0-based indexing
    zero_based = (int(label[1:]) - 1 for label in row['site'])

    for idx in zero_based:
        residues[idx] = residues[idx].lower()

    return ''.join(residues)

In [118]:
# One phospho-annotated sequence per protein (listed sites in lower case)
modify = modify.assign(phospho_seq=modify.apply(phosphorylate_seq, axis=1))

In [119]:
# Accession -> phospho-annotated sequence lookup table
seq2 = modify.loc[:, ['uniprot', 'phospho_seq']]

## Extract sequence

In [120]:
# Attach the phospho-annotated sequence to every site (joined on the shared column(s),
# i.e. `uniprot`). `validate` asserts one annotated sequence per accession so the
# site count cannot grow through the join.
comb = comb.merge(seq2, validate='many_to_one')

In [121]:
# Extract the local sequence window around each site from the phospho-annotated sequence.
# NOTE(review): `extract_site_seq` comes from katlas; judging by the output shown in the
# Test section it returns a 15-residue window padded with '_' at the termini — confirm in katlas.
site_seq = extract_site_seq(comb,'phospho_seq','position')

100%|██████████| 120174/120174 [00:17<00:00, 6912.90it/s]


In [122]:
# Store the extracted windows alongside their sites
# (assumes `site_seq` is in the same row order as `comb` — TODO confirm)
comb['site_seq'] = site_seq

## Add gene, protein information

In [123]:
# Per-accession annotation table: UniProt entry name and gene field
info = sequence.loc[:, ['uniprot', 'uniprot_name', 'gene']].copy()

In [124]:
# Protein mnemonic: the part of the entry name before the first '_' (e.g. 'CTNB1' from 'CTNB1_HUMAN')
info['protein'] = info['uniprot_name'].str.split('_').str.get(0)

In [125]:
# Fall back to the protein mnemonic where no gene name is available
info['gene'] = info['gene'].where(info['gene'].notna(), info['protein'])

In [126]:
# The gene field can hold several space-separated names; keep only the first one
info['gene'] = info['gene'].str.split(' ').str.get(0)

In [127]:
# Attach entry name, gene and protein mnemonic to every site (joined on the shared
# column(s), i.e. `uniprot`). `validate` asserts one annotation row per accession so
# the site count cannot grow through the join.
comb = comb.merge(info, validate='many_to_one')

In [128]:
# Human-readable site identifier such as 'CTNNB1_S45'
comb['gene_site'] = comb['gene'].str.cat(comb['site'], sep='_')

In [129]:
comb.source.value_counts()

source
ochoa        105845
psp            8382
psp|ochoa      5947
Name: count, dtype: int64

In [130]:
# Final column order of the saved table; columns not listed here are dropped by the reorder
reorder_col = [
    'uniprot',
    'source',
    'gene_site',
    'site',
    'position',
    'acceptor',
    'site_seq',
    'uniprot_name',
    'gene',
    'protein',
    'phospho_seq',
]

In [131]:
# Keep only the final columns, in the chosen order
comb = comb.loc[:, reorder_col]

## Test

In [132]:
query_gene(comb,'CTNNB1')

Unnamed: 0,uniprot,source,gene_site,site,position,acceptor,site_seq,uniprot_name,gene,protein,phospho_seq
10801,P35222,ochoa,CTNNB1_T3,T3,3,T,_____MAtQADLMEL,CTNB1_HUMAN,CTNNB1,CTNB1,MAtQADLMELDMAMEPDRKAAVsHWQQQsyLDsGIHsGATtTAPsLsGKGNPEEEDVDTsQVLyEWEQGFSQsFTQEQVADIDGQyAMTRAQRVRAAMFPEtLDEGMQIPStQFDAAHPtNVQRLAEPSQMLKHAVVNLINyQDDAELATRAIPELTKLLNDEDQVVVNKAAVMVHQLSKKEAsRHAIMRsPQMVSAIVRTMQNTNDVETARCTAGTLHNLSHHREGLLAIFKSGGIPALVKMLGsPVDSVLFYAITTLHNLLLHQEGAKMAVRLAGGLQKMVALLNKTNVKFLAITtDCLQILAYGNQEsKLIILASGGPQALVNIMRTytyEKLLWTTSRVLKVLSVCSsNKPAIVEAGGMQALGLHLtDPsQRLVQNCLWtLRNLSDAAtKQEGMEGLLGTLVQLLGSDDINVVTCAAGILSNLTCNNYKNKMMVCQVGGIEALVRTVLRAGDREDITEPAICALRHLtSRHQEAEMAQNAVRLHyGLPVVVKLLHPPSHWPLIKATVGLIRNLALCPANHAPLREQGAIPRLVQLLVRAHQDtQRRtsMGGtQQQFVEGVRMEEIVEGCTGALHILARDVHNRIVIRGLNTI...
10760,P35222,psp,CTNNB1_S23,S23,23,S,PDRKAAVsHWQQQsy,CTNB1_HUMAN,CTNNB1,CTNB1,MAtQADLMELDMAMEPDRKAAVsHWQQQsyLDsGIHsGATtTAPsLsGKGNPEEEDVDTsQVLyEWEQGFSQsFTQEQVADIDGQyAMTRAQRVRAAMFPEtLDEGMQIPStQFDAAHPtNVQRLAEPSQMLKHAVVNLINyQDDAELATRAIPELTKLLNDEDQVVVNKAAVMVHQLSKKEAsRHAIMRsPQMVSAIVRTMQNTNDVETARCTAGTLHNLSHHREGLLAIFKSGGIPALVKMLGsPVDSVLFYAITTLHNLLLHQEGAKMAVRLAGGLQKMVALLNKTNVKFLAITtDCLQILAYGNQEsKLIILASGGPQALVNIMRTytyEKLLWTTSRVLKVLSVCSsNKPAIVEAGGMQALGLHLtDPsQRLVQNCLWtLRNLSDAAtKQEGMEGLLGTLVQLLGSDDINVVTCAAGILSNLTCNNYKNKMMVCQVGGIEALVRTVLRAGDREDITEPAICALRHLtSRHQEAEMAQNAVRLHyGLPVVVKLLHPPSHWPLIKATVGLIRNLALCPANHAPLREQGAIPRLVQLLVRAHQDtQRRtsMGGtQQQFVEGVRMEEIVEGCTGALHILARDVHNRIVIRGLNTI...
10761,P35222,psp|ochoa,CTNNB1_S29,S29,29,S,VsHWQQQsyLDsGIH,CTNB1_HUMAN,CTNNB1,CTNB1,MAtQADLMELDMAMEPDRKAAVsHWQQQsyLDsGIHsGATtTAPsLsGKGNPEEEDVDTsQVLyEWEQGFSQsFTQEQVADIDGQyAMTRAQRVRAAMFPEtLDEGMQIPStQFDAAHPtNVQRLAEPSQMLKHAVVNLINyQDDAELATRAIPELTKLLNDEDQVVVNKAAVMVHQLSKKEAsRHAIMRsPQMVSAIVRTMQNTNDVETARCTAGTLHNLSHHREGLLAIFKSGGIPALVKMLGsPVDSVLFYAITTLHNLLLHQEGAKMAVRLAGGLQKMVALLNKTNVKFLAITtDCLQILAYGNQEsKLIILASGGPQALVNIMRTytyEKLLWTTSRVLKVLSVCSsNKPAIVEAGGMQALGLHLtDPsQRLVQNCLWtLRNLSDAAtKQEGMEGLLGTLVQLLGSDDINVVTCAAGILSNLTCNNYKNKMMVCQVGGIEALVRTVLRAGDREDITEPAICALRHLtSRHQEAEMAQNAVRLHyGLPVVVKLLHPPSHWPLIKATVGLIRNLALCPANHAPLREQGAIPRLVQLLVRAHQDtQRRtsMGGtQQQFVEGVRMEEIVEGCTGALHILARDVHNRIVIRGLNTI...
10762,P35222,psp|ochoa,CTNNB1_Y30,Y30,30,Y,sHWQQQsyLDsGIHs,CTNB1_HUMAN,CTNNB1,CTNB1,MAtQADLMELDMAMEPDRKAAVsHWQQQsyLDsGIHsGATtTAPsLsGKGNPEEEDVDTsQVLyEWEQGFSQsFTQEQVADIDGQyAMTRAQRVRAAMFPEtLDEGMQIPStQFDAAHPtNVQRLAEPSQMLKHAVVNLINyQDDAELATRAIPELTKLLNDEDQVVVNKAAVMVHQLSKKEAsRHAIMRsPQMVSAIVRTMQNTNDVETARCTAGTLHNLSHHREGLLAIFKSGGIPALVKMLGsPVDSVLFYAITTLHNLLLHQEGAKMAVRLAGGLQKMVALLNKTNVKFLAITtDCLQILAYGNQEsKLIILASGGPQALVNIMRTytyEKLLWTTSRVLKVLSVCSsNKPAIVEAGGMQALGLHLtDPsQRLVQNCLWtLRNLSDAAtKQEGMEGLLGTLVQLLGSDDINVVTCAAGILSNLTCNNYKNKMMVCQVGGIEALVRTVLRAGDREDITEPAICALRHLtSRHQEAEMAQNAVRLHyGLPVVVKLLHPPSHWPLIKATVGLIRNLALCPANHAPLREQGAIPRLVQLLVRAHQDtQRRtsMGGtQQQFVEGVRMEEIVEGCTGALHILARDVHNRIVIRGLNTI...
10763,P35222,psp,CTNNB1_S33,S33,33,S,QQQsyLDsGIHsGAT,CTNB1_HUMAN,CTNNB1,CTNB1,MAtQADLMELDMAMEPDRKAAVsHWQQQsyLDsGIHsGATtTAPsLsGKGNPEEEDVDTsQVLyEWEQGFSQsFTQEQVADIDGQyAMTRAQRVRAAMFPEtLDEGMQIPStQFDAAHPtNVQRLAEPSQMLKHAVVNLINyQDDAELATRAIPELTKLLNDEDQVVVNKAAVMVHQLSKKEAsRHAIMRsPQMVSAIVRTMQNTNDVETARCTAGTLHNLSHHREGLLAIFKSGGIPALVKMLGsPVDSVLFYAITTLHNLLLHQEGAKMAVRLAGGLQKMVALLNKTNVKFLAITtDCLQILAYGNQEsKLIILASGGPQALVNIMRTytyEKLLWTTSRVLKVLSVCSsNKPAIVEAGGMQALGLHLtDPsQRLVQNCLWtLRNLSDAAtKQEGMEGLLGTLVQLLGSDDINVVTCAAGILSNLTCNNYKNKMMVCQVGGIEALVRTVLRAGDREDITEPAICALRHLtSRHQEAEMAQNAVRLHyGLPVVVKLLHPPSHWPLIKATVGLIRNLALCPANHAPLREQGAIPRLVQLLVRAHQDtQRRtsMGGtQQQFVEGVRMEEIVEGCTGALHILARDVHNRIVIRGLNTI...
10764,P35222,psp|ochoa,CTNNB1_S37,S37,37,S,yLDsGIHsGATtTAP,CTNB1_HUMAN,CTNNB1,CTNB1,MAtQADLMELDMAMEPDRKAAVsHWQQQsyLDsGIHsGATtTAPsLsGKGNPEEEDVDTsQVLyEWEQGFSQsFTQEQVADIDGQyAMTRAQRVRAAMFPEtLDEGMQIPStQFDAAHPtNVQRLAEPSQMLKHAVVNLINyQDDAELATRAIPELTKLLNDEDQVVVNKAAVMVHQLSKKEAsRHAIMRsPQMVSAIVRTMQNTNDVETARCTAGTLHNLSHHREGLLAIFKSGGIPALVKMLGsPVDSVLFYAITTLHNLLLHQEGAKMAVRLAGGLQKMVALLNKTNVKFLAITtDCLQILAYGNQEsKLIILASGGPQALVNIMRTytyEKLLWTTSRVLKVLSVCSsNKPAIVEAGGMQALGLHLtDPsQRLVQNCLWtLRNLSDAAtKQEGMEGLLGTLVQLLGSDDINVVTCAAGILSNLTCNNYKNKMMVCQVGGIEALVRTVLRAGDREDITEPAICALRHLtSRHQEAEMAQNAVRLHyGLPVVVKLLHPPSHWPLIKATVGLIRNLALCPANHAPLREQGAIPRLVQLLVRAHQDtQRRtsMGGtQQQFVEGVRMEEIVEGCTGALHILARDVHNRIVIRGLNTI...
10765,P35222,psp,CTNNB1_T41,T41,41,T,GIHsGATtTAPsLsG,CTNB1_HUMAN,CTNNB1,CTNB1,MAtQADLMELDMAMEPDRKAAVsHWQQQsyLDsGIHsGATtTAPsLsGKGNPEEEDVDTsQVLyEWEQGFSQsFTQEQVADIDGQyAMTRAQRVRAAMFPEtLDEGMQIPStQFDAAHPtNVQRLAEPSQMLKHAVVNLINyQDDAELATRAIPELTKLLNDEDQVVVNKAAVMVHQLSKKEAsRHAIMRsPQMVSAIVRTMQNTNDVETARCTAGTLHNLSHHREGLLAIFKSGGIPALVKMLGsPVDSVLFYAITTLHNLLLHQEGAKMAVRLAGGLQKMVALLNKTNVKFLAITtDCLQILAYGNQEsKLIILASGGPQALVNIMRTytyEKLLWTTSRVLKVLSVCSsNKPAIVEAGGMQALGLHLtDPsQRLVQNCLWtLRNLSDAAtKQEGMEGLLGTLVQLLGSDDINVVTCAAGILSNLTCNNYKNKMMVCQVGGIEALVRTVLRAGDREDITEPAICALRHLtSRHQEAEMAQNAVRLHyGLPVVVKLLHPPSHWPLIKATVGLIRNLALCPANHAPLREQGAIPRLVQLLVRAHQDtQRRtsMGGtQQQFVEGVRMEEIVEGCTGALHILARDVHNRIVIRGLNTI...
10766,P35222,psp|ochoa,CTNNB1_S45,S45,45,S,GATtTAPsLsGKGNP,CTNB1_HUMAN,CTNNB1,CTNB1,MAtQADLMELDMAMEPDRKAAVsHWQQQsyLDsGIHsGATtTAPsLsGKGNPEEEDVDTsQVLyEWEQGFSQsFTQEQVADIDGQyAMTRAQRVRAAMFPEtLDEGMQIPStQFDAAHPtNVQRLAEPSQMLKHAVVNLINyQDDAELATRAIPELTKLLNDEDQVVVNKAAVMVHQLSKKEAsRHAIMRsPQMVSAIVRTMQNTNDVETARCTAGTLHNLSHHREGLLAIFKSGGIPALVKMLGsPVDSVLFYAITTLHNLLLHQEGAKMAVRLAGGLQKMVALLNKTNVKFLAITtDCLQILAYGNQEsKLIILASGGPQALVNIMRTytyEKLLWTTSRVLKVLSVCSsNKPAIVEAGGMQALGLHLtDPsQRLVQNCLWtLRNLSDAAtKQEGMEGLLGTLVQLLGSDDINVVTCAAGILSNLTCNNYKNKMMVCQVGGIEALVRTVLRAGDREDITEPAICALRHLtSRHQEAEMAQNAVRLHyGLPVVVKLLHPPSHWPLIKATVGLIRNLALCPANHAPLREQGAIPRLVQLLVRAHQDtQRRtsMGGtQQQFVEGVRMEEIVEGCTGALHILARDVHNRIVIRGLNTI...
10802,P35222,ochoa,CTNNB1_S47,S47,47,S,TtTAPsLsGKGNPEE,CTNB1_HUMAN,CTNNB1,CTNB1,MAtQADLMELDMAMEPDRKAAVsHWQQQsyLDsGIHsGATtTAPsLsGKGNPEEEDVDTsQVLyEWEQGFSQsFTQEQVADIDGQyAMTRAQRVRAAMFPEtLDEGMQIPStQFDAAHPtNVQRLAEPSQMLKHAVVNLINyQDDAELATRAIPELTKLLNDEDQVVVNKAAVMVHQLSKKEAsRHAIMRsPQMVSAIVRTMQNTNDVETARCTAGTLHNLSHHREGLLAIFKSGGIPALVKMLGsPVDSVLFYAITTLHNLLLHQEGAKMAVRLAGGLQKMVALLNKTNVKFLAITtDCLQILAYGNQEsKLIILASGGPQALVNIMRTytyEKLLWTTSRVLKVLSVCSsNKPAIVEAGGMQALGLHLtDPsQRLVQNCLWtLRNLSDAAtKQEGMEGLLGTLVQLLGSDDINVVTCAAGILSNLTCNNYKNKMMVCQVGGIEALVRTVLRAGDREDITEPAICALRHLtSRHQEAEMAQNAVRLHyGLPVVVKLLHPPSHWPLIKATVGLIRNLALCPANHAPLREQGAIPRLVQLLVRAHQDtQRRtsMGGtQQQFVEGVRMEEIVEGCTGALHILARDVHNRIVIRGLNTI...
10767,P35222,psp,CTNNB1_S60,S60,60,S,EEEDVDTsQVLyEWE,CTNB1_HUMAN,CTNNB1,CTNB1,MAtQADLMELDMAMEPDRKAAVsHWQQQsyLDsGIHsGATtTAPsLsGKGNPEEEDVDTsQVLyEWEQGFSQsFTQEQVADIDGQyAMTRAQRVRAAMFPEtLDEGMQIPStQFDAAHPtNVQRLAEPSQMLKHAVVNLINyQDDAELATRAIPELTKLLNDEDQVVVNKAAVMVHQLSKKEAsRHAIMRsPQMVSAIVRTMQNTNDVETARCTAGTLHNLSHHREGLLAIFKSGGIPALVKMLGsPVDSVLFYAITTLHNLLLHQEGAKMAVRLAGGLQKMVALLNKTNVKFLAITtDCLQILAYGNQEsKLIILASGGPQALVNIMRTytyEKLLWTTSRVLKVLSVCSsNKPAIVEAGGMQALGLHLtDPsQRLVQNCLWtLRNLSDAAtKQEGMEGLLGTLVQLLGSDDINVVTCAAGILSNLTCNNYKNKMMVCQVGGIEALVRTVLRAGDREDITEPAICALRHLtSRHQEAEMAQNAVRLHyGLPVVVKLLHPPSHWPLIKATVGLIRNLALCPANHAPLREQGAIPRLVQLLVRAHQDtQRRtsMGGtQQQFVEGVRMEEIVEGCTGALHILARDVHNRIVIRGLNTI...


## Save

In [133]:
# The saved `sequence` column is the phospho-annotated sequence (listed sites in lower case)
comb = comb.rename({'phospho_seq': 'sequence'}, axis='columns')

In [134]:
# Persist the combined PSP + Ochoa site table, one row per validated site
comb.to_parquet('ochoa_psp_combine_site_phospho.parquet')

In [135]:
comb.shape

(120174, 11)