# About

This scripts check amount of protein with shared peptides.

In [10]:
import os 
import re
import pandas as pd
import numpy as np

# read in .fasta file and count shared peptides

os.chdir("/home/ptruong/git/dia_sum")
filename = "database/2021-06-07/UP00000625_UP000002311_UP000005640.fasta"
#filename = "database/napedro_3mixed_human_yeast_ecoli_20140403_iRT_reverse.fasta"

file = open(filename, "r")

protein_list = []
sequence_list = []
for line in file: 
    if line[0] == ">":
        protein = line 
    else:
        sequence = line.rstrip()
        split_sequence = re.split(r"(?<=[KR])", sequence)
        split_sequence = list(dict.fromkeys(split_sequence))
        sequence_list += split_sequence
        protein_list += [protein for i in range(len(split_sequence))]
        



In [11]:
df = pd.DataFrame(np.array([protein_list, sequence_list]).T, columns = ["protein", "sequence"])



In [12]:
def decoy_map(protein):
    if protein.split("_")[0] == ">reverse":
        return True
    else:
        return False

In [13]:
df["decoy"] = df.protein.map(decoy_map)

In [14]:
df = df[df.decoy == False]
df["seq_length"] = df.sequence.str.len()
df = df[df["seq_length"] > 7]
df.drop("seq_length", axis = 1, inplace = True)
df.drop_duplicates(inplace=True)
counted_df = df.groupby("sequence").count().sort_values(by = "protein", ascending = False)

In [15]:
counted_df

Unnamed: 0_level_0,protein,decoy
sequence,Unnamed: 1_level_1,Unnamed: 2_level_1
PYECNECGK,61,61
PYECSECGK,60,60
LSVDYTDIMK,59,59
PPPMLTSPND,58,58
EVHTNQDPLDVSASK,58,58
...,...,...
GTTFIIFGSLTFLISLSVLIWFPETK,1,1
GTTFSILSDVHNR,1,1
GTTFSQAK,1,1
GTTFTESFIK,1,1


In [16]:
counted_df[counted_df.protein > 1]


Unnamed: 0_level_0,protein,decoy
sequence,Unnamed: 1_level_1,Unnamed: 2_level_1
PYECNECGK,61,61
PYECSECGK,60,60
LSVDYTDIMK,59,59
PPPMLTSPND,58,58
EVHTNQDPLDVSASK,58,58
...,...,...
DIVTVGQCR,2,2
LSTIQNADLIVVFQNGR,2,2
GGQTIVQLQK,2,2
MDSTVPSALELPQR,2,2


In [17]:
counted_df[counted_df.protein == 1]


Unnamed: 0_level_0,protein,decoy
sequence,Unnamed: 1_level_1,Unnamed: 2_level_1
PMSPITAQMSQLELQQAALEGLALPHDLAVQAANFYQPGFGK,1,1
PNICQCLPGHGGATCDEEHCNPPCQHGGTCLAGNLCTC,1,1
PPPVLITLPAEPTLPPDAYSHLQGHLGHFPGPEPLAFPVK,1,1
PPSMEYCVLLFCCCICGFESTSK,1,1
PMVLVLLMLTCK,1,1
...,...,...
GTTFIIFGSLTFLISLSVLIWFPETK,1,1
GTTFSILSDVHNR,1,1
GTTFSQAK,1,1
GTTFTESFIK,1,1


In [18]:
# Conclusion

print(f"15167 sequences are shared in > 1 Proteins. ({round(len(counted_df[counted_df.protein > 1])/ len(counted_df), 6)} of proteins. ))")
print(f"682301 sequences has unique proteins.({round(len(counted_df[counted_df.protein == 1])/ len(counted_df), 6)} of proteins.))")

15167 sequences are shared in > 1 Proteins. (0.021746 of proteins. ))
682301 sequences has unique proteins.(0.978254 of proteins.))


In [19]:
df

Unnamed: 0,protein,sequence,decoy
2,>sp|P02925|RBSB_ECOLI Ribose import binding pr...,LATLVSAVALSATVSANAMAK,False
3,>sp|P02925|RBSB_ECOLI Ribose import binding pr...,DTIALVVSTLNNPFFVSLK,False
7,>sp|P02925|RBSB_ECOLI Ribose import binding pr...,VLDSQNNPAK,False
8,>sp|P02925|RBSB_ECOLI Ribose import binding pr...,ELANVQDLTVR,False
10,>sp|P02925|RBSB_ECOLI Ribose import binding pr...,ILLINPTDSDAVGNAVK,False
...,...,...,...
1955107,>sp|Q9ULT0|TTC7A_HUMAN Tetratricopeptide repea...,EAGFCIQEAAGLFPTSHSVLYMR,False
1955112,>sp|Q9ULT0|TTC7A_HUMAN Tetratricopeptide repea...,EALTVNPDGVR,False
1955114,>sp|Q9ULT0|TTC7A_HUMAN Tetratricopeptide repea...,IMHSLGLMLSR,False
1955119,>sp|Q9ULT0|TTC7A_HUMAN Tetratricopeptide repea...,QSTCHEAWQGLGEVLQAQGQNEAAVDCFLTAL,False


In [20]:
seqeunces_in_more_than_one_protein = counted_df[counted_df > 1].dropna().index
df_shared_peptides_protein = df[df.sequence.isin(seqeunces_in_more_than_one_protein)]

In [21]:
seqeunces_one_protein = counted_df[counted_df.protein == 1].dropna().index
df_non_shared_peptides_protein = df[df.sequence.isin(seqeunces_one_protein)]

In [22]:
seqeunces_in_more_than_one_protein.isin(seqeunces_one_protein).sum()

0

In [23]:
len(df_shared_peptides_protein)/len(df)

0.05975760164814343

In [24]:
# Conclusion

print(f"{len(counted_df[counted_df.protein > 1])} sequences are shared in > 1 Proteins. ({round(len(counted_df[counted_df.protein > 1])/ len(counted_df), 6)} of proteins. ))")
print(f"{len(counted_df[counted_df.protein == 1])} sequences has unique proteins.({round(len(counted_df[counted_df.protein == 1])/ len(counted_df), 6)} of proteins.))")

15167 sequences are shared in > 1 Proteins. (0.021746 of proteins. ))
682301 sequences has unique proteins.(0.978254 of proteins.))


In [25]:
# Conclusion
print(f"{len(df.protein.unique())} proteins in .FASTA")
print(f"{len(df_shared_peptides_protein.protein.unique())} proteins have shared peptides. ({round(len(df_shared_peptides_protein.protein.unique())/len(df.protein.unique()), 6)} of proteins. ))")
#print(f"{len(df_non_shared_peptides_protein)} proteins have unique peptides.({round(len(df_non_shared_peptides_protein.protein.unique()) / len(df.protein.unique()), 6)} of proteins.))")

31010 proteins in .FASTA
7564 proteins have shared peptides. (0.243921 of proteins. ))


In [26]:
df_non_shared_peptides_protein[df_non_shared_peptides_protein.protein.isin(df_shared_peptides_protein.protein)]

Unnamed: 0,protein,sequence,decoy
78,>sp|P06960|OTC2_ECOLI Ornithine carbamoyltrans...,LLDFTPAQFTSLLTLAAQLK,False
87,>sp|P06960|OTC2_ECOLI Ornithine carbamoyltrans...,CSFEVAAFDQGAR,False
93,>sp|P06960|OTC2_ECOLI Ornithine carbamoyltrans...,GHGQEVVETLAQY,False
94,>sp|P06960|OTC2_ECOLI Ornithine carbamoyltrans...,AGVPVWNGLTNEFHPTQLLADLMTMQEHLPGK,False
99,>sp|P06960|OTC2_ECOLI Ornithine carbamoyltrans...,ACWPEESLVAECSALAEK,False
...,...,...,...
1954907,>sp|Q9NUT2|MITOS_HUMAN Mitochondrial potassium...,QPTVLILDEATSALDAESER,False
1954908,>sp|Q9NUT2|MITOS_HUMAN Mitochondrial potassium...,VVQEALDR,False
1954913,>sp|Q9NUT2|MITOS_HUMAN Mitochondrial potassium...,GAHCIVVMADGR,False
1954914,>sp|Q9NUT2|MITOS_HUMAN Mitochondrial potassium...,VWEAGTHEELLK,False


In [27]:
df_shared_peptides_protein

Unnamed: 0,protein,sequence,decoy
83,>sp|P06960|OTC2_ECOLI Ornithine carbamoyltrans...,NIALIFEK,False
88,>sp|P06960|OTC2_ECOLI Ornithine carbamoyltrans...,VTYLGPSGSQIGHK,False
92,>sp|P06960|OTC2_ECOLI Ornithine carbamoyltrans...,MYDGIQYR,False
95,>sp|P06960|OTC2_ECOLI Ornithine carbamoyltrans...,AFNEMTLVYAGDAR,False
96,>sp|P06960|OTC2_ECOLI Ornithine carbamoyltrans...,NNMGNSMLEAAALT,False
...,...,...,...
1954644,>sp|Q9BXS5|AP1M1_HUMAN AP-1 complex subunit mu...,SVELEDVK,False
1954651,>sp|Q9BXS5|AP1M1_HUMAN AP-1 complex subunit mu...,PLIWIESVIEK,False
1954670,>sp|Q9BXS5|AP1M1_HUMAN AP-1 complex subunit mu...,SGYQALPWVR,False
1954903,>sp|Q9NUT2|MITOS_HUMAN Mitochondrial potassium...,GTTLSGGQK,False


In [35]:
df_shared_peptides_protein.protein.unique()[0]

'>sp|P06960|OTC2_ECOLI Ornithine carbamoyltransferase subunit F OS=Escherichia coli (strain K12) OX=83333 GN=argF PE=1 SV=4\n'

In [38]:
shared = df_shared_peptides_protein 

In [41]:
shared[shared.sequence == "NIALIFEK"]

Unnamed: 0,protein,sequence,decoy
83,>sp|P06960|OTC2_ECOLI Ornithine carbamoyltrans...,NIALIFEK,False
88088,>sp|P04391|OTC1_ECOLI Ornithine carbamoyltrans...,NIALIFEK,False


In [47]:
prot = shared[shared.sequence == "NIALIFEK"].iloc[0,:].protein

In [48]:
prot

'>sp|P06960|OTC2_ECOLI Ornithine carbamoyltransferase subunit F OS=Escherichia coli (strain K12) OX=83333 GN=argF PE=1 SV=4\n'

In [49]:
df[df.protein == prot]

Unnamed: 0,protein,sequence,decoy
78,>sp|P06960|OTC2_ECOLI Ornithine carbamoyltrans...,LLDFTPAQFTSLLTLAAQLK,False
83,>sp|P06960|OTC2_ECOLI Ornithine carbamoyltrans...,NIALIFEK,False
87,>sp|P06960|OTC2_ECOLI Ornithine carbamoyltrans...,CSFEVAAFDQGAR,False
88,>sp|P06960|OTC2_ECOLI Ornithine carbamoyltrans...,VTYLGPSGSQIGHK,False
92,>sp|P06960|OTC2_ECOLI Ornithine carbamoyltrans...,MYDGIQYR,False
93,>sp|P06960|OTC2_ECOLI Ornithine carbamoyltrans...,GHGQEVVETLAQY,False
94,>sp|P06960|OTC2_ECOLI Ornithine carbamoyltrans...,AGVPVWNGLTNEFHPTQLLADLMTMQEHLPGK,False
95,>sp|P06960|OTC2_ECOLI Ornithine carbamoyltrans...,AFNEMTLVYAGDAR,False
96,>sp|P06960|OTC2_ECOLI Ornithine carbamoyltrans...,NNMGNSMLEAAALT,False
99,>sp|P06960|OTC2_ECOLI Ornithine carbamoyltrans...,ACWPEESLVAECSALAEK,False


In [54]:
df[df.protein == prot].sequence

78                 LLDFTPAQFTSLLTLAAQLK
83                             NIALIFEK
87                        CSFEVAAFDQGAR
88                       VTYLGPSGSQIGHK
92                             MYDGIQYR
93                        GHGQEVVETLAQY
94     AGVPVWNGLTNEFHPTQLLADLMTMQEHLPGK
95                       AFNEMTLVYAGDAR
96                       NNMGNSMLEAAALT
99                   ACWPEESLVAECSALAEK
101                        ITLTEDVAAGVK
102                    GADFIYTDVWVSMGEA
107                  GYQVNAQMMALTDNPNVK
108                   FLHCLPAFHDDQTTLGK
110                           EFDLHGGME
111                VTDEVFESAASIVFDQAENR
113                           AVMMATLGE
Name: sequence, dtype: object