# About

This scripts check amount of protein with shared peptides.

In [1]:
import os 
import re
import pandas as pd
import numpy as np

# read in .fasta file and count shared peptides

os.chdir("/home/ptruong/git/dia_sum")
filename = "database/2021-06-07/UP00000625_UP000002311_UP000005640.fasta"

file = open(filename, "r")

protein_list = []
sequence_list = []
for line in file: 
    if line[0] == ">":
        protein = line 
    else:
        sequence = line.rstrip()
        split_sequence = re.split(r"(?<=[KR])", sequence)
        split_sequence = list(dict.fromkeys(split_sequence))
        sequence_list += split_sequence
        protein_list += [protein for i in range(len(split_sequence))]
        



In [2]:
df = pd.DataFrame(np.array([protein_list, sequence_list]).T, columns = ["protein", "sequence"])
df["seq_length"] = df.sequence.str.len()
df = df[df["seq_length"] > 7]
df.drop("seq_length", axis = 1, inplace = True)
df.drop_duplicates(inplace=True)
counted_df = df.groupby("sequence").count().sort_values(by = "protein", ascending = False)


In [3]:
counted_df

Unnamed: 0_level_0,protein
sequence,Unnamed: 1_level_1
PYECNECGK,61
PYECSECGK,60
LSVDYTDIMK,59
PPPMLTSPND,58
EVHTNQDPLDVSASK,58
...,...
GTTFIIFGSLTFLISLSVLIWFPETK,1
GTTFSILSDVHNR,1
GTTFSQAK,1
GTTFTESFIK,1


In [5]:
counted_df[counted_df.protein > 1]


Unnamed: 0_level_0,protein
sequence,Unnamed: 1_level_1
PYECNECGK,61
PYECSECGK,60
LSVDYTDIMK,59
PPPMLTSPND,58
EVHTNQDPLDVSASK,58
...,...
DIVTVGQCR,2
LSTIQNADLIVVFQNGR,2
GGQTIVQLQK,2
MDSTVPSALELPQR,2


In [6]:
counted_df[counted_df.protein == 1]


Unnamed: 0_level_0,protein
sequence,Unnamed: 1_level_1
PMSPITAQMSQLELQQAALEGLALPHDLAVQAANFYQPGFGK,1
PNICQCLPGHGGATCDEEHCNPPCQHGGTCLAGNLCTC,1
PPPVLITLPAEPTLPPDAYSHLQGHLGHFPGPEPLAFPVK,1
PPSMEYCVLLFCCCICGFESTSK,1
PMVLVLLMLTCK,1
...,...
GTTFIIFGSLTFLISLSVLIWFPETK,1
GTTFSILSDVHNR,1
GTTFSQAK,1
GTTFTESFIK,1


In [7]:
# Conclusion

print(f"15167 sequences are shared in > 1 Proteins. ({round(len(counted_df[counted_df.protein > 1])/ len(counted_df), 6)} of proteins. ))")
print(f"682301 sequences has unique proteins.({round(len(counted_df[counted_df.protein == 1])/ len(counted_df), 6)} of proteins.))")

15167 sequences are shared in > 1 Proteins. (0.021746 of proteins. ))
682301 sequences has unique proteins.(0.978254 of proteins.))


In [16]:
df

Unnamed: 0,protein,sequence
2,>sp|P02925|RBSB_ECOLI Ribose import binding pr...,LATLVSAVALSATVSANAMAK
3,>sp|P02925|RBSB_ECOLI Ribose import binding pr...,DTIALVVSTLNNPFFVSLK
7,>sp|P02925|RBSB_ECOLI Ribose import binding pr...,VLDSQNNPAK
8,>sp|P02925|RBSB_ECOLI Ribose import binding pr...,ELANVQDLTVR
10,>sp|P02925|RBSB_ECOLI Ribose import binding pr...,ILLINPTDSDAVGNAVK
...,...,...
1955107,>sp|Q9ULT0|TTC7A_HUMAN Tetratricopeptide repea...,EAGFCIQEAAGLFPTSHSVLYMR
1955112,>sp|Q9ULT0|TTC7A_HUMAN Tetratricopeptide repea...,EALTVNPDGVR
1955114,>sp|Q9ULT0|TTC7A_HUMAN Tetratricopeptide repea...,IMHSLGLMLSR
1955119,>sp|Q9ULT0|TTC7A_HUMAN Tetratricopeptide repea...,QSTCHEAWQGLGEVLQAQGQNEAAVDCFLTAL


In [34]:
seqeunces_in_more_than_one_protein = counted_df[counted_df.protein > 1].index
df_shared_peptides_protein = df[df.sequence.isin(seqeunces_in_more_than_one_protein)]

In [35]:
seqeunces_one_protein = counted_df[counted_df.protein == 1].index
df_non_shared_pep = df[df.sequence.isin(seqeunces_one_protein)]

In [36]:
df_shared_peptides_protein

Unnamed: 0,protein,sequence
83,>sp|P06960|OTC2_ECOLI Ornithine carbamoyltrans...,NIALIFEK
88,>sp|P06960|OTC2_ECOLI Ornithine carbamoyltrans...,VTYLGPSGSQIGHK
92,>sp|P06960|OTC2_ECOLI Ornithine carbamoyltrans...,MYDGIQYR
95,>sp|P06960|OTC2_ECOLI Ornithine carbamoyltrans...,AFNEMTLVYAGDAR
96,>sp|P06960|OTC2_ECOLI Ornithine carbamoyltrans...,NNMGNSMLEAAALT
...,...,...
1954644,>sp|Q9BXS5|AP1M1_HUMAN AP-1 complex subunit mu...,SVELEDVK
1954651,>sp|Q9BXS5|AP1M1_HUMAN AP-1 complex subunit mu...,PLIWIESVIEK
1954670,>sp|Q9BXS5|AP1M1_HUMAN AP-1 complex subunit mu...,SGYQALPWVR
1954903,>sp|Q9NUT2|MITOS_HUMAN Mitochondrial potassium...,GTTLSGGQK


In [37]:
df_non_shared_peptides_protein

Unnamed: 0,protein,sequence
2,>sp|P02925|RBSB_ECOLI Ribose import binding pr...,LATLVSAVALSATVSANAMAK
3,>sp|P02925|RBSB_ECOLI Ribose import binding pr...,DTIALVVSTLNNPFFVSLK
7,>sp|P02925|RBSB_ECOLI Ribose import binding pr...,VLDSQNNPAK
8,>sp|P02925|RBSB_ECOLI Ribose import binding pr...,ELANVQDLTVR
10,>sp|P02925|RBSB_ECOLI Ribose import binding pr...,ILLINPTDSDAVGNAVK
...,...,...
1955107,>sp|Q9ULT0|TTC7A_HUMAN Tetratricopeptide repea...,EAGFCIQEAAGLFPTSHSVLYMR
1955112,>sp|Q9ULT0|TTC7A_HUMAN Tetratricopeptide repea...,EALTVNPDGVR
1955114,>sp|Q9ULT0|TTC7A_HUMAN Tetratricopeptide repea...,IMHSLGLMLSR
1955119,>sp|Q9ULT0|TTC7A_HUMAN Tetratricopeptide repea...,QSTCHEAWQGLGEVLQAQGQNEAAVDCFLTAL


In [38]:
len(df_non_shared_peptides_protein)/len(df)

0.9402423983518565

In [39]:
len(df_shared_peptides_protein)/len(df)

0.05975760164814343

In [44]:
# Conclusion

print(f"{len(counted_df[counted_df.protein > 1])} sequences are shared in > 1 Proteins. ({round(len(counted_df[counted_df.protein > 1])/ len(counted_df), 6)} of proteins. ))")
print(f"{len(counted_df[counted_df.protein == 1])} sequences has unique proteins.({round(len(counted_df[counted_df.protein == 1])/ len(counted_df), 6)} of proteins.))")

15167 sequences are shared in > 1 Proteins. (0.021746 of proteins. ))
682301 sequences has unique proteins.(0.978254 of proteins.))


In [45]:
# Conclusion

print(f"{len(df_shared_peptides_protein)} sequences are shared in > 1 Proteins. ({round(len(df_shared_peptides_protein)/len(df), 6)} of proteins. ))")
print(f"{len(df_non_shared_peptides_protein)} sequences has unique proteins.({round(len(df_non_shared_peptides_protein)/len(df), 6)} of proteins.))")

43364 sequences are shared in > 1 Proteins. (0.059758 of proteins. ))
682301 sequences has unique proteins.(0.940242 of proteins.))


In [None]:
We can remove these 