# About 
This scripts removes as much shared peptides as possible with trivial methods.

Algorithm:
1. Read in the database
2. Loop through lines of database
3. Protein <- line starting with ">"
4. sequence <- other lines.
5. Split sequence at "K" and "R".
6. Create a list with the split sequences.
7. Create an equally sized list with the protein name for each split sequence. 
8. Create a DataFrame with all the proteins and splitted sequences.
9. If we want to replace I/L with L then set "modify_IL" to True.
10. Create a column with mapped decoys.
11. Create a column with mapped species.
12. Create a column with mapped "sequence length".
13. Keep only rows with "sequence length" > 7.
14. Drop duplicates.
15. Create a dataframe for number of protein per sequence by using - Groupby sequence (These are the splitted sequences still).
16. Count proteins for each sequence. 
17. Filter count dataframe to contain only sequences with count == 1.
18. Take only unique protein.
19. Make a list of the unique proteins. 
20. This protein contains the final result that we keep.
21. Read in the orignal db again and keep only proteins that exist in list created in 19).


In [2]:

import os 
import re
import pandas as pd
import numpy as np

In [3]:
# read in .fasta file and count shared peptides

os.chdir("/home/ptruong/git/dia_sum")

#human_fasta = "2021-06-15-decoys-reviewed-contam-UP000000625.fas"
#yeast_fasta = "2021-06-15-decoys-reviewed-contam-UP000002311"
#ecoli_fasta = "2021-06-15-decoys-reviewed-contam-UP000005640"

#filename = "database/2021-06-15/" + human_fasta
#filename = "database/napedro_3mixed_human_yeast_ecoli_20140403_iRT_reverse.fasta"
#filename = "database/2021-06-07/UP00000625_UP000002311_UP000005640.fasta"
filename = "database/2021-09-15_malaria_yeast_PXD017705/UP000002311_UP000001450.fasta"


file = open(filename, "r")


protein_list = []
sequence_list = []
for line in file: 
    if line[0] == ">":
        protein = line 
    else:
        sequence = line.rstrip()
        split_sequence = re.split(r"(?<=[KR])", sequence)
        split_sequence = list(dict.fromkeys(split_sequence))
        sequence_list += split_sequence
        protein_list += [protein for i in range(len(split_sequence))]
        

df_ = pd.DataFrame(np.array([protein_list, sequence_list]).T, columns = ["protein", "sequence"])
df = pd.DataFrame(np.array([protein_list, sequence_list]).T, columns = ["protein", "sequence"])


# If True replace I and L with common symbol

Amino acis I (Iso-leucine) and L (leucine) have the same weight and therefore the same peak in a mass-spec. We want to make them interchangable to see if it affects the final database.

In [4]:

mofidy_IL = False #Set the variable
if mofidy_IL == True:
    modify_IL_function = lambda x: x.replace("I", "L")
    df["sequence"] = df.sequence.map(modify_IL_function)
    print("All I and L are now L")
else:
    print("No I/L modification")

No I/L modification


In [5]:
def decoy_map(protein):
    if protein.split("_")[0] == ">reverse":
        return True
    else:
        return False
    
    
df["decoy"] = df.protein.map(decoy_map)

df = df[df.decoy == False]


In [6]:
df

Unnamed: 0,protein,sequence,decoy
0,>sp|O13587|YP096_YEAST Uncharacterized protein...,MR,False
1,>sp|O13587|YP096_YEAST Uncharacterized protein...,GETGVSIK,False
2,>sp|O13587|YP096_YEAST Uncharacterized protein...,NPR,False
3,>sp|O13587|YP096_YEAST Uncharacterized protein...,PSR,False
4,>sp|O13587|YP096_YEAST Uncharacterized protein...,PFSCFWR,False
...,...,...,...
963674,>tr|Q9U0J0|Q9U0J0_PLAF7 Replication protein A1...,SIK,False
963675,>tr|Q9U0J0|Q9U0J0_PLAF7 Replication protein A1...,NLTSK,False
963676,>tr|Q9U0J0|Q9U0J0_PLAF7 Replication protein A1...,R,False
963677,>tr|Q9U0J0|Q9U0J0_PLAF7 Replication protein A1...,D,False


In [7]:
specie_map = lambda x: x.split("_")[1].split(" ")[0]
df["specie"] = df.protein.map(specie_map)

In [8]:
df.specie.unique()

array(['YEAST', 'PLAF7'], dtype=object)

In [9]:
n_protein = len(df.protein.unique())
n_protein_yeast  = len(df[df["specie"] == "YEAST"].protein.unique())
n_protein_plasmodium = len(df[df["specie"] == "PLAF7"].protein.unique())


In [10]:
df.specie.unique()

array(['YEAST', 'PLAF7'], dtype=object)

In [11]:
print("Unfiltered protein statistics:")
print(f"All proteins: {n_protein}")
print(f"ECOLI proteins: {n_protein_yeast}")
print(f"HUMAN proteins: {n_protein_plasmodium}")

Unfiltered protein statistics:
All proteins: 11433
ECOLI proteins: 6050
HUMAN proteins: 5383


In [12]:
df["seq_length"] = df.sequence.str.len()
df = df[df["seq_length"] > 7]
df.drop("seq_length", axis = 1, inplace = True)
df.drop_duplicates(inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


# Find protein with only one sequence

In [13]:
counted_df = df.groupby("sequence").count().sort_values(by = "protein", ascending = False)
counted_df

Unnamed: 0_level_0,protein,decoy,specie
sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LSVDYTDIMK,59,59,59
ANSQQTTTPAS,58,58,58
PPPMLTSPND,58,58,58
EVHTNQDPLDVSASK,58,58,58
FLQNSNLGGIIPTVNGK,56,56,56
...,...,...,...
IENDYFSANVNENIIK,1,1,1
IENDWYCNSCLNS,1,1,1
IENDSVIDEEK,1,1,1
IENDQSFHNILMK,1,1,1


In [15]:
single_protein_seq = counted_df[counted_df.protein == 1]

# investigate if a single protein sequence has proteins containing single sequences.

In [16]:
df_single_seq = df[df.sequence.isin(single_protein_seq.index)]

In [17]:
df_single_seq.groupby("sequence").count().max()

protein    1
decoy      1
specie     1
dtype: int64

In [18]:
# seems fine, keep these proteins 
len(df_single_seq.protein.unique())

11192

In [19]:
len(df.protein.unique())

11423

In [20]:
keep_list = df_single_seq.protein.unique()

In [21]:
keep_list

array(['>sp|O13587|YP096_YEAST Uncharacterized protein YPR096C OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) OX=559292 GN=YPR096C PE=1 SV=1\n',
       '>sp|P00044|CYC1_YEAST Cytochrome c isoform 1 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) OX=559292 GN=CYC1 PE=1 SV=2\n',
       '>sp|P00498|HIS1_YEAST ATP phosphoribosyltransferase OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) OX=559292 GN=HIS1 PE=1 SV=1\n',
       ...,
       '>tr|Q8ILG7|Q8ILG7_PLAF7 Mitochondrial ribosomal protein L15, putative OS=Plasmodium falciparum (isolate 3D7) OX=36329 GN=PF3D7_1429700 PE=3 SV=1\n',
       '>tr|Q8IM82|Q8IM82_PLAF7 Rifin OS=Plasmodium falciparum (isolate 3D7) OX=36329 GN=PF3D7_1400600 PE=4 SV=1\n',
       '>tr|Q9U0J0|Q9U0J0_PLAF7 Replication protein A1, large subunit OS=Plasmodium falciparum (isolate 3D7) OX=36329 GN=PF3D7_0409600 PE=3 SV=1\n'],
      dtype=object)

# Read in both fasta files and compare results.

This is the function to check if the protein is in "keep_list", if it is in keep it from the original.

In [22]:

os.chdir("/home/ptruong/git/dia_sum")
#filename = "database/2021-06-07/UP00000625_UP000002311_UP000005640.fasta"
filename = "database/2021-09-15_malaria_yeast_PXD017705/UP000002311_UP000001450.fasta"

#filename = "database/napedro_3mixed_human_yeast_ecoli_20140403_iRT_reverse.fasta"
file = open(filename, "r")

file_w = open("database/2021-09-15_malaria_yeast_PXD017705/UP000002311_UP000001450_no_shared.fasta", "w")

for line in file:
    if line[0] == ">":
        protein = line 
        file_w.write(protein)
    else:
        if protein in keep_list:
            sequence = line
            file_w.write(sequence)
        

# Protein database analysis

Here we analyse the amount of proteins in each library

In [25]:

def read_in_fasta(filename):
    file = open(filename, "r")

    protein_list = []
    sequence_list = []
    for line in file: 
        if line[0] == ">":
            protein = line 
        else:
            sequence = line.rstrip()
            #sequence_list += split_sequence

            #protein_list += [protein for i in range(len(sequence))]
            sequence_list.append(sequence)
            protein_list.append(protein)

    df = pd.DataFrame(np.array([protein_list, sequence_list]).T, columns = ["protein", "sequence"])
    df["decoy"] = df.protein.map(decoy_map)
    df["specie"] = df.protein.map(specie_map)
    return df


In [31]:
filename = "database/2021-09-15_malaria_yeast_PXD017705/UP000002311_UP000001450.fasta"
filename_no_shared = "database/2021-09-15_malaria_yeast_PXD017705/UP000002311_UP000001450_no_shared.fasta"
filename_no_shared_IL_mod = "database/2021-09-15_malaria_yeast_PXD017705/UP000002311_UP000001450_no_shared_IL_mod.fasta"

In [32]:
db = read_in_fasta(filename)
db_no_shared = read_in_fasta(filename_no_shared)
db_no_shared_IL_mod = read_in_fasta(filename_no_shared_IL_mod)

In [33]:
db

Unnamed: 0,protein,sequence,decoy,specie
0,>sp|O13587|YP096_YEAST Uncharacterized protein...,MRGETGVSIKNPRPSRPFSCFWRKGDVENIRKSDIGNEKKIDAKFN...,False,YEAST
1,>sp|O13587|YP096_YEAST Uncharacterized protein...,KAGLLYKELFFRSCFSYTTCSLDFQGKRHQVERKAVDIVL,False,YEAST
2,>sp|P00044|CYC1_YEAST Cytochrome c isoform 1 O...,MTEFKAGSAKKGATLFKTRCLQCHTVEKGGPHKVGPNLHGIFGRHS...,False,YEAST
3,>sp|P00044|CYC1_YEAST Cytochrome c isoform 1 O...,KNVLWDENNMSEYLTNPKKYIPGTKMAFGGLKKEKDRNDLITYLKKACE,False,YEAST
4,>sp|P00498|HIS1_YEAST ATP phosphoribosyltransf...,MDLVNHLTDRLLFAIPKKGRLYSKSVSILNGADITFHRSQRLDIAL...,False,YEAST
...,...,...,...,...
123172,>tr|Q9U0J0|Q9U0J0_PLAF7 Replication protein A1...,KIEINPDNIEKAYILKNWWIHNKKMICNSINLSSNYINIETQKTIQ...,False,PLAF7
123173,>tr|Q9U0J0|Q9U0J0_PLAF7 Replication protein A1...,SGKGIMFTTYGFIDQIYNNMPVYSACPDCNKKMISNSVDDNEYEPS...,False,PLAF7
123174,>tr|Q9U0J0|Q9U0J0_PLAF7 Replication protein A1...,KNNIPVYNYYINLKITDSTDSIRASAFANCAKIIMNGLSAEEYMKL...,False,PLAF7
123175,>tr|Q9U0J0|Q9U0J0_PLAF7 Replication protein A1...,IIEKIKLNEFFFRIKVYMTSHMDELKKNYTIIDIAPVGKLLIDNCR...,False,PLAF7


In [34]:
db_no_shared

Unnamed: 0,protein,sequence,decoy,specie
0,>sp|O13587|YP096_YEAST Uncharacterized protein...,MRGETGVSIKNPRPSRPFSCFWRKGDVENIRKSDIGNEKKIDAKFN...,False,YEAST
1,>sp|O13587|YP096_YEAST Uncharacterized protein...,KAGLLYKELFFRSCFSYTTCSLDFQGKRHQVERKAVDIVL,False,YEAST
2,>sp|P00044|CYC1_YEAST Cytochrome c isoform 1 O...,MTEFKAGSAKKGATLFKTRCLQCHTVEKGGPHKVGPNLHGIFGRHS...,False,YEAST
3,>sp|P00044|CYC1_YEAST Cytochrome c isoform 1 O...,KNVLWDENNMSEYLTNPKKYIPGTKMAFGGLKKEKDRNDLITYLKKACE,False,YEAST
4,>sp|P00498|HIS1_YEAST ATP phosphoribosyltransf...,MDLVNHLTDRLLFAIPKKGRLYSKSVSILNGADITFHRSQRLDIAL...,False,YEAST
...,...,...,...,...
121221,>tr|Q8IM82|Q8IM82_PLAF7 Rifin OS=Plasmodium fa...,IPTCVCEKSLADKMEKVCLKCAQNLGGIVAPSTGVLGEIAALAVNA...,False,PLAF7
121222,>tr|Q8IM82|Q8IM82_PLAF7 Rifin OS=Plasmodium fa...,KAGDAAGKIAGESKGVETIIGILEQYYSIYELKGTPLKSFFATTHY...,False,PLAF7
121223,>tr|Q8IM82|Q8IM82_PLAF7 Rifin OS=Plasmodium fa...,NTSCGLNSLANQAICGLRTKLGLVAKPGQVMVTQKEAITKMITNVV...,False,PLAF7
121224,>tr|Q8IM82|Q8IM82_PLAF7 Rifin OS=Plasmodium fa...,AATKTAAAIKMNTEAIEAATTPYYTPIIASIVAIVVIVLIMVIIYL...,False,PLAF7


In [43]:
db_no_shared_IL_mod

Unnamed: 0,protein,sequence,decoy,specie
0,>sp|O13587|YP096_YEAST Uncharacterized protein...,MRGETGVSIKNPRPSRPFSCFWRKGDVENIRKSDIGNEKKIDAKFN...,False,YEAST
1,>sp|O13587|YP096_YEAST Uncharacterized protein...,KAGLLYKELFFRSCFSYTTCSLDFQGKRHQVERKAVDIVL,False,YEAST
2,>sp|P00044|CYC1_YEAST Cytochrome c isoform 1 O...,MTEFKAGSAKKGATLFKTRCLQCHTVEKGGPHKVGPNLHGIFGRHS...,False,YEAST
3,>sp|P00044|CYC1_YEAST Cytochrome c isoform 1 O...,KNVLWDENNMSEYLTNPKKYIPGTKMAFGGLKKEKDRNDLITYLKKACE,False,YEAST
4,>sp|P00498|HIS1_YEAST ATP phosphoribosyltransf...,MDLVNHLTDRLLFAIPKKGRLYSKSVSILNGADITFHRSQRLDIAL...,False,YEAST
...,...,...,...,...
121219,>tr|Q8IM82|Q8IM82_PLAF7 Rifin OS=Plasmodium fa...,IPTCVCEKSLADKMEKVCLKCAQNLGGIVAPSTGVLGEIAALAVNA...,False,PLAF7
121220,>tr|Q8IM82|Q8IM82_PLAF7 Rifin OS=Plasmodium fa...,KAGDAAGKIAGESKGVETIIGILEQYYSIYELKGTPLKSFFATTHY...,False,PLAF7
121221,>tr|Q8IM82|Q8IM82_PLAF7 Rifin OS=Plasmodium fa...,NTSCGLNSLANQAICGLRTKLGLVAKPGQVMVTQKEAITKMITNVV...,False,PLAF7
121222,>tr|Q8IM82|Q8IM82_PLAF7 Rifin OS=Plasmodium fa...,AATKTAAAIKMNTEAIEAATTPYYTPIIASIVAIVVIVLIMVIIYL...,False,PLAF7


In [38]:

def print_db_proteins(db):
    n_protein = len(db.protein.unique())
    n_protein_plasmodium = len(df[df["specie"] == "PLAF7"].protein.unique())
    n_protein_yeast = len(db[db["specie"] == "YEAST"].protein.unique())

    print(f"All proteins: {n_protein}")
    print(f"HUMAN proteins: {n_protein_plasmodium}")
    print(f"YEAST protein: {n_protein_yeast}")



In [39]:
print("Unfiltered db:")
print_db_proteins(db)

Unfiltered db:
All proteins: 11433
HUMAN proteins: 5377
YEAST protein: 6050


In [40]:
print("No shared:")
print_db_proteins(db_no_shared)

No shared:
All proteins: 11191
HUMAN proteins: 5377
YEAST protein: 5848


In [44]:
print("No shared IL modified:")
print_db_proteins(db_no_shared_IL_mod)

No shared IL modified:
All proteins: 11190
HUMAN proteins: 5377
YEAST protein: 5847


In [48]:
def get_db_proteins(db):
    n_protein = len(db.protein.unique())
    n_protein_plasmodium = len(df[df["specie"] == "PLAF7"].protein.unique())
    n_protein_yeast = len(db[db["specie"] == "YEAST"].protein.unique())
    return n_protein, n_protein_plasmodium, n_protein_yeast


In [49]:
db_stats = pd.DataFrame([list(get_db_proteins(db)), list(get_db_proteins(db_no_shared)), list(get_db_proteins(db_no_shared_IL_mod))],
            index = ["Unfiltered", "No_shared", "No_shared_IL_mod"],
            columns = ["all", "plasmodium", "yeast"]).T

In [50]:
db_stats

Unnamed: 0,Unfiltered,No_shared,No_shared_IL_mod
all,11433,11191,11190
plasmodium,5377,5377,5377
yeast,6050,5848,5847
