In [3]:
import pandas as pd

In [4]:
idir = '/Users/bj8th/Documents/Lab-for-Proteoform-Systems-Biology/data/peptides/AllPeptides_PacBio.psmtsv'

In [22]:
peptides = pd.read_csv(idir, sep = '\t')
below_fdr = peptides['QValue'] <= 0.01
target = peptides['Decoy/Contaminant/Target'] == 'T'
peptides = peptides[below_fdr & target]

In [23]:
peptides['Protein Accession'] = peptides['Protein Accession'].str.split('|')

In [24]:
peptides = peptides.explode('Protein Accession')

In [27]:
peptides = peptides[['Protein Accession', 'Base Sequence']]

In [45]:
peptide_dict = peptides.groupby('Base Sequence')['Protein Accession'].apply(list).to_dict()

In [46]:
protein_dict = peptides.groupby('Protein Accession')['Base Sequence'].apply(list).to_dict()

In [42]:
peptide_sizes = peptides.groupby('Base Sequence').size().reset_index().rename(columns = {0:'size'})
single = peptide_sizes[peptide_sizes['size'] == 1]

In [76]:
inferred = peptides[peptides['Base Sequence'].isin(single['Base Sequence'])]
inferred = peptides[peptides['Protein Accession'].isin(inferred['Protein Accession'])]
remaining = peptides[~peptides['Protein Accession'].isin(inferred['Protein Accession'])]

In [147]:
def find_best_protein(remaining, original, protein_column):
    """
    Finds best protein in remaining graph based on number
    of peptides a protein matches to (# of edges a protein has). Choose max
    If there are proteins with an equal number of maximum matched peptides
    then those proteins compared to the original graph for number of edges 
    a protein has
    """
    sizes = remaining.groupby(protein_column).size().reset_index().rename(columns = {0:'size'})
    max_size = sizes['size'].max()
    best_proteins = list(sizes[sizes['size'] == max_size ][protein_column])
    if len(best_proteins) > 1:
        subset = original[original[protein_column].isin(best_proteins)]
        subsizes = subset.groupby(protein_column).size().reset_index().rename(columns = {0:'size'})
        max_subsize = subsizes['size'].max()
        best_proteins = list(subsizes[subsizes['size'] == max_subsize])
    return best_proteins[0]



def greedy_inference(original, protein_column = 'Protein Accession', peptide_column = 'Base Sequence'):
    """
    Greedy protein inference algorithm for matching peptids to corresponding proteins

    Notaion:
    G : original graph
    Gi : inferred graph
    Gr : remaining graph
    Gd: dropped graph
    p : greedily selcted protein
    s : peptides connected to p


    Select peptides in G that only match to single protein
    Add proteins corresponding to peptides and all attached peptides to Gi
    Remove said proteins from  Gr
    While Gr has edges connected proteins and peptides
        Greedily select best protein p
        Add p and connected peptides Gi
        Add peptide-protein edges where protein is not p and peptide is in s in Gd
        Remove edgees where peptides is in s from Gr
    
    Remake Gi and make Gd
        Gi remade to contain all protein-peptide edges that connect to an inferred protein
        Gd made to contain all protein-peptide edges that do not connect to an inferred protein

    Parameters
    ---------
    original : pandas DataFrame
        original peptide-protien graph
    protein_column : str
        column associated with protein accession
    peptide_column : str
        column associated with peptide

    Returns
    --------
    inferred: pandas DataFrame
        Gi, subgraph of G of proteins and their associated peptides
    dropped: pandas DataFrame
        Gd, subgraph of G of proteins and their associated peptides

    """

    dropped = pd.DataFrame(columns = original.columns)
    peptide_sizes = original.groupby(peptide_column).size().reset_index().rename(columns = {0:'size'})
    single = peptide_sizes[peptide_sizes['size'] == 1]
    inferred = original[original[peptide_column].isin(single[peptide_column])]
    inferred = original[original[protein_column].isin(inferred[protein_column])]
    remaining = original[~original[protein_column].isin(inferred[protein_column])]
    inferred = [ inferred ]
    # dropped = []

    while len(remaining) > 0:
        best_protein = find_best_protein(remaining, original, protein_column)
        matches = remaining[remaining[protein_column] == best_protein]
        tmp_peptides = list(matches[peptide_column])
        inferred.append(matches)

        is_matched_peptide = remaining[peptide_column].isin(tmp_peptides)
        is_best_protein = remaining[protein_column] == best_protein
        # dropped.append( remaining[~is_best_protein & is_matched_peptide] )
        
        remaining = remaining[~is_matched_peptide]
        
    inferred = pd.concat(inferred)
    # dropped = pd.concat(dropped)

    inferred_proteins = inferred[protein_column].unique()
    inferred = original[original[protein_column].isin(inferred_proteins)]
    dropped = original[~original[protein_column].isin(inferred_proteins)]
    
    return inferred, dropped
 

        





In [148]:
import time
start = time.time()
inferred, dropped = greedy_inference(peptides)
stop = time.time()


In [149]:
print(stop-start)

36.61134672164917


In [150]:
print(f"Original {len(peptides)}")
print(f"Inferred {len(inferred)}")
print(f"Dropped {len(dropped)}")
print(f"Dropped + Inferred {len(dropped) + len(inferred)}")


Original 71672
Inferred 44025
Dropped 27647
Dropped + Inferred 71672


In [63]:
protein_bag = set()
for pep in peptide_bag:
    protein_bag = protein_bag.union(peptide_dict[pep])


In [64]:
for protein in protein_bag:
    peptide_bag = peptide_bag.union(protein_dict[protein])

In [65]:
len(peptide_bag)

22413

In [61]:
len(single)

13810