In [1]:

import argparse
import time

import pandas as pd

def process_metamorpheus_file(filename):
    """
    Load a MetaMorpheus AllPeptides file as a peptide-protein edge table.

    Keeps only target peptides ('Decoy/Contaminant/Target' == 'T') with
    QValue <= 0.01, then splits the '|'-delimited 'Protein Accession' column
    so that every row holds exactly one accession.

    Parameters
    ----------
    filename : str
        MetaMorpheus AllPeptides file location (tab-separated)

    Returns
    --------
    original : pandas DataFrame
        filtered AllPeptides dataframe exploded on 'Protein Accession'.
        Exploded rows keep the index label of the row they came from, so
        index labels can repeat.
    """
    original = pd.read_csv(filename, sep = '\t')
    below_fdr = original['QValue'] <= 0.01
    target = original['Decoy/Contaminant/Target'] == 'T'
    # Take an explicit copy of the filtered rows so the column assignment
    # below writes to an independent frame rather than to a slice of the
    # frame that was read in (this is what triggers SettingWithCopyWarning).
    original = original[below_fdr & target].copy()
    original['Protein Accession'] = original['Protein Accession'].str.split('|')
    original = original.explode('Protein Accession')
    return original


def find_best_protein(remaining, original, protein_column):
    """
    Pick the next protein for the greedy selection.

    The candidate is the protein with the most rows (edges) in the remaining
    graph. When several proteins share that maximum, the tie is settled by
    their row counts in the original graph. If that is still a tie, the first
    protein in the sorted order produced by groupby is returned.

    Parameters
    ----------
    remaining : pandas DataFrame
        remaining graph
    original : pandas DataFrame
        original graph, consulted only to break ties
    protein_column : str
        column associated with protein accession

    Returns
    --------
    the selected value of protein_column
    """
    edge_counts = remaining.groupby(protein_column).size()
    candidates = list(edge_counts[edge_counts == edge_counts.max()].index)
    if len(candidates) <= 1:
        return candidates[0]

    # Tie in the remaining graph: rank the tied proteins by their edge
    # counts in the original graph instead.
    tied_rows = original[original[protein_column].isin(candidates)]
    original_counts = tied_rows.groupby(protein_column).size()
    winners = original_counts[original_counts == original_counts.max()].index
    return list(winners)[0]

def rescue_indistinguishable(inferred, dropped):
    """
    Placeholder for rescuing indistinguishable proteins (not implemented yet).

    Intended behavior: when a dropped protein matches exactly the same
    peptides as an inferred protein, that protein is added into the inferred
    graph. At present the arguments are ignored and None is returned.
    """
    return None

def greedy_inference(original, protein_column = 'Protein Accession', peptide_column = 'Base Sequence'):
    """
    Greedy protein inference algorithm for matching peptides to corresponding proteins

    Notation:
    G : original graph
    Gi : inferred graph
    Gr : remaining graph
    Gd : dropped graph
    p : greedily selected protein
    s : peptides connected to p


    Select peptides in G that only match to a single protein
    Add proteins corresponding to those peptides and all attached peptides to Gi
    Remove said proteins from Gr
    While Gr has edges connecting proteins and peptides
        Greedily select best protein p
        Add p and connected peptides to Gi
        Remove edges where peptide is in s from Gr

    Remake Gi and make Gd
        Gi remade to contain all protein-peptide edges that connect to an inferred protein
        Gd made to contain all protein-peptide edges that do not connect to an inferred protein

    Notes
    -----
    A peptide counts as matching a single protein when it appears on exactly
    one row of `original`, so repeated rows for the same peptide prevent it
    from seeding Gi.
    Rows whose protein accession is missing are never candidates in the
    greedy loop; unless such a row was picked up by the seeding step it ends
    up in Gd.

    Parameters
    ---------
    original : pandas DataFrame
        original peptide-protein graph, one row per peptide-protein edge
    protein_column : str
        column associated with protein accession
    peptide_column : str
        column associated with peptide

    Returns
    --------
    inferred: pandas DataFrame
        Gi, rows of G whose protein was inferred
    dropped: pandas DataFrame
        Gd, rows of G whose protein was not inferred

    """
    # Seed Gi with every protein that owns at least one single-row peptide.
    peptide_sizes = original.groupby(peptide_column).size().reset_index().rename(columns = {0:'size'})
    single = peptide_sizes[peptide_sizes['size'] == 1]
    inferred = original[original[peptide_column].isin(single[peptide_column])]
    inferred = original[original[protein_column].isin(inferred[protein_column])]
    remaining = original[~original[protein_column].isin(inferred[protein_column])]

    # groupby ignores missing keys, so rows without an accession can never be
    # selected as the best protein; if they were the only rows left the loop
    # below would fail with an IndexError inside find_best_protein.
    remaining = remaining[remaining[protein_column].notna()]

    inferred = [ inferred ]

    while len(remaining) > 0:
        best_protein = find_best_protein(remaining, original, protein_column)
        matches = remaining[remaining[protein_column] == best_protein]
        inferred.append(matches)

        # Every edge touching a peptide of the selected protein is now
        # explained, whichever protein it points to.
        is_matched_peptide = remaining[peptide_column].isin(list(matches[peptide_column]))
        remaining = remaining[~is_matched_peptide]

    inferred = pd.concat(inferred)

    # Rebuild both subgraphs from the original so that inferred proteins get
    # back all of their edges, including ones removed during the loop.
    inferred_proteins = inferred[protein_column].unique()
    is_inferred = original[protein_column].isin(inferred_proteins)
    inferred = original[is_inferred]
    dropped = original[~is_inferred]

    return inferred, dropped

In [2]:
# Load the MetaMorpheus AllPeptides export into the peptide-protein edge table.
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs where the file exists at exactly this location.
filename = '/Users/bj8th/Documents/Lab-for-Proteoform-Systems-Biology/data/peptides/AllPeptides_PacBio.psmtsv'
original = process_metamorpheus_file(filename)

In [None]:
# Run the greedy inference on the loaded table and time it. This cell defines
# `start`, `stop`, `inferred` and `dropped`, which the cells below read.
start = time.perf_counter()
inferred, dropped = greedy_inference(original)
stop = time.perf_counter()

In [15]:
# Report the difference between the `stop` and `start` timestamps
# (presumably the run time of the inference step, in seconds; confirm).
print(stop-start)

57.87800621986389


In [17]:
# Row counts of the original graph and of the two subgraphs it was split into.
print(f"Original {len(original)}")
print(f"Inferred {len(inferred)}")
print(f"Dropped {len(dropped)}")
print(f"Dropped + Inferred {len(dropped) + len(inferred)}")

# The two subgraphs are meant to partition the original graph, so their sizes
# have to add up; fail loudly instead of relying on reading the numbers.
assert len(dropped) + len(inferred) == len(original), "inferred and dropped do not partition original"


Original 71672
Inferred 44071
Dropped 27601
Dropped + Inferred 71672


In [18]:
# Preview the first rows of the inferred graph.
inferred.head()

Unnamed: 0,File Name,Scan Number,Scan Retention Time,Num Experimental Peaks,Total Ion Current,Precursor Scan Number,Precursor Charge,Precursor MZ,Precursor Mass,Score,...,Localized Scores,Improvement Possible,Cumulative Target,Cumulative Decoy,QValue,Cumulative Target Notch,Cumulative Decoy Notch,QValue Notch,PEP,PEP_QValue
0,120426_Jurkat_highLC_Frac14,19149,140.45874,186.0,2620926.0,19146,3.0,1187.22629,3558.65705,39.219,...,,,1,0,0.0,1,0,0.0,2.8e-05,1.7e-05
1,120426_Jurkat_highLC_Frac9,19892,150.09205,186.0,197229.7,19887,3.0,1221.90388,3662.68981,39.198,...,,,2,0,0.0,2,0,0.0,6e-06,3e-06
2,120426_Jurkat_highLC_Frac17,22894,167.51914,200.0,11627100.0,22893,4.0,802.65041,3206.57254,39.164,...,,,3,0,0.0,3,0,0.0,4e-06,2e-06
3,120426_Jurkat_highLC_Frac23,14654,119.26644,200.0,3137383.0,14651,4.0,771.14514,3080.55145,38.355,...,,,4,0,0.0,4,0,0.0,0.00298,0.000382
4,120426_Jurkat_highLC_Frac20,18563,142.91097,200.0,1478450.0,18561,3.0,1240.6425,3718.90568,37.266,...,,,5,0,0.0,5,0,0.0,1.3e-05,8e-06
