In [16]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re

from __future__ import division

### Import peptide table from file

In [17]:
# Load the peptide table (tab-separated, first column holds the row index).
# read_csv(index_col=0, parse_dates=True) is the documented replacement for the
# deprecated DataFrame.from_csv and reads the file the same way.
all_pept = pd.read_csv("data/all_pept.tsv",sep="\t",index_col=0,parse_dates=True)
print(all_pept.head())

# Rows with a non-zero MassShift are the modified peptides ...
modified_pepts = all_pept[all_pept.MassShift != 0]
print(modified_pepts.head())

# ... and rows with MassShift == 0 are the unmodified ones.
unmodified_pepts = all_pept[all_pept.MassShift == 0]
# Preview the unmodified subset (the original re-printed the modified one here)
print(unmodified_pepts.head())

  DataSet  Index  ObservedMW  Charge  CalculatedMW  DeltaMass  Score  \
0    100a   6036   1673.8330       3     1673.8294     0.0036     30   
1    100a   6043   1673.8312       2     1673.8294     0.0017     71   
2    100a   6058   1673.8318       2     1673.8294     0.0023     67   
3    100b   6718   1673.8312       2     1673.8294     0.0017     61   
4    100b   6732   1673.8302       3     1673.8294     0.0008     31   

   Probability              Peptide       giID        ...          \
0       0.1200  R.VQSMPEINDADKTVK.L  254160296        ...           
1       0.9972  R.VQSMPEINDADKTVK.L  254160296        ...           
2       0.9955  R.VQSMPEINDADKTVK.L  254160296        ...           
3       0.9891  R.VQSMPEINDADKTVK.L  254160296        ...           
4       0.3623  R.VQSMPEINDADKTVK.L  254160296        ...           

                                   ProteinName  Time  BiolRep  TechRep  \
0  outer membrane protein assembly factor YaeT     6        3        1   
1  o

### Group peptides by
- Timepoint (i.e. count of all peptides at each timepoint)
- Timepoint x Mass Shift (count all peptides containing individual modifications for each time point)
- Timepoint x Mass Shift x Biological Replicate

In [18]:
# Spectrum counts (non-null 'Index' entries) at three grouping levels:
#   per time point, per time point x mass shift,
#   and per mass shift x time point x biological replicate.
pept_total_by_timepoint = all_pept.groupby(['Time'])['Index'].count()
mod_spectrum_by_timepoint = all_pept.groupby(['Time','MassShift'])['Index'].count()
mod_spectrum_by_timepoint_biorep = (
    all_pept
    .groupby(['MassShift','Time','BiolRep'])['Index']
    .count()
)

In [19]:
# Per-MassShift totals summed over all time points.
# unstack() moves the innermost index level (BiolRep) into columns, so this keeps
# one column per biological replicate.
all_time_sum_biorep = mod_spectrum_by_timepoint_biorep.unstack().sum(level='MassShift')
# Same totals pooled over both Time and BiolRep (one value per MassShift).
all_time_sum_pooled = mod_spectrum_by_timepoint_biorep.fillna(0).sum(level='MassShift')
# Re-index onto the integer mass shifts -200..199; shifts with no spectra get 0.
# NOTE(review): reindex drops any MassShift label outside this integer range -
# confirm the MODa search window matches.
all_time_sum_pooled = all_time_sum_pooled.reindex(index=np.arange(-200,200),fill_value=0)
# Zero the MassShift == 0 entry (the unmodified spectra) so only real shifts remain.
all_time_sum_pooled[0] = 0

### Use MODa output "Probability" column to calculate per-peptide FDR

Treat `1 - Probability` as the probability that a peptide identification is a false positive (Type I error).

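Sort peptides by descending probability and add them to the "passed" set one at a time until the running FDR estimate (the sum of `1 - Probability` over the passed peptides, divided by the number of passed peptides) reaches the FDR cutoff.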

In [34]:
def calc_MODa_FDR(pept_df,fdr_cut,start_idx=0):
    """
    Filter pept_df (Dataframe of MODa output + protein info from parse_MODa.ipynb) to fdr_cut FDR

    Peptides are ranked by descending 'Probability' and `1 - Probability` is treated as the
    chance that a peptide is a false hit.  The running FDR after the first k peptides is
    sum(1 - Probability) / k.  The ranked table is cut at the first k where the running FDR
    reaches fdr_cut (that k-th peptide is still kept).  If the running FDR never reaches
    fdr_cut, every peptide is kept.

    Parameters
    ----------
    pept_df : DataFrame with at least the columns 'Probability', 'PeptideStart',
        'ModifiedPosition' and 'MassShift'
    fdr_cut : FDR threshold as a fraction (e.g. .01 for 1%)
    start_idx : position in the ranked table at which to start looking for the cutoff
        (default 0 = from the top).  The running FDR is computed with a cumulative sum, so
        this is no longer needed for speed; it is kept so existing calls keep working.
        The cut can never fall before start_idx.

    Returns
    -------
    (all_pept_fdr, modified_pept_fdr, unmodified_pept_fdr) : the filtered table with an added
        'ModPosn_ProtCoord' column, and its MassShift != 0 / MassShift == 0 subsets
    """
    # Sort peptides by probability
    pept_df_psort = pept_df.sort_values(by=['Probability',],ascending=False,inplace=False)
    print(pept_df_psort.head())

    # Running FDR for every prefix of the ranked table, computed in one pass.
    # Missing probabilities contribute nothing to the probability sum (i.e. count as a
    # full error), matching a NaN-skipping sum.
    n_pept = pept_df_psort.shape[0]
    probs = pept_df_psort['Probability'].fillna(0).values.astype(float)
    n_passed = np.arange(1, n_pept + 1)
    running_fdr = (n_passed - np.cumsum(probs)) / n_passed

    # First prefix length (searching from start_idx on) whose FDR reaches the cutoff
    crossed = np.flatnonzero(running_fdr[start_idx:] >= fdr_cut)
    if crossed.size:
        cut_idx = start_idx + int(crossed[0]) + 1
    else:
        # Cutoff never reached: the whole table passes (previously this returned nothing)
        cut_idx = n_pept

    # Make a truncated copy of the original sorted data
    all_pept_fdr = pept_df_psort.iloc[:cut_idx].copy()

    # Add columns for modification location relative to the start of the PROTEIN
    all_pept_fdr.loc[:,'ModPosn_ProtCoord'] = (all_pept_fdr['PeptideStart'] + all_pept_fdr['ModifiedPosition']) - 2
    print(all_pept_fdr.loc[(all_pept_fdr['PeptideStart'] == 1),:].head())
    # Peptides starting at protein position 1 get their coordinate shifted by one, and by a
    # further two when ModifiedPosition is -1
    all_pept_fdr.loc[(all_pept_fdr['PeptideStart'] == 1),'ModPosn_ProtCoord'] += 1
    all_pept_fdr.loc[((all_pept_fdr['PeptideStart'] == 1) & (all_pept_fdr['ModifiedPosition'] == -1)),'ModPosn_ProtCoord'] += 2
    print(all_pept_fdr.loc[(all_pept_fdr['PeptideStart'] == 1),:].head())

    # Select modified and unmodified sets
    modified_pept_fdr = all_pept_fdr[all_pept_fdr['MassShift'] != 0]
    unmodified_pept_fdr = all_pept_fdr[all_pept_fdr['MassShift'] == 0]
    modified_pept_fdr_errorsum = modified_pept_fdr.shape[0] - modified_pept_fdr['Probability'].sum()
    unmodified_pept_fdr_errorsum = unmodified_pept_fdr.shape[0] - unmodified_pept_fdr['Probability'].sum()

    print("Total peptides: %d" % all_pept_fdr.shape[0])
    print("Modified Peptides: %d Error_sum: %f FDR: %f" % (modified_pept_fdr.shape[0],modified_pept_fdr_errorsum,(modified_pept_fdr_errorsum / modified_pept_fdr.shape[0])))
    print("Unmodified Peptides: %d Error_sum: %f FDR: %f"  % (unmodified_pept_fdr.shape[0],unmodified_pept_fdr_errorsum,(unmodified_pept_fdr_errorsum / unmodified_pept_fdr.shape[0])))
    return (all_pept_fdr,modified_pept_fdr,unmodified_pept_fdr)

In [32]:
# Quick look at the highest-probability identifications
top_by_probability = all_pept.sort_values(by='Probability',ascending=False)
print(top_by_probability.head())

        DataSet  Index  ObservedMW  Charge  CalculatedMW  DeltaMass  Score  \
1360422     97b  10581   1795.9934       3     1795.9944    -0.0010     70   
270802     100b  54856   2108.0431       2     2108.0426     0.0005    113   
452984     105a  49307   1987.0029       2     1987.0044    -0.0015    126   
270853      17b  42313   2108.0475       2     2108.0426     0.0049    102   
653217      26a   7482   2312.9168       3     2312.9152     0.0016     85   

         Probability                   Peptide       giID        ...          \
1360422          1.0      K.HVVVDKPFTVTLSQAR.E  254161685        ...           
270802           1.0    R.VGYINDQYVLNPTQDELK.E  254163108        ...           
452984           1.0    K.KALTEANGDIELAIENMR.K  254160289        ...           
270853           1.0    R.VGYINDQYVLNPTQDELK.E  254163108        ...           
653217           1.0  K.RDDDSYDEDVEDDEGVGEVR.V  254162395        ...           

                                        ProteinNam

In [None]:
# Filter to 5% FDR.  The third argument (start_idx) makes calc_MODa_FDR begin looking for
# the cutoff at position 1970000 of the probability-sorted table instead of at the top.
# NOTE(review): 1970000 is hard-coded for this data set - presumably chosen just below
# the point where the 5% cutoff falls; re-check it if data/all_pept.tsv changes, because
# the cut can never land before start_idx.
all_pept_fdr5, modified_pept_fdr5, unmodified_pept_fdr5 = calc_MODa_FDR(all_pept,.05,1970000)

        DataSet  Index  ObservedMW  Charge  CalculatedMW  DeltaMass  Score  \
1360422     97b  10581   1795.9934       3     1795.9944    -0.0010     70   
270802     100b  54856   2108.0431       2     2108.0426     0.0005    113   
452984     105a  49307   1987.0029       2     1987.0044    -0.0015    126   
270853      17b  42313   2108.0475       2     2108.0426     0.0049    102   
653217      26a   7482   2312.9168       3     2312.9152     0.0016     85   

         Probability                   Peptide       giID        ...          \
1360422          1.0      K.HVVVDKPFTVTLSQAR.E  254161685        ...           
270802           1.0    R.VGYINDQYVLNPTQDELK.E  254163108        ...           
452984           1.0    K.KALTEANGDIELAIENMR.K  254160289        ...           
270853           1.0    R.VGYINDQYVLNPTQDELK.E  254163108        ...           
653217           1.0  K.RDDDSYDEDVEDDEGVGEVR.V  254162395        ...           

                                        ProteinNam

In [None]:
# Filter to 1% FDR.  The third argument (start_idx) makes calc_MODa_FDR begin looking for
# the cutoff at position 1470000 of the probability-sorted table instead of at the top.
# NOTE(review): 1470000 is hard-coded for this data set - presumably chosen just below
# the point where the 1% cutoff falls; re-check it if data/all_pept.tsv changes, because
# the cut can never land before start_idx.
all_pept_fdr1, modified_pept_fdr1, unmodified_pept_fdr1 = calc_MODa_FDR(all_pept,.01,1470000)

##  Get counts for unmodified positions (protein coordinates)

### Stack Peptides on full-length protein, and sum total unmodified for each position in the protein

In [None]:
def generate_unmodified_positions_table(unmod_pepts,mod_pepts):
    """
    Get cumulative counts of unmodified positions (i.e. both from unmodified peptides and non-modified positions
    of modified peptides)
    
    Build a table of counts across time points separating biological replicates

    Parameters
    ----------
    unmod_pepts : DataFrame of unmodified peptides; the columns read here (directly or via
        the helpers) are 'Length', 'LocusTag', 'Locus', 'BiolRep', 'Time', 'Index',
        'PeptideStart', 'PeptideEnd' and 'PeptideSeq'
    mod_pepts : DataFrame of modified peptides with the same columns plus
        'ModPosn_ProtCoord' (read by mod_pept_range_sum)

    Returns
    -------
    DataFrame with rows indexed by (LocusTag, Locus, ModPosn_ProtCoord, ModifiedResidue) and
    'unmodCount' columns split by BiolRep and Time; missing combinations are 0 and the
    row index is sorted.

    Relies on the module-level helpers unmod_pept_range_sum, mod_pept_range_sum and pept_AAs.
    """
    
    # get maximum protein length so we can format the table
    # (+1 because it is used as the exclusive end of np.arange below)
    max_prot_len = max([unmod_pepts['Length'].max(),mod_pepts['Length'].max()]) + 1
    
    # group peptides by parent protein
    # (one group per protein x biological replicate x time point)
    unmod_pepts_grp = unmod_pepts.groupby(['LocusTag','Locus','BiolRep','Time'])
    mod_pepts_grp = mod_pepts.groupby(['LocusTag','Locus','BiolRep','Time'])
    
    # build a combined index from the modified and unmodified sets
    # (concat along axis=1 aligns on the union of both sets of group keys)
    all_grp_idx = pd.concat([unmod_pepts_grp.count()['Index'],mod_pepts_grp.count()['Index']],axis=1).index
    
    # New data frames for aggregate modifications; each row represents a protein, number of
    # columns equal to length of the longest protein in the database
    # 
    # After these are filled with totals for each position, we'll stack on sequence index and drop 
    # empty rows, so we end up with one row per position for each protein
    # Columns are labelled with 1-based protein positions; counts start at 0.
    unmod_pept_posn_df = pd.DataFrame(index=all_grp_idx, columns=np.arange(1,max_prot_len)).fillna(0)
    
    # Separate table for the AA at each position; will merge to the position count table when we're done
    # (left unfilled, so positions never covered by a peptide stay NaN)
    unmod_pept_posn_AAs_df = pd.DataFrame(index=all_grp_idx, columns=np.arange(1,max_prot_len))
    
    # Group (i.e. protein) counts, for bookkeeping
    ngroups = len(unmod_pepts_grp.groups)
    mod_ngroups = len(mod_pepts_grp.groups)
    
    for (n,(ix,grp)) in enumerate(unmod_pepts_grp):
        # Sum across all protein positions for each group
        # NOTE(review): the helpers return nothing and modify the row they are handed, so the
        # totals only reach the table if .loc[ix, :] yields a view of the row rather than a
        # copy - confirm this holds for the pandas version in use.
        unmod_pept_range_sum(unmod_pept_posn_df.loc[ix,slice(None)],grp)# = grp.apply(pept_range_df,axis=1,args=(unmod_pept_posn_df.columns,)).sum()
        
        # Get AA for each position
        pept_AAs(unmod_pept_posn_AAs_df.loc[ix,slice(None)],grp)
        # Progress report every 100 groups
        if n % 100 == 0:
            print (ix,unmod_pept_posn_df.loc[ix,slice(None)].sum())
            print (ix,unmod_pept_posn_AAs_df.loc[ix,slice(None)].dropna())
            print "(%d of %d, %.2f %% of unmod)" % (n,ngroups,(n/ngroups) * 100)
    
    for (n,(ix,grp)) in enumerate(mod_pepts_grp):
        # Same as above for the modified set, but skipping any modified residues in each peptide
        mod_pept_range_sum(unmod_pept_posn_df.loc[ix,slice(None)],grp)# = grp.apply(pept_range_df,axis=1,args=(unmod_pept_posn_df.columns,)).sum()
        pept_AAs(unmod_pept_posn_AAs_df.loc[ix,slice(None)],grp)
        # Progress report every 100 groups
        if n % 100 == 0:
            print (ix,unmod_pept_posn_df.loc[ix,slice(None)].sum())
            print (ix,unmod_pept_posn_AAs_df.loc[ix,slice(None)].dropna())
            print "(%d of %d, %.2f %% of mod)" % (n,mod_ngroups,(n/mod_ngroups) * 100)

    # Stack rows and drop empty positions        
    # (naming the column axis makes the position a named index level after stack())
    unmod_pept_posn_df.columns = unmod_pept_posn_df.columns.set_names(['ModPosn_ProtCoord'])
    unmod_pept_posn_AAs_df.columns = unmod_pept_posn_AAs_df.columns.set_names(['ModPosn_ProtCoord'])
    unmod_pept_posn_AAs_df_stck = unmod_pept_posn_AAs_df.stack()
    # Zero counts are masked to NaN first so that stack() discards them
    unmod_pept_posn_df_stck = unmod_pept_posn_df[unmod_pept_posn_df > 0].stack()
    
    # Move around indexes to match other tables
    # Spread BiolRep into columns and forward/back-fill across replicates, so a residue seen
    # in any replicate is recorded for every replicate, then fold BiolRep back into the index
    unmod_pept_posn_AAs_df_stck = unmod_pept_posn_AAs_df_stck.unstack('BiolRep').fillna(method='ffill',axis=1).fillna(method='bfill',axis=1).stack('BiolRep')
    unmod_pept_posn_AAs_df_nix = unmod_pept_posn_AAs_df_stck.reset_index().set_index(['LocusTag','Locus','ModPosn_ProtCoord','BiolRep','Time'])
    unmod_pept_posncnt_df_nix = unmod_pept_posn_df_stck.reset_index().set_index(['LocusTag','Locus','ModPosn_ProtCoord','BiolRep','Time'])
    
    # Join AA table and count table
    # (first column is the residue letter, second the unmodified count)
    unmod_pept_posn_df = pd.concat([unmod_pept_posn_AAs_df_nix,unmod_pept_posncnt_df_nix],axis=1)
    unmod_pept_posn_df.columns = pd.Index(['ModifiedResidue','unmodCount'])
    
    # Fix indexes
    # Residue letter joins the row index; BiolRep and Time become column levels
    unmod_pept_posn_df = unmod_pept_posn_df.reset_index().set_index(['LocusTag','Locus','ModPosn_ProtCoord','ModifiedResidue','BiolRep','Time'])
    unmod_pept_posn_df = unmod_pept_posn_df.unstack('BiolRep').unstack('Time').fillna(0)
    
    return unmod_pept_posn_df.sort_index()

### Miscellaneous utility functions

# NOTE(review): the triple-quoted string below is a bare expression used to comment out two
# old helpers (fixed_len_arange, pept_range_df); nothing in it is executed.  The only visible
# references to pept_range_df are in trailing comments inside
# generate_unmodified_positions_table - consider deleting this block.
"""
Think these are obsolete...

def fixed_len_arange(st,end,alen):
    r = np.arange(st,end)
    a = np.zeros(alen)
    a[:r.shape[0]] = r
    return a

def pept_range_df(x,cols):
    ser = pd.Series.from_array([1,] * ((x['PeptideEnd']+1) - x['PeptideStart']),index=range(x['PeptideStart'],x['PeptideEnd']+1))
    ser = ser.reindex(index=cols)
    return ser
"""
def unmod_pept_range_sum(pept_line,group):
    """
    Add one to pept_line for every residue covered by each peptide in group.

    pept_line is modified in place; nothing is returned.  The slice is positional, so
    entry 0 of pept_line is taken to be protein position 1.

    group needs the columns 'PeptideStart' and 'PeptideEnd' (1-based, inclusive).
    """
    for (_, pept) in group.iterrows():
        first_offset = pept['PeptideStart'] - 1
        stop_offset = pept['PeptideEnd']
        pept_line[first_offset:stop_offset] += 1
    
def mod_pept_range_sum(pept_line,group):
    for (ix,row) in group.iterrows():
        pept_line.loc[row['PeptideStart']:row['ModPosn_ProtCoord'] - 1] += 1
        pept_line.loc[row['ModPosn_ProtCoord'] + 1:row['PeptideEnd']] += 1
        
def pept_AAs(pept_line,group):
    """
    Write the residue letters of every peptide in group into pept_line.

    pept_line is modified in place; nothing is returned.  The slice is positional, so
    entry 0 of pept_line is taken to be protein position 1.

    group needs the columns 'PeptideStart', 'PeptideEnd' and 'PeptideSeq'.  The residues
    are the text between the first and last '.' of 'PeptideSeq' (e.g. 'K.ABC.D' -> 'ABC');
    that text must have exactly PeptideEnd - PeptideStart + 1 characters.
    """
    # Repeated observations of the same peptide would write identical letters, so each
    # distinct (start, end, sequence) is handled once
    grp_unique_pepts = group[['PeptideStart','PeptideEnd','PeptideSeq']].drop_duplicates()
    for (ix,row) in grp_unique_pepts.iterrows():
        # raw string: '\.' and '\S' are regex escapes, not string escapes
        residues = re.search(r'\.(\S+)\.',row['PeptideSeq']).group(1)
        pept_line[row['PeptideStart'] - 1:row['PeptideEnd']] = np.array(list(residues))


In [None]:
# Per-position counts of unmodified residues for the 1% FDR peptide sets.
# Slow: every peptide group is walked row by row.
unmodified_posn_fdr1 = generate_unmodified_positions_table(
    unmodified_pept_fdr1,
    modified_pept_fdr1)
print(unmodified_posn_fdr1)

### Distribution of modifications over proteins (count & distribution)

In [None]:
def get_protein_pept_counts(pept_subset_df,pept_all_df):
    """
    Count peptides per protein for a subset of peptides and for the full table.

    Proteins are keyed by ('LocusTag','Locus'); a count is the number of non-null 'Index'
    entries.  A protein belongs to the subset when its LocusTag occurs in pept_subset_df.

    Returns (subset_protein_cnts, nonsubset_proteins, total_proteins):
        subset_protein_cnts - DataFrame, one row per subset protein, with columns
            'ModifiedPeptides' (rows in pept_subset_df), 'TotalPeptides' (rows in
            pept_all_df) and 'UnmodifiedPeptides' (their difference)
        nonsubset_proteins  - Series of pept_all_df counts for proteins NOT in the subset
        total_proteins      - Series of pept_all_df counts for every protein
    """
    protein_keys = ['LocusTag','Locus']

    def count_per_protein(frame):
        return frame.groupby(protein_keys)['Index'].count()

    subset_proteins = count_per_protein(pept_subset_df)
    subset_tags = subset_proteins.reset_index()['LocusTag']
    in_subset = pept_all_df['LocusTag'].isin(subset_tags)

    subset_protein_total_cnt = count_per_protein(pept_all_df[in_subset])
    subset_protein_cnts = pd.DataFrame(data={
        'ModifiedPeptides':subset_proteins,
        'UnmodifiedPeptides':subset_protein_total_cnt - subset_proteins,
        'TotalPeptides':subset_protein_total_cnt})

    nonsubset_proteins = count_per_protein(pept_all_df[~in_subset])
    total_proteins = count_per_protein(pept_all_df)
    return (subset_protein_cnts,nonsubset_proteins,total_proteins)

def get_protein_x_time_pept_counts(pept_subset_df,pept_all_df):
    """
    Per-protein peptide counts split by time point and biological replicate.

    Counts are non-null 'Index' entries grouped by ('LocusTag','Locus','Time','BiolRep').
    A protein belongs to the subset when its LocusTag occurs in pept_subset_df.  Every
    returned table has one row per protein, with BiolRep and then Time moved into the
    columns and missing combinations filled with 0.

    Returns (subset_proteins, nonsubset_proteins, total_proteins, subset_protein_cnts):
        subset_proteins     - counts of pept_subset_df rows
        nonsubset_proteins  - counts of pept_all_df rows for proteins NOT in the subset
        total_proteins      - counts of all pept_all_df rows
        subset_protein_cnts - 'ModifiedPeptides' / 'UnmodifiedPeptides' / 'TotalPeptides'
                              for the subset proteins
    """
    group_keys = ['LocusTag','Locus','Time','BiolRep']

    def count_per_group(frame):
        return frame.groupby(group_keys)['Index'].count()

    def spread(counts):
        # BiolRep, then Time, become column levels; gaps become 0
        return counts.unstack().unstack().fillna(0)

    subset_proteins = count_per_group(pept_subset_df)
    subset_tags = subset_proteins.reset_index()['LocusTag']
    in_subset = pept_all_df['LocusTag'].isin(subset_tags)

    subset_protein_total_cnt = count_per_group(pept_all_df[in_subset])
    subset_protein_cnts = pd.DataFrame(data={
        'ModifiedPeptides':subset_proteins,
        'UnmodifiedPeptides':subset_protein_total_cnt - subset_proteins,
        'TotalPeptides':subset_protein_total_cnt})

    total_proteins = count_per_group(pept_all_df)
    nonsubset_proteins = count_per_group(pept_all_df[~in_subset])

    return (spread(subset_proteins),
            spread(nonsubset_proteins),
            spread(total_proteins),
            spread(subset_protein_cnts))

def get_uniqpept_x_time_pept_counts(pept_subset,pept_all_df):
    """
    Per-peptide-sequence counts split by time point and biological replicate.

    Counts are non-null 'Index' entries grouped by ('PeptideSeq','Time','BiolRep').  A
    sequence belongs to the subset when it occurs in pept_subset.  Every returned table has
    one row per peptide sequence, with BiolRep and then Time moved into the columns and
    missing combinations filled with 0.

    Returns (subset_pept_cnts, nonsubset_cnts, total_cnts, subset_cnts):
        subset_pept_cnts - counts of pept_subset rows
        nonsubset_cnts   - counts of pept_all_df rows for sequences NOT in the subset
        total_cnts       - counts of all pept_all_df rows
        subset_cnts      - 'ModifiedPeptides' / 'UnmodifiedPeptides' / 'TotalPeptides'
                           for the subset sequences
    """
    group_keys = ['PeptideSeq','Time','BiolRep']

    def count_per_group(frame):
        return frame.groupby(group_keys)['Index'].count()

    def spread(counts):
        # BiolRep, then Time, become column levels; gaps become 0
        return counts.unstack().unstack().fillna(0)

    subset_pept_cnts = count_per_group(pept_subset)
    subset_seqs = subset_pept_cnts.reset_index()['PeptideSeq']
    in_subset = pept_all_df['PeptideSeq'].isin(subset_seqs)

    subset_total_cnts = count_per_group(pept_all_df[in_subset])
    subset_cnts = pd.DataFrame(data={
        'ModifiedPeptides':subset_pept_cnts,
        'UnmodifiedPeptides':subset_total_cnts - subset_pept_cnts,
        'TotalPeptides':subset_total_cnts})

    nonsubset_cnts = count_per_group(pept_all_df[~in_subset])
    total_cnts = count_per_group(pept_all_df)

    return (spread(subset_pept_cnts),
            spread(nonsubset_cnts),
            spread(total_cnts),
            spread(subset_cnts))

def get_count_info(pept_df,uniqpept_df,prot_df):
    """
    Build one row of the FDR summary table.

    Returns a list:
        [number of rows in pept_df,
         error sum (row count minus the sum of 'Probability'),
         error sum divided by the row count,
         number of rows in uniqpept_df,
         number of rows in prot_df]
    """
    n_spectra = pept_df.shape[0]
    error_total = n_spectra - pept_df['Probability'].sum()
    error_rate = error_total / n_spectra
    return [n_spectra, error_total, error_rate, uniqpept_df.shape[0], prot_df.shape[0]]


In [None]:
# FDR Calc summary table: one row per (filter level, peptide class)
fdr_summ_midx = pd.MultiIndex.from_product([('Unfiltered',"1% FDR","5% FDR"),(" ","Modified","Unmodified")])
fdr_summ_df = pd.DataFrame(
    index=fdr_summ_midx,
    columns=['Total PSSMs','Error Sum','False Discovery Rate','Unique Peptides','Proteins'])

# Unique-peptide counts (x time x replicate) for the unfiltered, 1% FDR and 5% FDR sets
(all_pept_modified_uniqpept,
 all_pept_unmodified_uniqpept,
 all_pept_total_uniqpept,
 all_pept_mod_unmod_uniqpept_cnts) = get_uniqpept_x_time_pept_counts(modified_pepts,all_pept)
(all_pept_fdr1_modified_uniqpept,
 all_pept_fdr1_unmodified_uniqpept,
 all_pept_fdr1_total_uniqpept,
 all_pept_fdr1_mod_unmod_uniqpept_cnts) = get_uniqpept_x_time_pept_counts(modified_pept_fdr1,all_pept_fdr1)
(all_pept_fdr5_modified_uniqpept,
 all_pept_fdr5_unmodified_uniqpept,
 all_pept_fdr5_total_uniqpept,
 all_pept_fdr5_mod_unmod_uniqpept_cnts) = get_uniqpept_x_time_pept_counts(modified_pept_fdr5,all_pept_fdr5)

# Protein counts (x time x replicate) for the same three sets
(all_pept_modified_prot,
 all_pept_unmodified_prot,
 all_pept_total_prot,
 all_pept_mod_unmod_protein_cnts) = get_protein_x_time_pept_counts(modified_pepts,all_pept)
(all_pept_fdr1_modified_prot,
 all_pept_fdr1_unmodified_prot,
 all_pept_fdr1_total_prot,
 all_pept_fdr1_mod_unmod_protein_cnts) = get_protein_x_time_pept_counts(modified_pept_fdr1,all_pept_fdr1)
(all_pept_fdr5_modified_prot,
 all_pept_fdr5_unmodified_prot,
 all_pept_fdr5_total_prot,
 all_pept_fdr5_mod_unmod_protein_cnts) = get_protein_x_time_pept_counts(modified_pept_fdr5,all_pept_fdr5)

In [None]:
# Format the table into something readable:
# each entry is (summary row label, (peptide table, unique-peptide table, protein table))
_fdr_summ_rows = [
    (('Unfiltered',' '), (all_pept,all_pept_total_uniqpept,all_pept_total_prot)),
    (('Unfiltered','Modified'), (modified_pepts,all_pept_modified_uniqpept,all_pept_modified_prot)),
    (('Unfiltered','Unmodified'), (unmodified_pepts,all_pept_unmodified_uniqpept,all_pept_unmodified_prot)),

    (('1% FDR',' '), (all_pept_fdr1,all_pept_fdr1_total_uniqpept,all_pept_fdr1_total_prot)),
    (('1% FDR','Modified'), (modified_pept_fdr1,all_pept_fdr1_modified_uniqpept,all_pept_fdr1_modified_prot)),
    (('1% FDR','Unmodified'), (unmodified_pept_fdr1,all_pept_fdr1_unmodified_uniqpept,all_pept_fdr1_unmodified_prot)),

    (('5% FDR',' '), (all_pept_fdr5,all_pept_fdr5_total_uniqpept,all_pept_fdr5_total_prot)),
    (('5% FDR','Modified'), (modified_pept_fdr5,all_pept_fdr5_modified_uniqpept,all_pept_fdr5_modified_prot)),
    (('5% FDR','Unmodified'), (unmodified_pept_fdr5,all_pept_fdr5_unmodified_uniqpept,all_pept_fdr5_unmodified_prot)),
]
for (_row_label, _count_tables) in _fdr_summ_rows:
    fdr_summ_df.loc[_row_label,:] = get_count_info(*_count_tables)

print(fdr_summ_df.to_latex(float_format=(lambda x: "%.3f" % x)))

### Output data tables

In [None]:
# Pickled copies of the 1% FDR tables.
# NOTE(review): pickle files should only ever be re-loaded from a trusted location.
all_pept_fdr1.to_pickle("data/all_pept_fdr1.pck")
modified_pept_fdr1.to_pickle("data/modified_pept_fdr1.pck")
unmodified_pept_fdr1.to_pickle("data/unmodified_pept_fdr1.pck")
unmodified_posn_fdr1.to_pickle("data/unmodified_posn_fdr1.pck")

# The same four tables as tab-separated text; note that the full 1% FDR table is written
# under a different file name (ADDITIONAL_FILE_1_...) from its pickle.
all_pept_fdr1.to_csv("data/ADDITIONAL_FILE_1_all_pept_fdr1.tsv",sep="\t")
modified_pept_fdr1.to_csv("data/modified_pept_fdr1.tsv",sep="\t")
unmodified_pept_fdr1.to_csv("data/unmodified_pept_fdr1.tsv",sep="\t")
unmodified_posn_fdr1.to_csv("data/unmodified_posn_fdr1.tsv",sep="\t")

# Used by plot_allmod_x_time
all_pept_fdr1_modified_uniqpept.to_pickle("data/all_pept_fdr1_modified_uniqpept.pck")
all_pept_fdr1_total_uniqpept.to_pickle("data/all_pept_fdr1_total_uniqpept.pck")

# Protein-level counts; the file names say "proteins" where the variables say "prot"
all_pept_fdr1_modified_prot.to_pickle("data/all_pept_fdr1_modified_proteins.pck")
all_pept_fdr1_total_prot.to_pickle("data/all_pept_fdr1_total_proteins.pck")
