In [1]:
import numpy as np
import pandas as pd

In [2]:
pathslist = ['/Users/plutzer/Box/CellBio-MajorLab/Separate PG files/1-22-2021-MA-ADCK1_PG.txt',
'/Users/plutzer/Box/CellBio-MajorLab/Separate PG files/1-29-2021-MA-CSNK1G1_PG.txt',
'/Users/plutzer/Box/CellBio-MajorLab/Separate PG files/1-29-2021-MA-CSNK1G2_PG.txt',
'/Users/plutzer/Box/CellBio-MajorLab/Separate PG files/1-29-2021-MA-CSNK1G3_PG.txt',
'/Users/plutzer/Box/CellBio-MajorLab/Separate PG files/1-25-2021-MA-VRK2_PG.txt']

In [3]:
starting_df = pd.read_csv(pathslist[0],delimiter = '\t')

In [4]:
new_df = pd.read_csv(pathslist[1],delimiter = '\t')

In [13]:
def merge_pgs(dataframe1,dataframe2 = None):
    '''
        Merges two proteinGroups files on simplified Protein IDs, and rectifies other columns needed for downstream analysis.
            If no second dataframe is given, the function will perform filtering and column simplification steps on just the 
            first dataframe. Otherwise, the first dataframe is assumed to already be rectified and the filtering/simplification
            will only be done on the second dataframe.
        dataframe1: pandas dataframe for first proteinGroups file
        dataframe2: pandas dataframe for second proteinGroups file. Default is null if just the first dataframe is being filtered. 
        Output: pandas dataframe for the merged proteinGroups file, or filtered dataframe.
    '''
    # If no second dataframe is given, just filter and fix the Protein IDs column of the first dataframe.
    if dataframe2 is None:
        # Use the simplify function to fix the Protein IDs column.
        dataframe1 = simplify_protein_IDs(dataframe1)
        #Filter out any rows where the protein group was identified only by type
        dataframe1 = dataframe1[dataframe1['Score'] > 0]
        # Return the filtered and fixed dataframe
        return dataframe1
    # If a second dataframe is given, it is assumed that the first dataframe is already filtered and rectified.
    else:
        # Simplify the Protein IDs column.
        dataframe2 = simplify_protein_IDs(dataframe2)
        #Filter out any rows where the protein group was identified only by type
        dataframe2 = dataframe2[dataframe2['Score'] > 0]
        
        # Merge the columns unique to the second dataframe with the first dataframe using protein IDs
        combined_df = dataframe1.merge(dataframe2[list(np.setdiff1d(dataframe2.columns,dataframe1.columns))+['Protein IDs']],how='outer',on='Protein IDs')
        
        ##### Next fix the columns of the new dataframe. I'm only fixing the columns that matter for downstream SAINT analysis.
        
        # Fix Score (max of scores for a given protein group)
        max_scores = []
        for group in combined_df['Protein IDs']:
            max_scores.append(np.max(list(dataframe1[dataframe1['Protein IDs'] == group]['Score'])+list(dataframe2[dataframe2['Protein IDs'] == group]['Score'])))
        
        # Fix Reverse (+ if either dataframe has a +)
        # Fix Contaminant (+ if either dataframe has a +)
        # Fix Sequence length (should be the same - still use max just in case)
        # Fix Identified by site (+ if either dataframe has a + ... just in case: shouldn't matter because I'm filtering these out to start)
        # Generate FDR (either by majority protein IDs, or by single proteinGroup (or maybe keep old method but still merge the same?))
        
        # Return the combined dataframe
        return combined_df

    

In [15]:
def simplify_protein_IDs(dataframe):
    '''
        Takes a proteinGroups dataframe and returns a proteinGroups dataframe with simplified protein IDs.
        dataframe: pandas Dataframe with a 'Protein IDs' column
        Output: pandas Dataframe with the 'Protein IDs' column simplified
    '''
    dataframe['Protein IDs'] = [protein_group.split(';')[0] for protein_group in dataframe['Protein IDs']]
    return dataframe

In [16]:
combed_df = merge_pgs(merge_pgs(starting_df),new_df)

In [17]:
combed_df

Unnamed: 0,Protein IDs,Majority protein IDs_x,Peptide counts (all)_x,Peptide counts (razor+unique)_x,Peptide counts (unique)_x,Protein names_x,Gene names_x,Fasta headers_x,Number of proteins_x,Peptides_x,...,id_y,Peptide IDs_y,Peptide is razor_y,Mod. peptide IDs_y,Evidence IDs_y,MS/MS IDs_y,Best MS/MS_y,Oxidation (M) site IDs_y,Oxidation (M) site positions_y,Taxonomy IDs_y
0,Q9NZJ9,Q9NZJ9;A0A024RBG1,1;1,1;1,1;1,,NUDT4;NUDT4B,sp|Q9NZJ9|NUDT4_HUMAN Diphosphoinositol polyph...,2.0,1.0,...,0.0,4198;12399,True;True,4450;13295,9274;9275;28305,11676;11677;11678;36551,11678;36551,,,-1;-1
1,P0DPI2,P0DPI2;A0A0B4J2D5,4;4,4;4,4;4,,GATD3A;GATD3B,sp|P0DPI2|GAL3A_HUMAN Glutamine amidotransfera...,2.0,4.0,...,,,,,,,,,,
2,A0A0U1RRL7,A0A0U1RRL7,1,1,1,,MMP24OS,sp|A0A0U1RRL7|MMPOS_HUMAN Protein MMP24OS OS=H...,1.0,1.0,...,,,,,,,,,,
3,A0AVF1,A0AVF1,1,1,1,,TTC26,sp|A0AVF1|IFT56_HUMAN Intraflagellar transport...,1.0,1.0,...,,,,,,,,,,
4,A0AVT1,A0AVT1,1,1,1,,UBA6,sp|A0AVT1|UBA6_HUMAN Ubiquitin-like modifier-a...,1.0,1.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3963,REV__Q9H410,,,,,,,,,,...,3479.0,9268,True,9805,20969,27022,27022,,,-1
3964,REV__Q9NQ89,,,,,,,,,,...,3480.0,2085,True,2218,4676;4677;4678,5955;5956,5955,,,-1
3965,REV__Q9NXW9,,,,,,,,,,...,3482.0,6698;9642,True;True,7078;10190,14917;14918;14919;14920;21755,19019;19020;19021;27981,19021;27981,,,-1
3966,REV__Q9P2T0,,,,,,,,,,...,3483.0,9317,True,9855,21058;21059;21060;21061,27130;27131;27132,27131,,,-1


In [20]:
testdf = starting_df.merge(new_df[list(np.setdiff1d(new_df.columns,starting_df.columns))+['Protein IDs']],how='outer',on='Protein IDs')
testdf

Unnamed: 0,Protein IDs,Majority protein IDs,Peptide counts (all),Peptide counts (razor+unique),Peptide counts (unique),Protein names,Gene names,Fasta headers,Number of proteins,Peptides,...,MS/MS count CK1G1_1,MS/MS count CK1G1_2,Peptides CK1G1_1,Peptides CK1G1_2,Razor + unique peptides CK1G1_1,Razor + unique peptides CK1G1_2,Sequence coverage CK1G1_1 [%],Sequence coverage CK1G1_2 [%],Unique peptides CK1G1_1,Unique peptides CK1G1_2
0,Q9NZJ9,Q9NZJ9;A0A024RBG1,1;1,1;1,1;1,,NUDT4;NUDT4B,sp|Q9NZJ9|NUDT4_HUMAN Diphosphoinositol polyph...,2.0,1.0,...,0.0,1.0,1.0,2.0,1.0,2.0,5.0,12.8,0.0,1.0
1,P0DPI2,P0DPI2;A0A0B4J2D5,4;4,4;4,4;4,,GATD3A;GATD3B,sp|P0DPI2|GAL3A_HUMAN Glutamine amidotransfera...,2.0,4.0,...,,,,,,,,,,
2,A0A0U1RRL7,A0A0U1RRL7,1,1,1,,MMP24OS,sp|A0A0U1RRL7|MMPOS_HUMAN Protein MMP24OS OS=H...,1.0,1.0,...,,,,,,,,,,
3,A0AVF1,A0AVF1,1,1,1,,TTC26,sp|A0AVF1|IFT56_HUMAN Intraflagellar transport...,1.0,1.0,...,,,,,,,,,,
4,A0AVT1,A0AVT1,1,1,1,,UBA6,sp|A0AVT1|UBA6_HUMAN Ubiquitin-like modifier-a...,1.0,1.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4114,REV__Q9NQ89,,,,,,,,,,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0
4115,REV__Q9NXE4,,,,,,,,,,...,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0
4116,REV__Q9NXW9,,,,,,,,,,...,3.0,1.0,2.0,1.0,2.0,1.0,0.0,0.0,2.0,1.0
4117,REV__Q9P2T0,,,,,,,,,,...,1.0,2.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0


In [42]:
max_scores = []
for group in list(testdf['Protein IDs']):
    max_scores.append(np.max(list(starting_df[starting_df['Protein IDs'] == group]['Score'])+list(new_df[new_df['Protein IDs'] == group]['Score'])))
        

In [49]:
reverse = []
for group in list(testdf['Protein IDs']):
    print(list(starting_df[starting_df['Protein IDs'] == group]['Reverse']))
    '''
    if ('+' in ''.join(str(starting_df[starting_df['Protein IDs'] == group]['Reverse']))) or ('+' in ''.join(str(new_df[new_df['Protein IDs'] == group]))):
        print(''.join(starting_df[starting_df['Protein IDs'] == group]['Reverse'].astype('str')))
        reverse.append('+')
    else:
        reverse.append('')
    '''

[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan

[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan

[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[

In [47]:
starting_df['Reverse']

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
       ... 
2639      +
2640      +
2641      +
2642      +
2643      +
Name: Reverse, Length: 2644, dtype: object

In [31]:
np.unique(reverse)

array([''], dtype='<U1')

In [41]:
list(combed_df.columns)

['Protein IDs',
 'Majority protein IDs_x',
 'Peptide counts (all)_x',
 'Peptide counts (razor+unique)_x',
 'Peptide counts (unique)_x',
 'Protein names_x',
 'Gene names_x',
 'Fasta headers_x',
 'Number of proteins_x',
 'Peptides_x',
 'Razor + unique peptides_x',
 'Unique peptides_x',
 'Peptides ADCK1_1',
 'Peptides ADCK1_2',
 'Razor + unique peptides ADCK1_1',
 'Razor + unique peptides ADCK1_2',
 'Unique peptides ADCK1_1',
 'Unique peptides ADCK1_2',
 'Sequence coverage [%]_x',
 'Unique + razor sequence coverage [%]_x',
 'Unique sequence coverage [%]_x',
 'Mol. weight [kDa]_x',
 'Sequence length_x',
 'Sequence lengths_x',
 'Fraction average_x',
 'Fraction 1_x',
 'Q-value_x',
 'Score_x',
 'Identification type ADCK1_1',
 'Identification type ADCK1_2',
 'Sequence coverage ADCK1_1 [%]',
 'Sequence coverage ADCK1_2 [%]',
 'Intensity_x',
 'Intensity ADCK1_1',
 'Intensity ADCK1_2',
 'LFQ intensity ADCK1_1',
 'LFQ intensity ADCK1_2',
 'MS/MS count ADCK1_1',
 'MS/MS count ADCK1_2',
 'MS/MS coun