# a) filter mono- and intralinks on particle level / particle specific before merger script


- Read the .csv files and delete the 3 columns that were added by the merger script

Python 3.7.3
pandas 0.25.1
numpy 1.17.0

In [1]:
import os
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt

Functions

In [2]:
#function to pop out one column and position it somewhere else in the dataframe (pops out like popcorn,
#is thereby also deleted at the old position and inserted at the new one)
def move_column_inplace(df, col, pos):
    col = df.pop(col)
    df.insert(pos, col.name, col)
    return df

#function used to make sure that the A allele of paralog (homologous) r-proteins is the first entry in order 
#to map the r-protein timing of the Cruz/ Woolford 2015 paper to the r-protein clustering in this study
def myfunc(x,y,boolean):
    if boolean:
        return y
    else:
        return x
    
def stringreplace(df):
    df['XLType']=df['XLType'].str.replace ('decoy ', '')
    df['XLType']=df['XLType'].str.replace ('intralink', 'intra-protein xl')
    df['XLType']=df['XLType'].str.replace ('intra/inter xl', 'intra-protein xl')
    return df
    
def homology_check(df):
#important to correct for homologous proteins (e.g. ribosomal proteins in yeast) during def monolink_filter().
#If no homologous protein sequences are contained in the search databases used for xQuest, (hit_database AND(!) decoy_database) this function is not necessary.
#Works only properly after def stringreplace().
#If a peptide sequence fits to multiple proteins in the search database, xQuest writes all of these proteins into
#the columns 'Protein1' and/or 'Protein2'. The proteins are separated by a comma in the column.
#xQuest defines a link which involves homologous proteins as 'intra/inter- xl'. Usually these links should count as
#'intra-protein xl'. However, some links (e.g. links which involve a protein and the decoy_reverse_protein sequence) 
#are also marked as 'intra/inter xl'. The proteins involved in these links do not have the same sequence, they only 
#have part of their named in common. This is why they should actually be defined as 'inter-protein xl'.
#This function checks, if an 'intra-protein xl' is actually an 'intra-protein xl' 
#(between homologous proteins like  [sp|P36105|RL14A_YEAST, sp|P38754|RL14B_YEAST] and [sp|P36105|RL14A_YEAST, sp|P38754|RL14B_YEAST]) 
#or if it is an 'inter-protein xl' 
#(between non-homologous proteins like [sp|P36105|RL14A_YEAST, sp|P38754|RL14B_YEAST] and decoy_reverse_sp|P36105|RL14A_YEAST)

    #check how many intra-protein xl were changed to inter-protein xl
    test_xltype = df.groupby('XLType').apply(lambda x: len(x))
    print("\nThese are the XLTypes in your dataframe before correction for non-homologous proteins in intra-protein xls. \n" + str(test_xltype))    
    
    #create a temporary dataframe df_check_homol which contains only the problematic links
    #(links with Type == 'intralink' are links on 1 peptide and have a '-' in column 'Protein2', these may not be checked for homology
    #because def homology_check() would identify them as 'inter-protein xl' which would be wrong)
    bool_mask1 = (df.XLType == 'intra-protein xl')
    df_intra_temp1 = df[bool_mask1]
    
    bool_mask2 = (df_intra_temp1.Type != 'intralink')
    df_check_homol = df_intra_temp1[bool_mask2]

    #make lists out of all entries in one row of the columns 'Protein1' and 'Protein2'
    df_check_homol = df_check_homol.assign(Protein1=df_check_homol['Protein1'].str.split(','))
    df_check_homol = df_check_homol.assign(Protein2=df_check_homol['Protein2'].str.split(','))
    
    #check if a protein contained in the row-list of 'Protein1' column (e.g. [sp|P36105|RL14A_YEAST, sp|P38754|RL14B_YEAST])
    #is also contained in the row-list of 'Protein2' column (e.g. [sp|P36105|RL14A_YEAST, sp|P38754|RL14B_YEAST])
    #and save it as a boolean mask
    
    boo = df_check_homol.apply(lambda row: not set(row.Protein1).isdisjoint(set(row.Protein2)), axis=1)
    
    #add the boolean mask as an additional column termed 'Homology' to the temporary dataframe df_check_homol
    df_check_homol.loc[:,'Homology'] = boo
    
    #see how many times 'Homology' == False (these links should actually be 'inter-protein xl')
    print("\nThe intralinks in your dataframe were tested for homologous proteins.")
    test_homology = df_check_homol.groupby('Homology').apply(lambda x: len(x))
    print("\nLinks with 'Homology' = False are actually 'inter-protein xl'.\n" + str(test_homology))
    
    #the temporary dataframe df_check_homol is merged with the original df which gets the new column 'Homology'
    #therefore the lists in df_check_homol 'Protein1' and 'Protein2' are turned back into strings (to match the columns in df)
    mask_string1 = df_check_homol['Protein1'].apply(lambda x: ','.join(map(str, x)))
    mask_string2 = df_check_homol['Protein2'].apply(lambda x: ','.join(map(str, x)))
    df_check_homol.loc[:,'Protein1'] = mask_string1
    df_check_homol.loc[:,'Protein2'] = mask_string2

    df = df.merge(df_check_homol, how='outer') 
    
    #the 'XLType' in df is corrected to the term 'inter-protein xl' for all links with Homology == False
    mask_homology_false = (df['Homology'] == False)
    df['XLType'] = df['XLType'].mask(mask_homology_false, 'inter-protein xl')
    
    #check how many intra-protein xl were changed to inter-protein xl
    test_xltype_corr = df.groupby('XLType').apply(lambda x: len(x))
    print("\nThese are the XLTypes in your dataframe after correction for 'intra'-protein links between non-homologous proteins.\n" + str(test_xltype_corr))
    
    return df
  
def monolink_filter(df):
    #first get a list of all proteins for which a monolink or a intralink was found
    #then filter the original table with this list (or actually with this set)
    monolink_df = df[df.XLType == 'monolink']
    protein_list_monolinks1 = monolink_df['Protein1'].tolist()
    protein_list_monolinks2 = monolink_df['Protein2'].tolist()

    intralink_df = df[df.XLType == 'intra-protein xl']
    protein_list_intralinks1 = intralink_df['Protein1'].tolist()
    protein_list_intralinks2 = intralink_df['Protein2'].tolist()

    #concatenate lists (containing comma separated entries)
    filter_list = protein_list_intralinks1 + protein_list_monolinks1 + protein_list_monolinks2 + protein_list_intralinks2

    #make a set out of the list: a set can contain every entry only once (that's what we want)
    #this set contains the entries as they come from xquest: if a peptide can belong to multiple proteins out of homology reasons,
    #all of these proteins will be contained in the column 'Protein1' as a connected string separated by a comma 
    unique_filter_list = set(filter_list)

    #in order to not miss a protein, for which a monolink was found we make a second set which contains only single proteins
    #(not multiple ones as one entry in a connected string separated by commas)
    #therefore we have to explode the table on Protein1 and Protein2 column
    #don't continue to work with the exploded table after the list with the single proteins
    df_ex1 = df.assign(Protein1=df['Protein1'].str.split(',')).explode('Protein1')
    df_ex2 = df_ex1.assign(Protein2=df_ex1['Protein2'].str.split(',')).explode('Protein2')

    monolink_df_ex = df_ex2[df_ex2.XLType == 'monolink']
    protein_list_monolinks_ex1 = monolink_df_ex['Protein1'].tolist()
    protein_list_monolinks_ex2 = monolink_df_ex['Protein2'].tolist()

    intralink_df_ex = df_ex2[df_ex2.XLType == 'intra-protein xl']
    protein_list_intralinks_ex1 = intralink_df_ex['Protein1'].tolist()
    protein_list_intralinks_ex2 = intralink_df_ex['Protein2'].tolist()

    #concatenate exploded lists (containing single proteins)
    filter_list_ex = protein_list_intralinks_ex1 + protein_list_intralinks_ex2 + protein_list_monolinks_ex1 + protein_list_monolinks_ex2

    unique_filter_list_ex = set(filter_list_ex)

    #make a set which contains the comma separated entries and also the single proteins
    #this set contains all proteins for which a mono- or intralink was identified
    #it also contains a '-' which is important for the next step, when we filter for proteins with mono- or intralinks
    new_set = unique_filter_list | unique_filter_list_ex

    #filter for proteins with mono- or intralinks in both Protein1 and Protein2
    isin_df = df[(df['Protein1'].isin(new_set))&(df['Protein2'].isin(new_set))]
    return isin_df

#function to return the name of a dataframe
def get_df_name(df):
    name =[x for x in globals() if globals()[x] is df][0]
    return name


#function reads in 3 dataframes and generates a small helper dataframe containing the names 
#of the input dataframes
#each dataframe gets a new column 'origin_df' containing the name of the input dataframe
#the 3 input dataframes with the new column are concatenated
#stringreplace and homology_check have to be used in order to correct for homologous proteins 
#(e.g. ribosomal proteins in yeast) during def monolink_filter()
#only links >= ld-Score 25 are considered for the mono-/ intralink filter: only proteins which 
#have a mono or intralink >= 25 are considered for further analysis; links to all other proteins
#are filtered out by the function monolink_filter ()
#the filtered dataframe is splitted to 3 output dataframes of the biological repliactes 
#containing only links >= 25 to proteins which have a mono- or intralink >=25
#the 3 output dataframes are saved as .csv files and the concatenated dataframe is returned
def mi_filter_particle(df_1, df_2, df_3):
    df_hlp = pd.DataFrame([get_df_name(df_1),get_df_name(df_2),get_df_name(df_3)])
    df_1['origin_df'] = df_hlp.iloc[0,0]
    df_2['origin_df'] = df_hlp.iloc[1,0]
    df_3['origin_df'] = df_hlp.iloc[2,0]
    df = pd.concat([df_1, df_2, df_3])
    df = stringreplace(df)
    df = homology_check(df)
    df = df[df['ld-Score']>=20]
    df = monolink_filter(df)
    df_1_f = df[df['origin_df']==df_hlp.iloc[0,0]]
    filename_1 = os.path.join(root, (df_hlp.iloc[0,0]+'_mifp.csv'))
    df_1_f.to_csv(filename_1)
    df_2_f = df[df['origin_df']==df_hlp.iloc[1,0]]
    filename_2 = os.path.join(root, (df_hlp.iloc[1,0]+'_mifp.csv'))
    df_2_f.to_csv(filename_2)
    df_3_f = df[df['origin_df']==df_hlp.iloc[2,0]]
    filename_3 = os.path.join(root, (df_hlp.iloc[2,0]+'_mifp.csv'))
    df_3_f.to_csv(filename_3)
    return df

In [3]:
# directory that receives the filtered per-replicate .csv files
root = 'mi-filter'
# exist_ok=True makes re-running the cell safe without a separate existence
# check, and raises if the path exists but is not a directory
os.makedirs(root, exist_ok=True)


## Control pulldown (wt)

In [5]:
# Read the three replicates of the control pulldown (wt); sep=None lets the
# python parsing engine detect the delimiter. The dataframes have to be bound
# to module-level names because mi_filter_particle derives the origin labels
# and the output file names from those names.
df_I, df_II, df_III = (
    pd.read_csv(csv_name, sep=None, engine='python')
    for csv_name in ('wt_1_small.csv', 'wt_2_small.csv', 'wt_3_small.csv')
)

# filter the replicates together, write the per-replicate .csv files and
# display the concatenated result
df_concat_WT = mi_filter_particle(df_I, df_II, df_III)
df_concat_WT


These are the XLTypes in your dataframe before correction for non-homologous proteins in intra-protein xls. 
XLType
inter-protein xl    1256
intra-protein xl     418
monolink             560
dtype: int64

The intralinks in your dataframe were tested for homologous proteins.

Links with 'Homology' = False are actually 'inter-protein xl'.
Homology
False     51
True     317
dtype: int64

These are the XLTypes in your dataframe after correction for 'intra'-protein links between non-homologous proteins.
XLType
inter-protein xl    1307
intra-protein xl     367
monolink             560
dtype: int64


Unnamed: 0.1,Unnamed: 0,Rank,Id,Protein1,Protein2,Type,XLType,Spectrum,AbsPos1,AbsPos2,...,TicB,WTIC,intsum,deltaS,ld-Score,FDR,uxID,comment,origin_df,Homology
36,36,1,RVELTEIVDIYPNKLQINHYAR-ETKENDEEFYKNK-a14-b11,decoy_reverse_sp|P23638|PSA4_YEAST,sp|P32565|RPN2_YEAST,xlink,inter-protein xl,F1807_kammerk_003.017499.017499.6_F1807_kammer...,159,858,...,0.04,0.03,333,0.82,20.94,0.656,decoy_reverse_sp|P23638|PSA4_YEAST:159:x:sp|P3...,,df_I,
37,37,1,KPDDDKIVPLTEGDIQVLK-IFKEQDVAGNYGKR-a6-b3,sp|P33299|PRS7_YEAST,decoy_reverse_sp|P23638|PSA4_YEAST,xlink,inter-protein xl,F1807_kammerk_003.019897.019897.4_F1807_kammer...,25,31,...,0.02,0.05,308,0.49,20.49,0.714,decoy_reverse_sp|P23638|PSA4_YEAST:31:x:sp|P33...,,df_I,
38,38,1,EQDVAGNYGKR-GINLRKVAEK-a10-b6,decoy_reverse_sp|P23638|PSA4_YEAST,sp|Q01939|PRS8_YEAST,xlink,inter-protein xl,F1807_kammerk_004.016521.016521.5_F1807_kammer...,41,345,...,0.34,0.07,240,0.71,22.24,0.618,decoy_reverse_sp|P23638|PSA4_YEAST:41:x:sp|Q01...,,df_I,
49,49,1,VEVDKK-VEVDKK-a5-b5,decoy_reverse_sp|P25451|PSB3_YEAST,decoy_reverse_sp|P25451|PSB3_YEAST,xlink,intra-protein xl,F1807_kammerk_004.004867.004867.3_F1807_kammer...,14,14,...,0.03,0.01,79,0.78,21.26,0.241,decoy_reverse_sp|P25451|PSB3_YEAST:14:x:decoy_...,,df_I,True
67,67,1,TEQNQKKEPDYR-KETNREK-a7-b1,decoy_reverse_sp|P40016|RPN3_YEAST,decoy_reverse_sp|P32565|RPN2_YEAST,xlink,inter-protein xl,F1807_kammerk_004.005938.005938.4_F1807_kammer...,39,103,...,0.01,0.05,343,0.70,20.77,0.680,decoy_reverse_sp|P32565|RPN2_YEAST:103:x:decoy...,,df_I,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2229,806,1,KLDDKPSLVDVHLLESK-K1-155,sp|Q12377|RPN6_YEAST,-,monolink,monolink,F1807_kammerk_052.018696.018696.4_F1807_kammer...,167,,...,0.00,0.10,277,0.61,29.33,0.000,sp|Q12377|RPN6_YEAST:167,,df_III,
2230,807,1,KLDDKPSLVDVHLLESK-K5-155,sp|Q12377|RPN6_YEAST,-,monolink,monolink,F1807_kammerk_052.018696.018696.4_F1807_kammer...,171,,...,0.00,0.11,340,0.69,32.65,0.000,sp|Q12377|RPN6_YEAST:171,,df_III,
2231,808,1,KLDDKPSLVDVHLLESK-K5-156,sp|Q12377|RPN6_YEAST,-,monolink,monolink,F1807_kammerk_053.023998.023998.4_F1807_kammer...,171,,...,0.00,0.07,259,0.66,30.57,0.000,sp|Q12377|RPN6_YEAST:171,,df_III,
2232,809,1,KLDDKPSLVDVHLLESK-K5-156,sp|Q12377|RPN6_YEAST,-,monolink,monolink,F1807_kammerk_055.021007.021007.4_F1807_kammer...,171,,...,0.00,0.07,133,0.51,27.07,0.000,sp|Q12377|RPN6_YEAST:171,,df_III,
