In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
def modifications(input_string):
    poslist = []
    modlist = []
    if "(ac)" in input_string:
        poslist.append(0)
        modlist.append('Acetyl')
        input_string_noac = input_string.replace('(ac)', '')
    else:
        input_string_noac = input_string

    while re.search("\(ox\)", input_string_noac):
        match = re.search("\(ox\)", input_string_noac)
        pos = match.span()[0]
        poslist.append(pos)
        modlist.append('Oxidation')
        input_string_noac = re.sub('\(ox\)', '', input_string_noac, count=1)

    C_positions = [pos for pos, char in enumerate(input_string_noac) if char == 'C']
    for i in C_positions:
        modlist.append("Carbamidomethyl")
        poslist.append(i+1)

    sortedposlist = sorted(poslist)
    sortedmodlist = [x for _, x in sorted(zip(poslist, modlist))]

    formattedlist = []
    for (pos, mod) in zip(sortedposlist, sortedmodlist):
        formattedlist.append(str(pos)+'|'+str(mod))

    output = '|'.join(formattedlist)
    return output

In [4]:
data = pd.read_csv('/home/robbe/IM2DeepMulti/aligned_with_multimodals_conformers_unique_grouped.csv', sep=";", converters= {"CCS":pd.eval})

In [5]:
# Strip "_" from sequence
data["sequence"] = data["Modified sequence"].str.strip("_")

# Strip everything between "()" and "[]" from sequence
data["sequence"] = data["sequence"].str.replace(r"[\(\[].*?[\)\]]", "", regex=True)


In [6]:
data['Modified sequence'] = data['Modified sequence'].str.strip("_")
data['sequence'] = data['sequence'].str.replace(")", "")

In [7]:
# Parse modifications from Modified sequence
data['Modified sequence'] = data['Modified sequence'].str.replace('(Acetyl (Protein N-term))', '(ac)')
data['Modified sequence'] = data['Modified sequence'].str.replace('(Oxidation (M))', '(ox)')

In [8]:
data['modifications'] = data['Modified sequence'].apply(modifications)
data['modifications'] = data['modifications'].replace('', 'Not modified')

In [9]:
data['CCS'] = data['CCS'].apply(lambda x: np.array(x))

In [11]:
data['multimodal'] = data['CCS'].apply(lambda x: x[0] != x[1])

In [13]:
multimodals = data[data['multimodal']]

In [14]:
multimodals

Unnamed: 0,Modified sequence,Charge,CCS,sequence,modifications,multimodal
216,(ac)ADDLDFETGDAGASATFPMQCSALR,2,"[553.8808011101869, 593.3325884238297]",ADDLDFETGDAGASATFPMQCSALR,0|Acetyl|21|Carbamidomethyl,True
369,(ac)AFPHRPDAPELPDFSMLK,3,"[550.8340256946149, 589.9632800847153]",AFPHRPDAPELPDFSMLK,0|Acetyl,True
493,(ac)ASAVSPANLPAVLLQPR,2,"[505.073150712638, 454.6258100370671]",ASAVSPANLPAVLLQPR,0|Acetyl,True
601,(ac)AVADLALIPDVDIDSDGVFK,3,"[599.3350827093011, 564.319130116306]",AVADLALIPDVDIDSDGVFK,0|Acetyl,True
634,(ac)AYHSFLVEPISCHAWNK,3,"[576.2263509682161, 533.4697989038966]",AYHSFLVEPISCHAWNK,0|Acetyl|12|Carbamidomethyl,True
...,...,...,...,...,...,...
119526,YWDVPPPGFEHITPMQYK,3,"[549.937176450393, 637.4132017215193]",YWDVPPPGFEHITPMQYK,Not modified,True
119553,YWPQEAGEYAVHVLCNSEDIR,3,"[596.4863495710541, 663.6436940946516]",YWPQEAGEYAVHVLCNSEDIR,15|Carbamidomethyl,True
119605,YYEAADTVTQFDNVR,2,"[491.62790291818175, 454.37298289751044]",YYEAADTVTQFDNVR,Not modified,True
119656,YYIHDLSDLIDCCDLGYHASLNR,4,"[754.8011838821079, 669.2202440195756]",YYIHDLSDLIDCCDLGYHASLNR,12|Carbamidomethyl|13|Carbamidomethyl,True


In [15]:
multimodals.rename(columns={'sequence':'seq', "Charge":"charge"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  multimodals.rename(columns={'sequence':'seq', "Charge":"charge"}, inplace=True)


In [16]:
multimodals['tr'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  multimodals['tr'] = 0


In [18]:
multimodals.to_pickle('/home/robbe/IM2DeepMulti/multimodals.pkl')