In [39]:
import pandas as pd
import numpy as np
import re

In [40]:
def modifications(input_string):
    poslist = []
    modlist = []
    if "(ac)" in input_string:
        poslist.append(0)
        modlist.append('Acetyl')
        input_string_noac = input_string.replace('(ac)', '')
    else:
        input_string_noac = input_string

    while re.search("\(ox\)", input_string_noac):
        match = re.search("\(ox\)", input_string_noac)
        pos = match.span()[0]
        poslist.append(pos)
        modlist.append('Oxidation')
        input_string_noac = re.sub('\(ox\)', '', input_string_noac, count=1)

    if 'GlyGly' in input_string_noac:
        match = re.search('GlyGly', input_string_noac)
        pos = match.span()[0]
        poslist.append(pos)
        modlist.append('GlyGly')
        input_string_noac = re.sub('GlyGly', '', input_string_noac, count=1)

    C_positions = [pos for pos, char in enumerate(input_string_noac) if char == 'C']
    for i in C_positions:
        modlist.append("Carbamidomethyl")
        poslist.append(i+1)

    sortedposlist = sorted(poslist)
    sortedmodlist = [x for _, x in sorted(zip(poslist, modlist))]

    formattedlist = []
    for (pos, mod) in zip(sortedposlist, sortedmodlist):
        formattedlist.append(str(pos)+'|'+str(mod))

    output = '|'.join(formattedlist)
    return output

In [41]:
data = pd.read_pickle('/home/robbe/IM2DeepMulti/dataset/final_conformers_unique_grouped.pkl')

In [42]:
# get everything between brackets of data['Modified sequence']
data['between_brackets'] = data['Modified sequence'].str.extract(r'\((.*?)\)', expand=False)

In [43]:
data_notnan = data[data['between_brackets'].notna()]

In [44]:
# Strip "_" from sequence
data["sequence"] = data["Modified sequence"].str.strip("_")

# Strip everything between "()" and "[]" from sequence
data["sequence"] = data["sequence"].str.replace(r"[\(\[].*?[\)\]]", "", regex=True)


In [45]:
data['Modified sequence'] = data['Modified sequence'].str.strip("_")
data['sequence'] = data['sequence'].str.replace(")", "")

In [46]:
# Parse modifications from Modified sequence
data['Modified sequence'] = data['Modified sequence'].str.replace('(Acetyl (Protein N-term))', '(ac)')
data['Modified sequence'] = data['Modified sequence'].str.replace('(Oxidation (M))', '(ox)')
data['Modified sequence'] = data['Modified sequence'].str.replace('(GlyGly (K))', 'GlyGly')

In [47]:
data['modifications'] = data['Modified sequence'].apply(modifications)
data['modifications'] = data['modifications'].replace('', 'Not modified')

In [48]:
data['CCS'] = data['CCS'].apply(lambda x: np.array(x))

In [49]:
data['multimodal'] = data['CCS'].apply(lambda x: x[0] != x[1])

In [50]:
multimodals = data[data['multimodal']]

In [51]:
multimodals

Unnamed: 0,Modified sequence,Charge,CCS,between_brackets,sequence,modifications,multimodal
27,(ac)AAAAASAPQQLSDEELFSQLR,3,"[606.7366658046817, 620.9744066641124]",Acetyl (Protein N-term,AAAAASAPQQLSDEELFSQLR,0|Acetyl,True
32,(ac)AAAAECDVVMAATEPELLDDQEAK,2,"[592.059490518594, 569.4993303204092]",Acetyl (Protein N-term,AAAAECDVVMAATEPELLDDQEAK,0|Acetyl|6|Carbamidomethyl,True
48,(ac)AAAAVSESWPELELAER,2,"[507.5739603232803, 491.76437437141686]",Acetyl (Protein N-term,AAAAVSESWPELELAER,0|Acetyl,True
55,(ac)AAAEAANCIMEVSCGQAESSEKPNAEDMTSK,3,"[670.4411322249787, 687.1666992937121]",Acetyl (Protein N-term,AAAEAANCIMEVSCGQAESSEKPNAEDMTSK,0|Acetyl|8|Carbamidomethyl|14|Carbamidomethyl,True
70,(ac)AAAPPSYCFVAFPPR,2,"[459.0285749239082, 469.80129636700474]",Acetyl (Protein N-term,AAAPPSYCFVAFPPR,0|Acetyl|8|Carbamidomethyl,True
...,...,...,...,...,...,...,...
168384,YYEAADTVTQFDNVR,2,"[448.78183696046887, 488.48148395570104]",,YYEAADTVTQFDNVR,Not modified,True
168401,YYEVLGAAATTDYNNNHEGREEDQR,3,"[701.2847436917783, 681.1808928127417]",,YYEVLGAAATTDYNNNHEGREEDQR,Not modified,True
168454,YYKTIDDLKNQILNLTTDNANILLQIDNAR,4,"[776.0645745924537, 700.46461195494]",,YYKTIDDLKNQILNLTTDNANILLQIDNAR,Not modified,True
168503,YYPTEDVPR,2,"[381.96414347459046, 359.97420674223264]",,YYPTEDVPR,Not modified,True


In [52]:
multimodals.rename(columns={'sequence':'seq', "Charge":"charge"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  multimodals.rename(columns={'sequence':'seq', "Charge":"charge"}, inplace=True)


In [53]:
multimodals['tr'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  multimodals['tr'] = 0


In [54]:
multimodals.to_pickle('/home/robbe/IM2DeepMulti/multimodals_more.pkl')

In [55]:
multimodals.to_csv('/home/robbe/IM2DeepMulti/multimodals_more.csv')