In [54]:
import pandas as pd
import numpy as np
import re

In [55]:
def modifications(input_string):
    poslist = []
    modlist = []
    if "(ac)" in input_string:
        poslist.append(0)
        modlist.append('Acetyl')
        input_string_noac = input_string.replace('(ac)', '')
    else:
        input_string_noac = input_string

    while re.search("\(ox\)", input_string_noac):
        match = re.search("\(ox\)", input_string_noac)
        pos = match.span()[0]
        poslist.append(pos)
        modlist.append('Oxidation')
        input_string_noac = re.sub('\(ox\)', '', input_string_noac, count=1)

    if 'GlyGly' in input_string_noac:
        match = re.search('GlyGly', input_string_noac)
        pos = match.span()[0]
        poslist.append(pos)
        modlist.append('GlyGly')
        input_string_noac = re.sub('GlyGly', '', input_string_noac, count=1)

    C_positions = [pos for pos, char in enumerate(input_string_noac) if char == 'C']
    for i in C_positions:
        modlist.append("Carbamidomethyl")
        poslist.append(i+1)

    sortedposlist = sorted(poslist)
    sortedmodlist = [x for _, x in sorted(zip(poslist, modlist))]

    formattedlist = []
    for (pos, mod) in zip(sortedposlist, sortedmodlist):
        formattedlist.append(str(pos)+'|'+str(mod))

    output = '|'.join(formattedlist)
    return output

In [56]:
data = pd.read_csv('/home/robbe/IM2DeepMulti/dataset/Evaluation_dataset_with_overlap_v2.csv')

In [57]:
data

Unnamed: 0,Modified sequence,Intensity,Charge,m/z,Retention time,Raw file,Experiment,1/K0,CCS
0,_(Acetyl (Protein N-term))AAAAAAAAAAGAAGGR_,58830.0,2,620.823275,28.682,7_S1-B2_1_7050,B3,1.033297,418.47580
1,_(Acetyl (Protein N-term))AAAAAAAAAAGAAGGR_,51048.0,2,620.823275,28.545,8_S1-B4_1_7052,B4,1.033297,418.47577
2,_(Acetyl (Protein N-term))AAAAAAAAAAGDSDSWDADT...,33143.0,3,970.766403,37.406,7_S1-B2_1_7050,B3,1.123101,677.93670
3,_(Acetyl (Protein N-term))AAAAAAGAASGLPGPVAQGLK_,176510.0,2,895.991600,34.106,10_S1-B8_1_7056,C2,1.248190,503.78998
4,_(Acetyl (Protein N-term))AAAAAAGAASGLPGPVAQGLK_,73423.0,2,895.991600,34.024,11_S1-B10_1_7058,C3,1.245645,502.76257
...,...,...,...,...,...,...,...,...,...
146624,_YYVTIIDAPGHRDFIK_,28209.0,3,636.671988,25.584,3_S1-A6_1_7042,A3,0.904345,547.25200
146625,_YYVTIIDAPGHRDFIK_,22666.0,3,636.671988,25.702,4_S1-A8_1_7044,A4,0.904345,547.25200
146626,_YYVTIIDAPGHRDFIK_,29843.0,3,636.671988,25.646,5_S1-A10_1_7046,B1,0.904345,547.25200
146627,_YYVTIIDAPGHRDFIK_,58577.0,3,636.671988,25.706,6_S1-A12_1_7048,B2,0.904345,547.25200


In [58]:
# get everything between brackets of data['Modified sequence']
data['between_brackets'] = data['Modified sequence'].str.extract(r'\((.*?)\)', expand=False)

In [59]:
data_notnan = data[data['between_brackets'].notna()]

In [60]:
# Strip "_" from sequence
data["sequence"] = data["Modified sequence"].str.strip("_")

# Strip everything between "()" and "[]" from sequence
data["sequence"] = data["sequence"].str.replace(r"[\(\[].*?[\)\]]", "", regex=True)


In [61]:
data['Modified sequence'] = data['Modified sequence'].str.strip("_")
data['sequence'] = data['sequence'].str.replace(")", "")

In [62]:
# Parse modifications from Modified sequence
data['Modified sequence'] = data['Modified sequence'].str.replace('(Acetyl (Protein N-term))', '(ac)')
data['Modified sequence'] = data['Modified sequence'].str.replace('(Oxidation (M))', '(ox)')
data['Modified sequence'] = data['Modified sequence'].str.replace('(GlyGly (K))', 'GlyGly')

In [63]:
data['modifications'] = data['Modified sequence'].apply(modifications)
data['modifications'] = data['modifications'].replace('', 'Not modified')

In [64]:
data.shape

(146629, 12)

In [65]:
# Remove rows with GlyGly
data = data[~data['modifications'].str.contains('GlyGly')]

In [66]:
data.shape

(146629, 12)

In [67]:
data

Unnamed: 0,Modified sequence,Intensity,Charge,m/z,Retention time,Raw file,Experiment,1/K0,CCS,between_brackets,sequence,modifications
0,(ac)AAAAAAAAAAGAAGGR,58830.0,2,620.823275,28.682,7_S1-B2_1_7050,B3,1.033297,418.47580,Acetyl (Protein N-term,AAAAAAAAAAGAAGGR,0|Acetyl
1,(ac)AAAAAAAAAAGAAGGR,51048.0,2,620.823275,28.545,8_S1-B4_1_7052,B4,1.033297,418.47577,Acetyl (Protein N-term,AAAAAAAAAAGAAGGR,0|Acetyl
2,(ac)AAAAAAAAAAGDSDSWDADTFSMEDPVRK,33143.0,3,970.766403,37.406,7_S1-B2_1_7050,B3,1.123101,677.93670,Acetyl (Protein N-term,AAAAAAAAAAGDSDSWDADTFSMEDPVRK,0|Acetyl
3,(ac)AAAAAAGAASGLPGPVAQGLK,176510.0,2,895.991600,34.106,10_S1-B8_1_7056,C2,1.248190,503.78998,Acetyl (Protein N-term,AAAAAAGAASGLPGPVAQGLK,0|Acetyl
4,(ac)AAAAAAGAASGLPGPVAQGLK,73423.0,2,895.991600,34.024,11_S1-B10_1_7058,C3,1.245645,502.76257,Acetyl (Protein N-term,AAAAAAGAASGLPGPVAQGLK,0|Acetyl
...,...,...,...,...,...,...,...,...,...,...,...,...
146624,YYVTIIDAPGHRDFIK,28209.0,3,636.671988,25.584,3_S1-A6_1_7042,A3,0.904345,547.25200,,YYVTIIDAPGHRDFIK,Not modified
146625,YYVTIIDAPGHRDFIK,22666.0,3,636.671988,25.702,4_S1-A8_1_7044,A4,0.904345,547.25200,,YYVTIIDAPGHRDFIK,Not modified
146626,YYVTIIDAPGHRDFIK,29843.0,3,636.671988,25.646,5_S1-A10_1_7046,B1,0.904345,547.25200,,YYVTIIDAPGHRDFIK,Not modified
146627,YYVTIIDAPGHRDFIK,58577.0,3,636.671988,25.706,6_S1-A12_1_7048,B2,0.904345,547.25200,,YYVTIIDAPGHRDFIK,Not modified


In [68]:
# data['CCS'] = data['CCS'].apply(lambda x: np.array(x))

In [69]:
# data['multimodal'] = data['CCS'].apply(lambda x: x[0] != x[1])

In [70]:
# multimodals = data[data['multimodal']]

In [71]:
data.rename(columns={'sequence':'seq', "Charge":"charge"}, inplace=True)

In [72]:
data['tr'] = 0

In [73]:
data

Unnamed: 0,Modified sequence,Intensity,charge,m/z,Retention time,Raw file,Experiment,1/K0,CCS,between_brackets,seq,modifications,tr
0,(ac)AAAAAAAAAAGAAGGR,58830.0,2,620.823275,28.682,7_S1-B2_1_7050,B3,1.033297,418.47580,Acetyl (Protein N-term,AAAAAAAAAAGAAGGR,0|Acetyl,0
1,(ac)AAAAAAAAAAGAAGGR,51048.0,2,620.823275,28.545,8_S1-B4_1_7052,B4,1.033297,418.47577,Acetyl (Protein N-term,AAAAAAAAAAGAAGGR,0|Acetyl,0
2,(ac)AAAAAAAAAAGDSDSWDADTFSMEDPVRK,33143.0,3,970.766403,37.406,7_S1-B2_1_7050,B3,1.123101,677.93670,Acetyl (Protein N-term,AAAAAAAAAAGDSDSWDADTFSMEDPVRK,0|Acetyl,0
3,(ac)AAAAAAGAASGLPGPVAQGLK,176510.0,2,895.991600,34.106,10_S1-B8_1_7056,C2,1.248190,503.78998,Acetyl (Protein N-term,AAAAAAGAASGLPGPVAQGLK,0|Acetyl,0
4,(ac)AAAAAAGAASGLPGPVAQGLK,73423.0,2,895.991600,34.024,11_S1-B10_1_7058,C3,1.245645,502.76257,Acetyl (Protein N-term,AAAAAAGAASGLPGPVAQGLK,0|Acetyl,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
146624,YYVTIIDAPGHRDFIK,28209.0,3,636.671988,25.584,3_S1-A6_1_7042,A3,0.904345,547.25200,,YYVTIIDAPGHRDFIK,Not modified,0
146625,YYVTIIDAPGHRDFIK,22666.0,3,636.671988,25.702,4_S1-A8_1_7044,A4,0.904345,547.25200,,YYVTIIDAPGHRDFIK,Not modified,0
146626,YYVTIIDAPGHRDFIK,29843.0,3,636.671988,25.646,5_S1-A10_1_7046,B1,0.904345,547.25200,,YYVTIIDAPGHRDFIK,Not modified,0
146627,YYVTIIDAPGHRDFIK,58577.0,3,636.671988,25.706,6_S1-A12_1_7048,B2,0.904345,547.25200,,YYVTIIDAPGHRDFIK,Not modified,0


In [74]:
data.to_pickle('/home/robbe/IM2DeepMulti/Evaluation_dataset_v2_with_overlap.pkl')

In [75]:
data.to_csv('/home/robbe/IM2DeepMulti/Evaluation_dataset_v2_with_overlap.csv')