# Preprocessing MassSpecGym dataset

In [None]:
import numpy as np
import pandas as pd
from pandarallel import pandarallel
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator

pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [None]:
# modify the path where to save the formatted dataset
path = '../../../data/MSG/'

In [None]:
# downloading initial dataset
df = pd.read_csv("hf://datasets/roman-bushuiev/MassSpecGym/data/MassSpecGym.tsv", sep="\t")
df

Unnamed: 0,identifier,mzs,intensities,smiles,inchikey,formula,precursor_formula,parent_mass,precursor_mz,adduct,instrument_type,collision_energy,fold,simulation_challenge
0,MassSpecGymID0000001,"91.0542,125.0233,154.0499,155.0577,185.0961,20...","0.24524524524524524,1.0,0.08008008008008008,0....",CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,[M+H]+,Orbitrap,30.0,train,True
1,MassSpecGymID0000002,"91.0542,125.0233,155.0577,185.0961,229.0859,24...","0.0990990990990991,0.28128128128128127,0.04004...",CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,[M+H]+,Orbitrap,20.0,train,True
2,MassSpecGymID0000003,"69.0343,91.0542,125.0233,127.039,153.0699,154....","0.03403403403403404,0.31431431431431434,1.0,0....",CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,[M+H]+,Orbitrap,40.0,train,True
3,MassSpecGymID0000004,"69.0343,91.0542,110.06,111.0441,112.0393,120.0...","0.17917917917917917,0.47347347347347346,0.0380...",CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,[M+H]+,Orbitrap,55.0,train,True
4,MassSpecGymID0000005,"91.0542,125.0233,185.0961,229.0859,246.1125,28...","0.07807807807807808,0.1841841841841842,0.03503...",CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,[M+H]+,Orbitrap,10.0,train,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231099,MassSpecGymID0414168,"55.054474,58.029369,58.065601,59.049339,69.032...","0.0004935684823754727,0.00012405427491363124,0...",CC[C@@H]1[C@H](/C=C(/C=C\C(=O)[C@@H](C[C@@H]([...,WBPYTXDJUQJLPQ,C46H77NO17,C46H78NO17,915.521724,916.5290,[M+H]+,QTOF,,val,False
231100,MassSpecGymID0414171,"72.043739,86.060242,98.058304,99.0634,126.0537...","0.0019527744563354998,0.0016939927337585416,0....",C[C@@]1([C@H]2C[C@H]3[C@@H](C(=O)C(=C([C@]3(C(...,DHPRQBPJLMKORJ,C22H23ClN2O8,C22H24ClN2O8,478.115724,479.1230,[M+H]+,QTOF,,test,False
231101,MassSpecGymID0414172,"72.080132,102.233917,113.082344,113.121498,114...","0.021173783463613503,0.007516001055215398,0.03...",C[C@H]([C@@H]1CC[C@H]([C@H](O1)O[C@@H]2[C@H](C...,CEAZRRDELHUEMR,C21H43N5O7,C21H44N5O7,477.317724,478.3250,[M+H]+,QTOF,,val,False
231102,MassSpecGymID0414173,"55.053627,56.455425,58.065819,67.053543,82.065...","0.031731527156456024,0.017878400151027027,0.07...",C[C@H]([C@@H]1CC[C@H]([C@H](O1)O[C@@H]2[C@H](C...,CEAZRRDELHUEMR,C21H43N5O7,C21H44N5O7,477.317724,478.3250,[M+H]+,QTOF,,val,False


In [3]:
print('# train = ', len(df[df['fold'].eq('train')]))
print('# val = ', len(df[df['fold'].eq('val')]))
print('# test = ', len(df[df['fold'].eq('test')]))

# train =  194119
# val =  19429
# test =  17556


In [9]:
# in the code we read lists of floats, not lists of strings, so we need to re-format both msz and intensities
# intensities are normalized to 1 here but in our code we use normalization to 100, so we multiply by 100
# easier to create new columns in the df

def initial_refactor(data):
    data = data.rename(columns={'collision_energy':'coll_en'})
    data['adduct']=['H' if a=='[M+H]+' else 'Na' for a in data['adduct']]
    data['coll_en']=['-1' if np.isnan(a) else str(a) for a in data['coll_en']]

    return data

df = initial_refactor(df)
df['mzs_float'] = df['mzs'].map(lambda mz : [float(mz_val) for mz_val in mz.split(',')])
df['int_float'] = df['intensities'].map(lambda i : [float(i_val) *100 for i_val in i.split(',')])
# we need only one column with paired msz and respective intensity, so we combine every msz with the respective intensity for every peak
df['spectrum'] = df.apply(lambda row : [[mz, intensity] for mz, intensity in zip(row['mzs_float'], row['int_float'])], axis=1)

# remove noisy peaks first (intensity < 1)
df['mask'] = df['int_float'].map(lambda x : [float(intens)>1 for intens in x])
df['spectrum'] = df.apply(lambda row : [[mz, intensity] for mz, intensity, cond in zip(row['mzs_float'], row['int_float'], row['mask']) if cond], axis=1)
df

Unnamed: 0,identifier,mzs,intensities,smiles,inchikey,formula,precursor_formula,parent_mass,precursor_mz,adduct,instrument_type,coll_en,fold,simulation_challenge,mzs_float,int_float,spectrum,mask
0,MassSpecGymID0000001,"91.0542,125.0233,154.0499,155.0577,185.0961,20...","0.24524524524524524,1.0,0.08008008008008008,0....",CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,H,Orbitrap,30.0,train,True,"[91.0542, 125.0233, 154.0499, 155.0577, 185.09...","[24.524524524524523, 100.0, 8.008008008008009,...","[[91.0542, 24.524524524524523], [125.0233, 100...","[True, True, True, True, True, True, True, True]"
1,MassSpecGymID0000002,"91.0542,125.0233,155.0577,185.0961,229.0859,24...","0.0990990990990991,0.28128128128128127,0.04004...",CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,H,Orbitrap,20.0,train,True,"[91.0542, 125.0233, 155.0577, 185.0961, 229.08...","[9.90990990990991, 28.128128128128125, 4.00400...","[[91.0542, 9.90990990990991], [125.0233, 28.12...","[True, True, True, True, True, True]"
2,MassSpecGymID0000003,"69.0343,91.0542,125.0233,127.039,153.0699,154....","0.03403403403403404,0.31431431431431434,1.0,0....",CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,H,Orbitrap,40.0,train,True,"[69.0343, 91.0542, 125.0233, 127.039, 153.0699...","[3.4034034034034035, 31.431431431431434, 100.0...","[[69.0343, 3.4034034034034035], [91.0542, 31.4...","[True, True, True, True, True, True, True, Tru..."
3,MassSpecGymID0000004,"69.0343,91.0542,110.06,111.0441,112.0393,120.0...","0.17917917917917917,0.47347347347347346,0.0380...",CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,H,Orbitrap,55.0,train,True,"[69.0343, 91.0542, 110.06, 111.0441, 112.0393,...","[17.917917917917915, 47.347347347347345, 3.803...","[[69.0343, 17.917917917917915], [91.0542, 47.3...","[True, True, True, True, True, True, True, Tru..."
4,MassSpecGymID0000005,"91.0542,125.0233,185.0961,229.0859,246.1125,28...","0.07807807807807808,0.1841841841841842,0.03503...",CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,H,Orbitrap,10.0,train,True,"[91.0542, 125.0233, 185.0961, 229.0859, 246.11...","[7.807807807807808, 18.41841841841842, 3.50350...","[[91.0542, 7.807807807807808], [125.0233, 18.4...","[True, True, True, True, True, True]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231099,MassSpecGymID0414168,"55.054474,58.029369,58.065601,59.049339,69.032...","0.0004935684823754727,0.00012405427491363124,0...",CC[C@@H]1[C@H](/C=C(/C=C\C(=O)[C@@H](C[C@@H]([...,WBPYTXDJUQJLPQ,C46H77NO17,C46H78NO17,915.521724,916.5290,H,QTOF,-1,val,False,"[55.054474, 58.029369, 58.065601, 59.049339, 6...","[0.04935684823754727, 0.012405427491363124, 0....","[[101.05954, 1.6709916364850783], [145.085648,...","[False, False, False, False, False, False, Fal..."
231100,MassSpecGymID0414171,"72.043739,86.060242,98.058304,99.0634,126.0537...","0.0019527744563354998,0.0016939927337585416,0....",C[C@@]1([C@H]2C[C@H]3[C@@H](C(=O)C(=C([C@]3(C(...,DHPRQBPJLMKORJ,C22H23ClN2O8,C22H24ClN2O8,478.115724,479.1230,H,QTOF,-1,test,False,"[72.043739, 86.060242, 98.058304, 99.0634, 126...","[0.19527744563354998, 0.16939927337585417, 0.1...","[[444.084412, 8.434492975677559], [462.095337,...","[False, False, False, False, False, False, Fal..."
231101,MassSpecGymID0414172,"72.080132,102.233917,113.082344,113.121498,114...","0.021173783463613503,0.007516001055215398,0.03...",C[C@H]([C@@H]1CC[C@H]([C@H](O1)O[C@@H]2[C@H](C...,CEAZRRDELHUEMR,C21H43N5O7,C21H44N5O7,477.317724,478.3250,H,QTOF,-1,val,False,"[72.080132, 102.233917, 113.082344, 113.121498...","[2.1173783463613502, 0.7516001055215398, 3.565...","[[72.080132, 2.1173783463613502], [113.082344,...","[True, False, True, False, True, False, False,..."
231102,MassSpecGymID0414173,"55.053627,56.455425,58.065819,67.053543,82.065...","0.031731527156456024,0.017878400151027027,0.07...",C[C@H]([C@@H]1CC[C@H]([C@H](O1)O[C@@H]2[C@H](C...,CEAZRRDELHUEMR,C21H43N5O7,C21H44N5O7,477.317724,478.3250,H,QTOF,-1,val,False,"[55.053627, 56.455425, 58.065819, 67.053543, 8...","[3.1731527156456023, 1.7878400151027027, 7.332...","[[55.053627, 3.1731527156456023], [56.455425, ...","[True, True, True, True, True, True, True, Tru..."


In [5]:
# canonicalise SMILES
df['smiles'] = [Chem.CanonSmiles(smile) for smile in df['smiles']]

for i,smile in enumerate(df['smiles']):
    if smile != Chem.CanonSmiles(smile):
        print('canonicalising!', i)
        df.iloc[i]['smiles'] = Chem.CanonSmiles(smile)

canonicalising! 217489


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.iloc[i]['smiles'] = Chem.CanonSmiles(smile)


In [11]:
# calculate molecular fingerprints
fp_size=128
mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=fp_size)

def get_fingerprint(smiles: str):
    mol = Chem.MolFromSmiles(smiles)
    fp = mfpgen.GetFingerprint(mol)
    return list(fp.ToBitString())

df['fingerprint'] = df['smiles'].map(lambda smiles : get_fingerprint(smiles))
df['fingerprint'] = df['fingerprint'].parallel_apply(lambda row: [int(el) for el in row])

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=14444), Label(value='0 / 14444')))…

In [12]:
df

Unnamed: 0,identifier,mzs,intensities,smiles,inchikey,formula,precursor_formula,parent_mass,precursor_mz,adduct,instrument_type,coll_en,fold,simulation_challenge,mzs_float,int_float,spectrum,mask,fingerprint
0,MassSpecGymID0000001,"91.0542,125.0233,154.0499,155.0577,185.0961,20...","0.24524524524524524,1.0,0.08008008008008008,0....",CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,H,Orbitrap,30.0,train,True,"[91.0542, 125.0233, 154.0499, 155.0577, 185.09...","[24.524524524524523, 100.0, 8.008008008008009,...","[[91.0542, 24.524524524524523], [125.0233, 100...","[True, True, True, True, True, True, True, True]","[1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, ..."
1,MassSpecGymID0000002,"91.0542,125.0233,155.0577,185.0961,229.0859,24...","0.0990990990990991,0.28128128128128127,0.04004...",CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,H,Orbitrap,20.0,train,True,"[91.0542, 125.0233, 155.0577, 185.0961, 229.08...","[9.90990990990991, 28.128128128128125, 4.00400...","[[91.0542, 9.90990990990991], [125.0233, 28.12...","[True, True, True, True, True, True]","[1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, ..."
2,MassSpecGymID0000003,"69.0343,91.0542,125.0233,127.039,153.0699,154....","0.03403403403403404,0.31431431431431434,1.0,0....",CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,H,Orbitrap,40.0,train,True,"[69.0343, 91.0542, 125.0233, 127.039, 153.0699...","[3.4034034034034035, 31.431431431431434, 100.0...","[[69.0343, 3.4034034034034035], [91.0542, 31.4...","[True, True, True, True, True, True, True, Tru...","[1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, ..."
3,MassSpecGymID0000004,"69.0343,91.0542,110.06,111.0441,112.0393,120.0...","0.17917917917917917,0.47347347347347346,0.0380...",CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,H,Orbitrap,55.0,train,True,"[69.0343, 91.0542, 110.06, 111.0441, 112.0393,...","[17.917917917917915, 47.347347347347345, 3.803...","[[69.0343, 17.917917917917915], [91.0542, 47.3...","[True, True, True, True, True, True, True, Tru...","[1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, ..."
4,MassSpecGymID0000005,"91.0542,125.0233,185.0961,229.0859,246.1125,28...","0.07807807807807808,0.1841841841841842,0.03503...",CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,H,Orbitrap,10.0,train,True,"[91.0542, 125.0233, 185.0961, 229.0859, 246.11...","[7.807807807807808, 18.41841841841842, 3.50350...","[[91.0542, 7.807807807807808], [125.0233, 18.4...","[True, True, True, True, True, True]","[1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231099,MassSpecGymID0414168,"55.054474,58.029369,58.065601,59.049339,69.032...","0.0004935684823754727,0.00012405427491363124,0...",CC[C@@H]1[C@H](/C=C(/C=C\C(=O)[C@@H](C[C@@H]([...,WBPYTXDJUQJLPQ,C46H77NO17,C46H78NO17,915.521724,916.5290,H,QTOF,-1,val,False,"[55.054474, 58.029369, 58.065601, 59.049339, 6...","[0.04935684823754727, 0.012405427491363124, 0....","[[101.05954, 1.6709916364850783], [145.085648,...","[False, False, False, False, False, False, Fal...","[1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, ..."
231100,MassSpecGymID0414171,"72.043739,86.060242,98.058304,99.0634,126.0537...","0.0019527744563354998,0.0016939927337585416,0....",C[C@@]1([C@H]2C[C@H]3[C@@H](C(=O)C(=C([C@]3(C(...,DHPRQBPJLMKORJ,C22H23ClN2O8,C22H24ClN2O8,478.115724,479.1230,H,QTOF,-1,test,False,"[72.043739, 86.060242, 98.058304, 99.0634, 126...","[0.19527744563354998, 0.16939927337585417, 0.1...","[[444.084412, 8.434492975677559], [462.095337,...","[False, False, False, False, False, False, Fal...","[0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, ..."
231101,MassSpecGymID0414172,"72.080132,102.233917,113.082344,113.121498,114...","0.021173783463613503,0.007516001055215398,0.03...",C[C@H]([C@@H]1CC[C@H]([C@H](O1)O[C@@H]2[C@H](C...,CEAZRRDELHUEMR,C21H43N5O7,C21H44N5O7,477.317724,478.3250,H,QTOF,-1,val,False,"[72.080132, 102.233917, 113.082344, 113.121498...","[2.1173783463613502, 0.7516001055215398, 3.565...","[[72.080132, 2.1173783463613502], [113.082344,...","[True, False, True, False, True, False, False,...","[1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, ..."
231102,MassSpecGymID0414173,"55.053627,56.455425,58.065819,67.053543,82.065...","0.031731527156456024,0.017878400151027027,0.07...",C[C@H]([C@@H]1CC[C@H]([C@H](O1)O[C@@H]2[C@H](C...,CEAZRRDELHUEMR,C21H43N5O7,C21H44N5O7,477.317724,478.3250,H,QTOF,-1,val,False,"[55.053627, 56.455425, 58.065819, 67.053543, 8...","[3.1731527156456023, 1.7878400151027027, 7.332...","[[55.053627, 3.1731527156456023], [56.455425, ...","[True, True, True, True, True, True, True, Tru...","[1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, ..."


In [None]:
df[df['fold'].eq('train')][['formula', 'smiles','spectrum','fingerprint']].to_parquet(f'{path}/train.parquet')
df[df['fold'].eq('test')][['formula', 'smiles','spectrum','fingerprint']].to_parquet(f'{path}/test.parquet')
df[df['fold'].eq('val')][['formula', 'smiles','spectrum','fingerprint']].to_parquet(f'{path}/val.parquet')

# to show the final formatting
df[df['fold'].eq('train')][['formula', 'smiles','spectrum','fingerprint']]

Unnamed: 0,formula,smiles,spectrum,fingerprint
0,C16H17NO4,CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,"[[91.0542, 24.524524524524523], [125.0233, 100...","[1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, ..."
1,C16H17NO4,CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,"[[91.0542, 9.90990990990991], [125.0233, 28.12...","[1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, ..."
2,C16H17NO4,CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,"[[69.0343, 3.4034034034034035], [91.0542, 31.4...","[1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, ..."
3,C16H17NO4,CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,"[[69.0343, 17.917917917917915], [91.0542, 47.3...","[1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, ..."
4,C16H17NO4,CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,"[[91.0542, 7.807807807807808], [125.0233, 18.4...","[1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, ..."
...,...,...,...,...
231090,C23H22O13,CC(=O)OC[C@@H]1[C@H](C(C([C@@H](O1)OC2=C(OC3=C...,"[[81.034798, 7.6076076076076085], [109.0299, 8...","[1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, ..."
231091,C27H30O15,C[C@@H]1[C@@H]([C@@H]([C@H]([C@@H](O1)OC2=CC(=...,"[[287.05481, 100.0], [433.112701, 27.127127127...","[1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, ..."
231092,C27H30O15,CC1[C@@H](C([C@@H]([C@@H](O1)OC2[C@@H](C(O[C@H...,"[[71.050797, 4.3043043043043046], [85.029099, ...","[1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, ..."
231093,C33H40O19,CC1[C@@H]([C@@H](C([C@@H](O1)OC2=CC(=C3C(=C2)O...,"[[85.029198, 7.707707707707708], [287.055115, ...","[1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, ..."
