In [1]:
import pandas as pd
import h5py
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
tqdm.pandas()
from sklearn.model_selection import StratifiedGroupKFold
from pandarallel import pandarallel
from sklearn.cluster import AgglomerativeClustering
from scipy.spatial.distance import squareform
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import DataStructs
from matchms import Spectrum
from matchms.exporting import save_as_mgf
import massspecgym.utils as utils

In [26]:
# Read the tab-separated MassSpecGym split table into `df`.
df = pd.read_csv('../../data/data/MassSpecGym_split.tsv', delimiter='\t')
# The row count is the value this cell displays.
df.shape[0]

231104

### Merge `ITFT` and `QFT` instrument types into `Orbitrap`

In [21]:
# Tally of instrument types before relabelling.
display(df['instrument_type'].value_counts())
# Relabel both 'ITFT' and 'QFT' as 'Orbitrap'; all other labels are left untouched.
df['instrument_type'] = df['instrument_type'].replace(
    dict.fromkeys(('ITFT', 'QFT'), 'Orbitrap')
)
# Tally again so the effect of the merge is visible.
display(df['instrument_type'].value_counts())

instrument_type
ITFT        110724
QTOF         53823
Orbitrap     38585
QFT          22749
Name: count, dtype: int64

instrument_type
Orbitrap    172058
QTOF         53823
Name: count, dtype: int64

### Use only rows (spectra) with all metadata available and the `[M+H]+` adduct for the simulation challenge

In [22]:
# A row is flagged for the simulation challenge when its adduct is exactly
# '[M+H]+' and none of its columns holds a missing value.
df['simulation_challenge'] = df['adduct'].eq('[M+H]+') & df.notna().all(axis=1)
# Show how many rows are flagged / not flagged.
df['simulation_challenge'].value_counts()

simulation_challenge
True     119029
False    112075
Name: count, dtype: int64

### Remove `inchi` column

In [18]:
# Remove the `inchi` column. `errors='ignore'` keeps this cell idempotent:
# re-running it after the column is already gone is a no-op instead of a KeyError.
df = df.drop(columns=['inchi'], errors='ignore')

### Reorder columns

In [29]:
# Shape before the column selection.
print(df.shape)
# Final column order of the table; any column not listed here is dropped by
# the selection below, and a listed column that is missing raises KeyError.
cols = [
    'identifier',
    'mzs', 'intensities',
    'smiles', 'inchikey',
    'formula', 'precursor_formula',
    'parent_mass', 'precursor_mz',
    'adduct', 'instrument_type', 'collision_energy',
    'fold',
    'simulation_challenge',
]
df = df.loc[:, cols]
# Shape after the selection, followed by the frame itself as the cell's display value.
print(df.shape)
df

(231104, 15)
(231104, 14)


Unnamed: 0,identifier,mzs,intensities,smiles,inchikey,formula,precursor_formula,parent_mass,precursor_mz,adduct,instrument_type,collision_energy,fold,simulation_challenge
0,MassSpecGymID0000001,"91.0542,125.0233,154.0499,155.0577,185.0961,20...","0.24524524524524524,1.0,0.08008008008008008,0....",CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,[M+H]+,ITFT,30.0,train,True
1,MassSpecGymID0000002,"91.0542,125.0233,155.0577,185.0961,229.0859,24...","0.0990990990990991,0.28128128128128127,0.04004...",CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,[M+H]+,ITFT,20.0,train,True
2,MassSpecGymID0000003,"69.0343,91.0542,125.0233,127.039,153.0699,154....","0.03403403403403404,0.31431431431431434,1.0,0....",CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,[M+H]+,ITFT,40.0,train,True
3,MassSpecGymID0000004,"69.0343,91.0542,110.06,111.0441,112.0393,120.0...","0.17917917917917917,0.47347347347347346,0.0380...",CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,[M+H]+,ITFT,55.0,train,True
4,MassSpecGymID0000005,"91.0542,125.0233,185.0961,229.0859,246.1125,28...","0.07807807807807808,0.1841841841841842,0.03503...",CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,[M+H]+,ITFT,10.0,train,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231099,MassSpecGymID0414168,"55.054474,58.029369,58.065601,59.049339,69.032...","0.0004935684823754727,0.00012405427491363124,0...",CC[C@@H]1[C@H](/C=C(/C=C\C(=O)[C@@H](C[C@@H]([...,WBPYTXDJUQJLPQ,C46H77NO17,C46H78NO17,915.521724,916.5290,[M+H]+,QTOF,,val,False
231100,MassSpecGymID0414171,"72.043739,86.060242,98.058304,99.0634,126.0537...","0.0019527744563354998,0.0016939927337585416,0....",C[C@@]1([C@H]2C[C@H]3[C@@H](C(=O)C(=C([C@]3(C(...,DHPRQBPJLMKORJ,C22H23ClN2O8,C22H24ClN2O8,478.115724,479.1230,[M+H]+,QTOF,,test,False
231101,MassSpecGymID0414172,"72.080132,102.233917,113.082344,113.121498,114...","0.021173783463613503,0.007516001055215398,0.03...",C[C@H]([C@@H]1CC[C@H]([C@H](O1)O[C@@H]2[C@H](C...,CEAZRRDELHUEMR,C21H43N5O7,C21H44N5O7,477.317724,478.3250,[M+H]+,QTOF,,val,False
231102,MassSpecGymID0414173,"55.053627,56.455425,58.065819,67.053543,82.065...","0.031731527156456024,0.017878400151027027,0.07...",C[C@H]([C@@H]1CC[C@H]([C@H](O1)O[C@@H]2[C@H](C...,CEAZRRDELHUEMR,C21H43N5O7,C21H44N5O7,477.317724,478.3250,[M+H]+,QTOF,,val,False


In [30]:
# Guard against stale kernel state (e.g. the table was reloaded after the
# Orbitrap merge had run): refuse to save if raw 'ITFT'/'QFT' labels or the
# `inchi` column are still present, so an unprocessed table is never written.
assert not df['instrument_type'].isin(['ITFT', 'QFT']).any(), (
    "instrument_type still contains 'ITFT'/'QFT'; re-run the Orbitrap merge cell before saving"
)
assert 'inchi' not in df.columns, (
    "'inchi' column is still present; re-run the column removal/reorder cells before saving"
)
# Write the final table as tab-separated text without the pandas index column.
df.to_csv('../../data/data/MassSpecGym.tsv', sep='\t', index=False)