In [1]:
import pandas as pd
import os
from pprint import pprint
import numpy as np
import re
import seaborn as sns

In [2]:
# some helper functions

def parse_ce_str(ce_str):

    ce_str = str(ce_str)
    normal_regex = re.compile(r'\d+(\.\d+)?')
    # ramped_regex = re.compile(r'\d+(\.\d+)?-\d+(\.\d+)?')
    ramped_regex = re.compile(r'\d+(\.\d+)?(V)?(-|->)\d+(\.\d+)?(V)?')
    ce_str = ce_str.split(";")[-1]
    try:
        if "%" in ce_str:
            normalized = True
        else:
            normalized = False
        if "-" in ce_str or "Ramp" in ce_str or "RAMP" in ce_str or "->" in ce_str:
            ramped = True
            ramped_ce = ramped_regex.search(ce_str).group(0)
            if "->" in ramped_ce:
                min_ce, max_ce = ramped_ce.split("->")
            else:
                min_ce, max_ce = ramped_ce.split("-")
            ce = 0.5*float(min_ce.strip("V")) + 0.5*float(max_ce.strip("V"))
        else:
            ramped = False
            ce = normal_regex.search(ce_str).group(0)
            ce = float(ce)
    except:
        ce = np.nan
        normalized = False
        ramped = False
    return ce, normalized, ramped

def convert_nce(row):
    # assumes charge factor of 1
    if row["normalized"]:
        nce = row["ce"]
        ace = (nce * row["precursor_mz"] * 1.) / 500.
    else:
        ace = row["ce"]
    return ace

In [3]:
# df_fp = "/scratch/hdd001/home/adamo/neurips_msms_library/neurips_library_positive_df_2.csv"
df_fp = "/scratch/hdd001/home/adamo/neurips_msms_library/MassSpecGym_labeled_data_df.csv"

In [4]:
df = pd.read_csv(df_fp)

  df = pd.read_csv(df_fp)


In [5]:
print(df.shape)

(448979, 215)


In [6]:
def spec_per_mol_analysis(df):
    # get distribution of number of spectra per molecule
    assert not df["inchikey"].isna().any()
    spec_per_mol = df[["inchikey"]].groupby("inchikey").size().reset_index(name='counts')
    print(spec_per_mol["counts"].describe())
    counts, bins = np.histogram(spec_per_mol["counts"].to_numpy(), bins=np.arange(0, 25, 3))
    last_counts = (spec_per_mol["counts"].to_numpy() > bins[-1]).sum()
    total_counts = np.sum(counts) + last_counts
    assert total_counts == df["inchikey"].nunique()
    percents = 100 * counts / total_counts
    count_counts, _ = np.histogram(spec_per_mol["counts"].to_numpy(), bins=bins, weights=spec_per_mol["counts"].to_numpy())
    last_count_counts = ((spec_per_mol["counts"].to_numpy() > bins[-1]) * spec_per_mol["counts"].to_numpy()).sum()
    total_count_counts = np.sum(count_counts) + last_count_counts
    assert total_count_counts == df.shape[0]
    count_percents = 100 * count_counts / total_count_counts
    for i in range(len(percents)):
        print(f"| ({bins[i]},{bins[i+1]}] | {percents[i]:.1f} % | {count_percents[i]:.1f} % |")
    print(f"| ({bins[-1]},inf) | {100 * last_counts / total_counts:.1f} % | {100 * last_count_counts / total_count_counts:.1f} % |")

In [7]:
spec_per_mol_analysis(df)

count    36752.000000
mean        12.216451
std         31.976222
min          1.000000
25%          2.000000
50%          3.000000
75%          8.000000
max        998.000000
Name: counts, dtype: float64
| (0,3] | 29.4 % | 3.5 % |
| (3,6] | 40.2 % | 11.3 % |
| (6,9] | 6.9 % | 3.8 % |
| (9,12] | 4.1 % | 3.3 % |
| (12,15] | 2.9 % | 3.1 % |
| (15,18] | 2.5 % | 3.3 % |
| (18,21] | 1.5 % | 2.3 % |
| (21,24] | 1.4 % | 2.5 % |
| (24,inf) | 11.2 % | 67.1 % |


In [8]:
columns = sorted(df.columns)
columns_by_nans = df.isna().sum().sort_values(ascending=True)
print(columns_by_nans[:25])

charge                         0
precursor_mz                   0
smiles                         0
inchikey                       0
adduct                         0
peaks_json                     0
id                             0
parent_mass                    0
inchi                          0
ionmode                        0
compound_name               1771
ms_level                    3245
instrument_type            10564
spectrum_id                47692
scans                     120601
database_origin           120601
principal_investigator    120631
data_collector            121254
file_name                 168293
peptide_sequence          168293
confidence                168293
submit_user               168293
organism_name             168293
formula                   260695
num_peaks                 280686
dtype: int64


In [9]:
ce_col = "collision_energy"
print(df[ce_col].isna().sum())
print(df[ce_col].value_counts()[:10])

291230
collision_energy
60.0     14636
20.0     14464
30.0      6200
6V        5930
10 eV     5449
15.0      4741
40        3501
45.0      3446
30        3345
20 eV     3225
Name: count, dtype: int64


In [10]:
adduct_col = "adduct"
print(df[adduct_col].isna().sum())
print(df[adduct_col].value_counts()[:10])

0
adduct
[M+H]+         302835
[M+Na]+        118040
[M+NH4]+        12830
[M-H2O+H]+       6671
[2M+H]+          1792
[2M+Na]+         1657
[M-2H2O+H]+      1312
[M+K]+           1188
[M]+              945
[M+2H]2+          657
Name: count, dtype: int64


In [11]:
inst_col = "instrument_type"
print(df[inst_col].isna().sum())
print(df[inst_col].value_counts()[:10])

10564
instrument_type
ESI-ITFT             222251
LC-ESI-QTOF           69842
LC-ESI-ITFT           61678
Orbitrap              47693
LC-ESI-QFT            21954
ESI-QTOF               9562
ESI-QFT                4759
LC-Q-TOF/MS             172
LC-ESI-QEHF             144
LC-ESI-Q-Orbitrap       108
Name: count, dtype: int64


In [12]:
level_col = "ms_level"
print(df[level_col].isna().sum())
print(df[level_col].value_counts())

3245
ms_level
2      303802
MS2    116961
2       24576
MS1       334
MS         61
Name: count, dtype: int64


In [13]:
adducts = [
    "[M+H]+",
    "[M+Na]+"
]

inst_types_map = {
    # "ESI-ITFT": "ITFT",
    "LC-ESI-QTOF": "QTOF",
    # "LC-ESI_ITFT": "ITFT",
    "Orbitrap": "Orbitrap",
    "LC-ESI-QFT": "QFT",
    "ESI-QTOF": "QTOF",
    "ESI-QFT": "QFT"
}

def get_filter(df):

    ce_filter = ~(df["collision_energy"].isna())
    adduct_filter = df["adduct"].isin(adducts)
    inst_filter = ~(df["instrument_type"].map(inst_types_map).isna())
    prec_mz_filter = ~(df["precursor_mz"].isna())
    level_filter = df["ms_level"].isin([2,"2","MS2"])
    all_filter = ce_filter & adduct_filter & inst_filter & prec_mz_filter & level_filter
    print("collision energy filter:",ce_filter.mean())
    print("adduct filter:",adduct_filter.mean())
    print("instrument filter:",inst_filter.mean())
    print("prec_mz_filter:",prec_mz_filter.mean())
    print("level filter:",level_filter.mean())
    print("intersection filter:",all_filter.mean())
    return all_filter

all_filter = get_filter(df)


collision energy filter: 0.35135050859839767
adduct filter: 0.9374046447606681
instrument filter: 0.3425772697609465
prec_mz_filter: 1.0
level filter: 0.9918927165858537
intersection filter: 0.26731762510050583


In [15]:
filter_df = df[all_filter].copy(deep=True)
print("num spectra:", filter_df.shape[0])
print("num compounds:", filter_df["inchikey"].nunique())
print(filter_df["adduct"].value_counts())
print(filter_df["instrument_type"].value_counts())

num spectra: 120020
num compounds: 19265
adduct
[M+H]+     115023
[M+Na]+      4997
Name: count, dtype: int64
instrument_type
LC-ESI-QTOF    52284
Orbitrap       41356
LC-ESI-QFT     21432
ESI-QFT         3089
ESI-QTOF        1859
Name: count, dtype: int64


In [16]:
ces = filter_df["collision_energy"].unique()
print(ces[:10])

['65HCD' '45HCD' '35HCD' '60.0' '20.0' '15.0' '20 V' '10 V' '40 V' '30.0']


In [17]:
# standardize CEs
# try to parse them
ce_vals = filter_df["collision_energy"].apply(parse_ce_str)
ce_df = pd.DataFrame(ce_vals.tolist(), columns=["ce","normalized","ramped"])
ce_df.index = filter_df.index
ce_df = pd.concat([ce_df,filter_df[["precursor_mz"]]],axis=1)
print(ce_df.isna().sum())
print(ce_df["normalized"].value_counts())
print(ce_df["ramped"].value_counts())
# convert normalized to absolute
ce = ce_df.apply(convert_nce,axis=1)
print(ce.isna().sum())
ce_df.loc[:,"ce"] = ce
print(ce_df["ce"].isna().sum())
ce_filter_2 = ~(ce_df["ce"].isna())
print(ce_filter_2.mean())

ce              2
normalized      0
ramped          0
precursor_mz    0
dtype: int64
normalized
False    114770
True       5250
Name: count, dtype: int64
ramped
False    115566
True       4454
Name: count, dtype: int64
2
2
0.9999833361106483


In [18]:
# get subset that can handle ces and pass previous filters
all_filter_2 = all_filter & ce_filter_2
filter_df_2 = df[all_filter_2].copy(deep=True)
print("num spectra:", filter_df_2.shape[0])
print("num compounds:", filter_df_2["inchikey"].nunique())
print(filter_df_2["adduct"].value_counts())
print(filter_df_2["instrument_type"].value_counts())

num spectra: 120018
num compounds: 19265
adduct
[M+H]+     115021
[M+Na]+      4997
Name: count, dtype: int64
instrument_type
LC-ESI-QTOF    52282
Orbitrap       41356
LC-ESI-QFT     21432
ESI-QFT         3089
ESI-QTOF        1859
Name: count, dtype: int64


In [19]:
# standardize ces (rename to ce, to avoid overwriting collision_energy)
ce_vals = filter_df_2["collision_energy"].apply(parse_ce_str)
ce_df = pd.DataFrame(ce_vals.tolist(), columns=["ce","normalized","ramped"])
ce_df.index = filter_df_2.index
ce_df = pd.concat([ce_df,filter_df_2[["precursor_mz"]]],axis=1)
ce = ce_df.apply(convert_nce,axis=1)
ce_df.loc[:,"ce"] = ce
assert not ce_df["ce"].isna().any()
filter_df_2.loc[:,"ce"] = ce_df["ce"]

In [20]:
# standardize instrument types (rename to inst_type, to avoid overwriting instrument_type)
filter_df_2.loc[:,"inst_type"] = filter_df_2["instrument_type"].map(inst_types_map)

In [21]:
# verify that there are no nans
print(filter_df_2[["ce","adduct","inst_type"]].isna().sum())
# summarize metadata
print(filter_df_2["ce"].describe())
print(filter_df_2["adduct"].value_counts())
print(filter_df_2["inst_type"].value_counts())

ce           0
adduct       0
inst_type    0
dtype: int64
count    120018.000000
mean         39.696305
std          30.907380
min           0.000000
25%          20.000000
50%          30.000000
75%          60.000000
max         376.392708
Name: ce, dtype: float64
adduct
[M+H]+     115021
[M+Na]+      4997
Name: count, dtype: int64
inst_type
QTOF        54141
Orbitrap    41356
QFT         24521
Name: count, dtype: int64


In [22]:
spec_per_mol_analysis(filter_df_2)

count    19265.000000
mean         6.229847
std          8.304412
min          1.000000
25%          3.000000
50%          3.000000
75%          6.000000
max        144.000000
Name: counts, dtype: float64
| (0,3] | 15.1 % | 4.1 % |
| (3,6] | 58.9 % | 29.9 % |
| (6,9] | 7.3 % | 7.7 % |
| (9,12] | 4.5 % | 7.0 % |
| (12,15] | 3.9 % | 7.8 % |
| (15,18] | 3.0 % | 7.7 % |
| (18,21] | 1.5 % | 4.6 % |
| (21,24] | 1.5 % | 5.5 % |
| (24,inf) | 4.3 % | 25.8 % |


In [23]:
proc_df = filter_df_2[["ce","adduct","inst_type","inchikey","smiles","peaks_json","precursor_mz"]].copy(deep=True)
proc_df = proc_df.rename(columns={"ce":"collision_energy","inst_type":"instrument_type","peaks_json":"spectrum"})
proc_df["spec_id"] = np.arange(proc_df.shape[0])
inchikey_to_id = {ik:idx for idx, ik in enumerate(sorted(proc_df["inchikey"].unique()))}
proc_df["mol_id"] = proc_df["inchikey"].map(inchikey_to_id)
proc_df = proc_df[["spec_id","mol_id","adduct","instrument_type","collision_energy","precursor_mz","inchikey","smiles","spectrum"]]
assert ~(proc_df.isna().any().any())
print(proc_df.nunique())

spec_id             120018
mol_id               19265
adduct                   2
instrument_type          3
collision_energy      4056
precursor_mz         20456
inchikey             19265
smiles               19566
spectrum            118287
dtype: int64


In [24]:
proc_df.to_csv("../data/debug/simulation_df.csv", index=False)

In [None]:
# smaller version for debugging
small_proc_df = proc_df.sample(n=10000, replace=False, random_state=420)
small_proc_df.to_csv("../data/debug/simulation_small_df.csv", index=False)