Import libraries

In [1]:
import os 
import json
import torch
import numpy as np 
from matchms import Spectrum
from matchms.exporting import save_as_mgf

from utils import load_pickle, load_json, pickle_data

mkdir -p failed for path /afs/csail.mit.edu/u/k/klingmin/.config/matplotlib: [Errno 13] Permission denied: '/afs/csail.mit.edu/u/k/klingmin/.config/matplotlib'
Matplotlib created a temporary cache directory at /tmp/matplotlib-3281dumr because there was an issue with the default path (/afs/csail.mit.edu/u/k/klingmin/.config/matplotlib); it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


Settings

In [11]:
# Input/output locations. Each default is the original hard-coded path; setting the
# matching environment variable lets the notebook run on another machine without
# editing this cell.
data_folder = os.environ.get("MS_DATA_FOLDER", "/data/rbg/users/klingmin/projects/MS_processing/data/")
splits_folder = os.environ.get("MS_SPLITS_FOLDER", "/data/rbg/users/klingmin/projects/MS_processing/data_splits/")
output_folder = os.environ.get("MS_MGF_OUTPUT_FOLDER", "./cache/MGF_files")

# Dataset names and split schemes to process.
datasets = ["canopus", "massspecgym", "nist2023"]
splits = ["scaffold_vanilla", "inchikey_vanilla", "random", "LS"]

Helper functions

In [13]:
def string_to_bits(string):
    """Convert a string of digit characters into a NumPy array of integers.

    Every character is passed through ``int`` on its own, so ``"0110"``
    becomes ``array([0, 1, 1, 0])``. A character that ``int`` cannot parse
    raises ``ValueError``.
    """
    return np.array(list(map(int, string)))

def get_spec(id_, rec):
    """Build a matchms ``Spectrum`` from a single record.

    Parameters
    ----------
    id_
        Identifier stored in the spectrum metadata under ``"id_"``.
    rec
        Mapping that provides ``rec["peaks"]`` (an iterable of mappings with
        ``"mz"`` and ``"intensity"`` entries), ``rec["precursor_MZ_final"]``
        and ``rec["FPs"]["morgan4_4096"]``.

    Returns
    -------
    Spectrum
        Spectrum whose peaks are ordered by ascending m/z, with metadata keys
        ``"id_"``, ``"precursor_mz"`` and ``"FP"``.
    """
    peaks = sorted(rec["peaks"], key=lambda peak: peak["mz"])

    # matchms expects floating-point peak arrays. Without an explicit dtype,
    # records whose values are all integers would yield an int array instead.
    mz = np.array([peak["mz"] for peak in peaks], dtype=float)
    intensities = np.array([peak["intensity"] for peak in peaks], dtype=float)

    spec = Spectrum(mz = mz,
                    intensities = intensities,
                    metadata = {"id_": id_,
                                "precursor_mz": rec["precursor_MZ_final"],
                                "FP": rec["FPs"]["morgan4_4096"]})

    return spec

Load in the data

In [9]:
dataset_info = {}

def _load_records_by_id(dataset_name):
    """Load one dataset's record file and index its records by string id.

    Reads ``<data_folder>/<dataset_name>/<dataset_name>_w_mol_info_w_frag_CF_preds.pkl``
    and returns ``{str(record["id_"]): record}``. Ids are always converted to
    ``str`` so every dataset can be looked up the same way.
    """
    path = os.path.join(data_folder, dataset_name, dataset_name + "_w_mol_info_w_frag_CF_preds.pkl")
    # NOTE(review): load_pickle presumably unpickles the file; only point it at trusted data.
    return {str(r["id_"]) : r for r in load_pickle(path)}

canopus = _load_records_by_id("canopus")
print("Done loading canopus")

massspecgym = _load_records_by_id("massspecgym")
print("Done loading MSG")

# Previously keyed by the raw id_ value; now normalised to str like the other two datasets.
nist2023 = _load_records_by_id("nist2023")
print("Done loading NIST2023")

dataset_info["canopus"] = canopus
dataset_info["massspecgym"] = massspecgym
dataset_info["nist2023"] = nist2023

Done loading canopus
Done loading MSG
Done loading NIST2023


Get the MGF files

In [14]:
# For every (dataset, split) pair, write train.mgf and test.mgf unless they already exist.
for dataset in datasets: 

    for split in splits: 

        current_folder = os.path.join(output_folder, dataset, split)
        # exist_ok removes the check-then-create race of a separate os.path.exists test.
        os.makedirs(current_folder, exist_ok = True)

        # Subsets whose MGF file has not been produced yet.
        pending = [subset for subset in ("train", "test")
                   if not os.path.exists(os.path.join(current_folder, subset + ".mgf"))]

        # Both files are already present: skip parsing the split JSON entirely.
        if not pending: continue

        split_file = load_json(os.path.join(splits_folder, dataset, "splits", split + ".json"))
        records = dataset_info[dataset]

        for subset in pending:

            mgf_path = os.path.join(current_folder, subset + ".mgf")
            partial_path = os.path.join(current_folder, subset + ".partial.mgf")

            # Split entries are file names; dropping ".pkl" gives the record id.
            spectra = []
            for f in split_file[subset]:
                id_ = f.replace(".pkl", "")
                spectra.append(get_spec(id_, records[id_]))

            # Write to a scratch file and rename it when finished, so an interrupted
            # run never leaves a truncated file that the existence check above would
            # later treat as complete. A stale scratch file is removed first because
            # save_as_mgf may append to an existing file (TODO confirm for the
            # installed matchms version).
            if os.path.exists(partial_path): os.remove(partial_path)
            save_as_mgf(spectra, partial_path)
            # The scratch file may be absent if there was nothing to write.
            if os.path.exists(partial_path): os.replace(partial_path, mgf_path)
            print(f"Generated the spec for {dataset}, {split} ({subset}). Length: {len(spectra)}")


Generated the spec for nist2023, scaffold_vanilla (train). Length: 643637
Generated the spec for nist2023, scaffold_vanilla (test). Length: 137840
Generated the spec for nist2023, inchikey_vanilla (train). Length: 643637
Generated the spec for nist2023, inchikey_vanilla (test). Length: 137840
Generated the spec for nist2023, random (train). Length: 643521
Generated the spec for nist2023, random (test). Length: 137898
Generated the spec for nist2023, LS (train). Length: 285823
Generated the spec for nist2023, LS (test). Length: 18333
