# Generate spectrum embeddings
Generate spectrum embeddings (spec2vec, MS2DeepScore) or pairwise similarity scores (modified cosine) for the baseline methods on the test split.

In [1]:
# Install packages if needed (uncomment the lines below to run them)
#!pip install spec2vec
#!pip install ms2deepscore

# Download pretrained ms2deepscore
#!wget https://zenodo.org/record/4699356/files/MS2DeepScore_allGNPSpositive_10k_500_500_200.hdf5?download=1
#!mv MS2DeepScore_allGNPSpositive_10k_500_500_200.hdf5?download=1 MS2DeepScore_allGNPSpositive_10k_500_500_200.hdf5

# Download pretrained spec2vec
#!wget https://zenodo.org/record/4173596/files/spec2vec_AllPositive_ratio05_filtered_201101_iter_15.model?download=1
#!mv spec2vec_AllPositive_ratio05_filtered_201101_iter_15.model?download=1 spec2vec.model

#!wget https://zenodo.org/record/4173596/files/spec2vec_AllPositive_ratio05_filtered_201101_iter_15.model.trainables.syn1neg.npy?download=1
#!mv spec2vec_AllPositive_ratio05_filtered_201101_iter_15.model.trainables.syn1neg.npy?download=1 spec2vec.model.trainables.syn1neg.npy

#!wget https://zenodo.org/record/4173596/files/spec2vec_AllPositive_ratio05_filtered_201101_iter_15.model.wv.vectors.npy?download=1
#!mv spec2vec_AllPositive_ratio05_filtered_201101_iter_15.model.wv.vectors.npy?download=1 spec2vec.model.wv.vectors.npy

In [2]:
from tqdm import tqdm
from pathlib import Path

import pickle
import numpy as np
import pandas as pd

# Matchms filtering
import matchms
from matchms import importing
from matchms.filtering import default_filters
from matchms.filtering import normalize_intensities
from matchms.filtering import select_by_intensity
from matchms.filtering import select_by_mz
import matchms.filtering as msfilters

from matchms.similarity import ModifiedCosine
from matchms import calculate_scores

import gensim
from spec2vec import Spec2Vec

import ms2deepscore
from ms2deepscore.models import load_model


# Set the matchms logger level to ERROR (presumably to keep per-spectrum
# filter messages out of the notebook output).
matchms.set_matchms_logger_level(loglevel="ERROR")

2023-05-01 08:19:10.696035: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
def peak_processing(spectrum):
    """ peak_processing.

    Run one spectrum through the matchms filter chain used in the tutorial.

    The filters are applied in this order, each receiving the output of the
    previous one:

    1. ``default_filters``
    2. ``normalize_intensities``
    3. ``select_by_intensity`` with ``intensity_from=0.01``
    4. ``select_by_mz`` with ``mz_from=10`` and ``mz_to=1500``

    Args:
        spectrum: Spectrum object to clean.

    Return:
        Whatever the last filter in the chain returns for this spectrum.

    """
    # (filter, keyword arguments) pairs, listed in application order.
    filter_chain = (
        (default_filters, {}),
        (normalize_intensities, {}),
        (select_by_intensity, {"intensity_from": 0.01}),
        (select_by_mz, {"mz_from": 10, "mz_to": 1500}),
    )

    processed = spectrum
    for apply_filter, options in filter_chain:
        processed = apply_filter(processed, **options)
    return processed


In [4]:
# Load the test-split spectra from the MGF file and clean their peaks.
#
# Set debug = True to stop reading the MGF file after its first 21 records
# (counted over all records, not only the ones in the test split).
debug = False

f = "../data/paired_spectra/csi2022/csi2022.mgf"
f_split = "../data/paired_spectra/csi2022/splits/csi_split_0.txt"

res = Path("../results/2023_04_30_embed_matchms/")
# parents=True so a fresh checkout without ../results does not raise
# FileNotFoundError.
res.mkdir(parents=True, exist_ok=True)

# Names of the spectra assigned to the "test" fold in the split file.
split_df = pd.read_csv(f_split, sep=",")
test_names = set(split_df[split_df['Fold_0'] == "test"]['name'].values)

input_specs = importing.load_from_mgf(f, metadata_harmonization=True)
new_names, new_specs = [], []
for ind, spec in enumerate(tqdm(input_specs)):
    if debug and ind > 20:
        break

    # Spectra are matched to the split file through their '_file' metadata.
    f_name = spec.metadata['_file']
    if f_name not in test_names:
        continue

    new_specs.append(spec)
    new_names.append(f_name)

# From here on, input_specs is the list of test spectra and names[k] is the
# name of input_specs[k].
input_specs, names = new_specs, new_names

# spectrums[k] is the processed version of input_specs[k]; the cell that
# computes the embeddings checks these entries for None.
spectrums = [peak_processing(s) for s in input_specs]


31145it [00:35, 880.09it/s] 


In [5]:
# Choose the baseline to run: keep exactly one assignment active.
# model_name = "ms2deepscore"
# model_name = "spec2vec"
model_name = "cosine"

out_file = res / f"{model_name}_out.p"

# Each branch defines `pairwise` plus the callable the compute cell needs:
#   pairwise == False -> embed_fn(spectrum) returns one embedding
#   pairwise == True  -> pairwise_fn(spectra) returns all-vs-all scores
if model_name == "spec2vec":
    spec_model = "spec2vec.model"
    model = gensim.models.Word2Vec.load(spec_model)
    embed_model = Spec2Vec(model=model, intensity_weighting_power=0.5,
                           allowed_missing_percentage=5.0)
    # NOTE: relies on a private Spec2Vec method to embed a single spectrum.
    embed_fn = embed_model._calculate_embedding
    pairwise = False
elif model_name == "ms2deepscore":
    spec_model = "MS2DeepScore_allGNPSpositive_10k_500_500_200.hdf5"
    # Load from spec_model rather than repeating the path literal.
    tf_model = load_model(spec_model)
    model = ms2deepscore.MS2DeepScore(tf_model)

    def embed_fn(x):
        """Embed one spectrum by calling calculate_vectors on a 1-item list."""
        return model.calculate_vectors([x])[0]

    pairwise = False
elif model_name == "cosine":
    similarity_measure = ModifiedCosine(tolerance=0.005)

    def pairwise_fn(x):
        """Score every spectrum in x against every other; return the 'score' field."""
        return calculate_scores(x, x,
                                similarity_measure,
                                is_symmetric=True).scores['score']

    pairwise = True
else:
    # Fail here instead of leaving `pairwise` undefined (or stale from an
    # earlier run) for the compute cell.
    raise ValueError(
        f"Unknown model_name {model_name!r}; "
        "expected 'spec2vec', 'ms2deepscore' or 'cosine'."
    )

In [24]:
%%capture captured
if pairwise:
    pairwise_scores = pairwise_fn(spectrums)
    out = {"names": np.array(names),
           "pairwise_cos": pairwise_scores,
           "args": {"model": model_name}}

else:
    new_embeddings = []
    new_names = []
    for i, j in tqdm(zip(spectrums, names)):
        if i is None:
            continue
        new_embedding = embed_fn(i)
        new_embeddings.append(new_embedding)
        new_names.append(j)

    new_embeddings = np.vstack(new_embeddings)

    out = {"names": np.array(new_names),
           "embeds": new_embeddings,
           "args": {"model": model_name}}

with open(out_file, "wb") as fp:
    pickle.dump(out, fp)
