Import libraries

In [29]:
import os 
import copy
import json
import pickle
import random
import collections
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch.nn.functional as F

import rdkit.Chem as Chem
from rdkit.Chem import AllChem
import scipy.stats as stats

import matplotlib.pyplot as plt
from sklearn.metrics import log_loss

from matchms import Spectrum
from matchms import calculate_scores
from matchms.similarity import CosineGreedy

Helper Functions

In [40]:
def load_pickle(path):

    with open(path, "rb") as f:

        data = pickle.load(f)
    
    return data

def get_mol(smiles):

    mol = Chem.MolFromSmiles(smiles, sanitize=True)
    Chem.Kekulize(mol, clearAromaticFlags=True)
    
    for i, atom in enumerate(mol.GetAtoms()):
        atom.SetProp("molAtomMapNumber", str(atom.GetIdx() + 1))
    
    return mol

def get_morgan(mol, radius, FP_size):
    
    fpgen = AllChem.GetMorganGenerator(radius = radius, fpSize = FP_size)
    FP = fpgen.GetFingerprint(mol).ToBitString()

    return FP

def get_FP(smiles):

    # Add in the fingerprint
    mol = get_mol(smiles)

    r = {}

    r["morgan4_256"] = get_morgan(mol, 2, 256)
    r["morgan4_1024"] = get_morgan(mol, 2, 1024)
    r["morgan4_2048"] = get_morgan(mol, 2, 2048)

    r["morgan6_256"] = get_morgan(mol, 3, 256)
    r["morgan6_1024"] = get_morgan(mol, 3, 1024)
    r["morgan6_2048"] = get_morgan(mol, 3, 2048)
    r["morgan6_4096"] = get_morgan(mol, 3, 4096)

    return r 

def flip_bit(FP, percentage_bits_flipped):

    n_bits = len(FP)
    n_bits_flipped = max(1, int(percentage_bits_flipped * n_bits))
    pos_flipped = random.sample(range(n_bits), n_bits_flipped)

    for pos in pos_flipped:
        bit = FP[pos]

        if bit == 1:  FP[pos] = 0 
        if bit == 0:  FP[pos] = 1 

    return FP

def batch_jaccard_index(FP_pred, FP):

    # Intersection = bitwise AND
    intersection = np.logical_and(FP, FP_pred).sum(axis=1)

    # Union = bitwise OR
    union = np.logical_or(FP, FP_pred).sum(axis=1)

    # Avoid division-by-zero by adding a small epsilon
    jaccard_scores = intersection / (union + 1e-9)

    return jaccard_scores



In [78]:
def lookup_pubchem(r, FP, pubchem, FP_key):

    # Get necessary information 
    inchikey = r["inchikey"]

    # Get the candidates 
    formula = r["formula"]
    cand = pubchem[formula]
    cand = [k[:14] for k, v in cand.items()]

    # Get the loss
    loss = batch_jaccard_index(FP, np.random.rand(len(cand), len(FP)))

    # Sort the candidates
    cand_sorted = sorted(zip(cand, loss), reverse = True)

    # Get the ranking 
    rank = [c[0] for c in cand_sorted].index(inchikey)

    return rank 


Load in the data

In [8]:
massspecgym_path = "/data/rbg/users/klingmin/projects/MS_processing/benchmarks/massspec_gym/MassSpecGym.tsv"
massspecgym = pd.read_csv(massspecgym_path, sep = "\t")
massspecgym = json.loads(massspecgym.to_json(orient = "records"))

massspecgym[0]

{'identifier': 'MassSpecGymID0000001',
 'mzs': '91.0542,125.0233,154.0499,155.0577,185.0961,200.107,229.0859,246.1125',
 'intensities': '0.24524524524524524,1.0,0.08008008008008008,0.35535535535535534,0.34934934934934936,0.04504504504504504,0.14214214214214213,0.7347347347347347',
 'smiles': 'CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC',
 'inchikey': 'VFMQMACUYWGDOJ',
 'formula': 'C16H17NO4',
 'precursor_formula': 'C16H18NO4',
 'parent_mass': 287.115224,
 'precursor_mz': 288.1225,
 'adduct': '[M+H]+',
 'instrument_type': 'Orbitrap',
 'collision_energy': 30.0,
 'fold': 'train',
 'simulation_challenge': True}

In [None]:
import os 

In [9]:
pubchem_path = "/data/rbg/users/klingmin/projects/MS_processing/data/pubchem/pubchem.pkl"
pubchem = load_pickle(pubchem_path)

Settings

In [79]:
percentage_bits_flipped = 0.05
FP_key = "morgan4_256"
k = 5

Iterate through perturbations

In [81]:
all_ranks_original = []
all_ranks_perturbed = [] 

for r in massspecgym:

    smiles = r["smiles"]
    all_FPs  = get_FP(smiles)
    FP = [int(c) for c in all_FPs[FP_key]]

    # Look up the database 
    rank_original = lookup_pubchem(r, FP, pubchem, FP_key)

    # Get `k` ranks for perturbing 
    rank_perturbed = []
    for _ in range(k):
        FP_perturbed = flip_bit([int(c) for c in all_FPs[FP_key]], percentage_bits_flipped)
        rank_perturbed.append(lookup_pubchem(r, FP_perturbed, pubchem, FP_key))
    
    rank_perturbed = np.mean(rank_perturbed)
    
    # Append to the master list 
    all_ranks_original.append(rank_original)
    all_ranks_perturbed.append(rank_perturbed)

    print(rank_original)
    print(rank_perturbed)
    
    a = z

    

2022
2022.0


NameError: name 'z' is not defined