Import libraries

In [10]:
import os
import json
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch

Settings

In [2]:
# Root directories that are scanned two levels deep: <root>/<dataset>/<checkpoint>
folders = ["../FP_prediction/mist/best_models", "../FP_prediction/baseline_models/best_models"]

# Flat list with the path of every checkpoint entry found under the roots above
all_checkpoints = []
for folder in folders:
    # os.listdir returns entries in arbitrary order; sort so the listing is reproducible
    for dataset in sorted(os.listdir(folder)):
        dataset_dir = os.path.join(folder, dataset)
        # Skip stray files (e.g. .DS_Store) that would make the nested listdir raise
        if not os.path.isdir(dataset_dir):
            continue
        for checkpoint in sorted(os.listdir(dataset_dir)):
            all_checkpoints.append(os.path.join(dataset_dir, checkpoint))

Helper Functions

In [3]:
def load_json(path):
    """Read the JSON document stored at ``path`` and return the decoded object.

    The file is opened explicitly as UTF-8 (the encoding ``write_json`` writes
    with), so the result does not depend on the platform's default encoding.
    """
    with open(path, "r", encoding = "UTF-8") as f:
        return json.load(f)

def write_json(data, path):
    """Serialise ``data`` as JSON into the file at ``path`` (UTF-8), replacing any existing content."""
    with open(path, mode = "w", encoding = "UTF-8") as handle:
        json.dump(data, handle)

def load_pickle(path):
    """Unpickle and return the object stored in the file at ``path``.

    Note: unpickling can execute arbitrary code, so this should only be
    pointed at files that come from a trusted source.
    """
    with open(path, mode = "rb") as handle:
        obj = pickle.load(handle)

    return obj


def to_binary(FP, threshold = 0.5):
    """Threshold a tensor into a 0/1 integer NumPy array.

    Entries strictly greater than ``threshold`` become 1, all others 0.
    ``FP`` must support ``>`` comparison and ``.cpu().numpy()`` on the result
    (e.g. a torch tensor).
    """
    above_threshold = FP > threshold

    return above_threshold.cpu().numpy().astype(int)

def jaccard_index(FP_pred, FP, sigmoid = True):
    """Jaccard similarity between a predicted and a reference fingerprint.

    ``FP_pred`` is an array-like of prediction scores. When ``sigmoid`` is
    True the scores are passed through a sigmoid first; they are then
    binarised with ``to_binary`` (default threshold). ``FP`` is the reference
    fingerprint and is cast to int.

    Returns a Python float: ``intersection / (union + 1e-9)``.
    """
    scores = torch.from_numpy(np.array(FP_pred))
    reference = np.array(FP).astype(int)

    # Squash the scores with a sigmoid before thresholding, if requested
    if sigmoid:
        scores = torch.sigmoid(scores)

    predicted = to_binary(scores)

    # Bits set in both fingerprints / bits set in at least one of them
    n_shared = np.logical_and(reference, predicted).sum()
    n_either = np.logical_or(reference, predicted).sum()

    # The small epsilon keeps the ratio defined when the union is empty
    ratio = n_shared / (n_either + 1e-9)

    return ratio.item()

def get_jaccard_score(results, sigmoid = True):
    """Mean Jaccard index over a collection of per-sample predictions.

    Parameters
    ----------
    results : dict
        Maps a sample identifier to a dict holding the predicted fingerprint
        under ``"pred"`` and the reference fingerprint under ``"GT"``.
    sigmoid : bool
        Forwarded to ``jaccard_index``; whether predictions are passed
        through a sigmoid before being binarised.

    Returns
    -------
    float
        Average of the per-sample Jaccard indices.

    Raises
    ------
    ValueError
        If ``results`` is empty (previously an opaque ``ZeroDivisionError``).
    """
    if len(results) == 0:
        raise ValueError("Cannot compute a mean Jaccard score: `results` is empty.")

    total = 0

    # Only the values are needed; the sample identifiers play no role in the score
    for entry in results.values():
        total += jaccard_index(entry["pred"], entry["GT"], sigmoid = sigmoid)

    return total / len(results)

def get_unique_mol(ids, labels):
    """Count the distinct label values reachable from ``ids``.

    Every occurrence of ".pkl" is removed from each id before it is looked up
    in ``labels``; ids without an entry in ``labels`` are ignored.
    """
    unique_mols = set()

    for raw_id in ids:
        key = raw_id.replace(".pkl", "")
        if key in labels:
            unique_mols.add(labels[key])

    return len(unique_mols)

`` Look at the selected experimental conditions ``

In [4]:
# For each dataset, print the acquisition settings stored with the first test
# spectrum of the sieved random split.
for dataset in ("canopus", "massspecgym", "nist2023"):

    frags_folder = f"/data/rbg/users/klingmin/projects/MS_processing/data/{dataset}/frags_preds"
    splits_folder = f"/data/rbg/users/klingmin/projects/MS_processing/data_splits/{dataset}/splits"

    split_file_cleaned = load_json(os.path.join(splits_folder, "random_sieved.json"))
    sample = split_file_cleaned["test"][0]
    frags = load_pickle(os.path.join(frags_folder, f"{sample}"))

    # Fields to report, in print order; "instrument" is only printed for nist2023
    fields = ["precursor_type", "collision_energy"]
    if dataset == "nist2023":
        fields.append("instrument")
    fields.append("instrument_type")

    print(dataset)
    for field in fields:
        print(frags[field])
    print()


canopus
[M+H]+
35.0
Orbitrap (LCMS)

massspecgym
[M+H]+
30.0
Orbitrap

nist2023
[M+H]+
20
Thermo Finnigan Elite Orbitrap
HCD



`` Let us look at how much training data is left when we train only on the clean subset of the data ``

In [6]:
# `table` compares spectrum counts before and after sieving;
# `ori_table` records spectrum and unique-molecule counts of the original splits.
table = []
ori_table = []

for dataset in ["canopus", "massspecgym", "nist2023"]:

    # Map column 1 (as a string) to the first 14 characters of column 6 of labels.tsv
    # (presumably spectrum id -> leading InChIKey block; verify against the file header)
    labels_file = f"/data/rbg/users/klingmin/projects/MS_processing/data_BM/mist/{dataset}/labels.tsv"
    labels = pd.read_csv(labels_file, sep = "\t").values
    labels = {str(row[1]): row[6][:14] for row in labels}

    splits_folder = f"/data/rbg/users/klingmin/projects/MS_processing/data_splits/{dataset}/splits"

    splits = ["inchikey_vanilla", "scaffold_vanilla", "random"]

    for split in splits:

        split_file = load_json(os.path.join(splits_folder, f"{split}.json"))
        split_file_cleaned = load_json(os.path.join(splits_folder, f"{split}_sieved.json"))

        # Spectrum counts in the original split
        train_ori = len(split_file["train"])
        val_ori = len(split_file["val"])
        test_ori = len(split_file["test"])

        # Unique-molecule counts in the original split
        train_mol_ori, val_mol_ori, test_mol_ori = (
            get_unique_mol(split_file[part], labels) for part in ("train", "val", "test")
        )

        # Spectrum counts in the sieved split
        train_new = len(split_file_cleaned["train"])
        val_new = len(split_file_cleaned["val"])
        test_new = len(split_file_cleaned["test"])

        # Despite the name, this is the share (in %) of training spectra that is kept
        percent_drop_train = round(train_new / train_ori * 100, 3)

        table.append([
            dataset, split,
            train_ori, train_new, percent_drop_train,
            test_ori, test_new,
        ])
        ori_table.append([
            dataset, split,
            train_ori, train_mol_ori,
            val_ori, val_mol_ori,
            test_ori, test_mol_ori,
        ])

table = pd.DataFrame(table)
table.columns = [
    "dataset",
    "split",
    "n_train (before)",
    "n_train (after)",
    "percentage selected",
    "n_test (before)",
    "n_test (after)",
]
table

Unnamed: 0,dataset,split,n_train (before),n_train (after),percentage selected,n_test (before),n_test (after)
0,canopus,inchikey_vanilla,12908,121,0.937,2690,12
1,canopus,scaffold_vanilla,12909,114,0.883,2689,30
2,canopus,random,12800,110,0.859,2744,24
3,massspecgym,inchikey_vanilla,76242,19437,25.494,16043,3917
4,massspecgym,scaffold_vanilla,76243,18942,24.844,16042,4290
5,massspecgym,random,75828,19263,25.404,16250,4149
6,nist2023,inchikey_vanilla,643637,48893,7.596,137840,10399
7,nist2023,scaffold_vanilla,643637,48046,7.465,137840,10543
8,nist2023,random,643521,48845,7.59,137898,10519


`` Look at the original data statistics ``

In [7]:
# Turn the collected rows for the original splits into a labelled table
ori_table = pd.DataFrame(ori_table)
ori_table.columns = [
    "dataset",
    "split",
    "n_train_spectra",
    "n_unique_train_mol",
    "n_val_spectra",
    "n_unique_val_mol",
    "n_test_spectra",
    "n_unique_test_mol",
]
ori_table

Unnamed: 0,dataset,split,n_train_spectra,n_unique_train_mol,n_val_spectra,n_unique_val_mol,n_test_spectra,n_unique_test_mol
0,canopus,inchikey_vanilla,12908,5937,2688,1313,2690,1271
1,canopus,scaffold_vanilla,12909,5809,2688,1371,2689,1343
2,canopus,random,12800,6577,2742,2072,2744,2049
3,massspecgym,inchikey_vanilla,76242,11828,16041,2355,16043,2524
4,massspecgym,scaffold_vanilla,76243,11452,16041,2666,16042,2869
5,massspecgym,random,75828,15705,16248,7676,16250,7660
6,nist2023,inchikey_vanilla,643637,18522,137839,3924,137840,3946
7,nist2023,scaffold_vanilla,643637,18771,137839,3889,137840,3730
8,nist2023,random,643521,26198,137897,21779,137898,21718


`` Look at how the performance has changed based on the sampling strategy `` 

In [19]:
datasets = ["massspecgym", "nist2023"]
splits = ["scaffold_vanilla", "inchikey_vanilla", "random"]
models = ["binned_", "MS_", "formula_", "MIST_"]

results = []

# Iterate over every (dataset, model, split) combination with a single progress
# bar; three nested bars only flood the cell output.
combinations = [(d, m, s) for d in datasets for m in models for s in splits]

for dataset, model, split in tqdm(combinations):

    # Checkpoints are matched by substring on their path; "sieved" separates the
    # model trained on the sieved split from the one trained on the full split.
    original_checkpoint = [f for f in all_checkpoints if dataset in f and model in f and split in f and "sieved" not in f]
    sieved_checkpoint = [f for f in all_checkpoints if dataset in f and model in f and split in f and "sieved" in f]

    assert len(original_checkpoint) == 1, f"expected exactly one original checkpoint for {dataset}/{model}/{split}, found {original_checkpoint}"
    assert len(sieved_checkpoint) == 1, f"expected exactly one sieved checkpoint for {dataset}/{model}/{split}, found {sieved_checkpoint}"

    original_checkpoint = original_checkpoint[0]
    sieved_checkpoint = sieved_checkpoint[0]

    original_performance_path = os.path.join(original_checkpoint, "test_performance.json")
    original_performance = load_json(original_performance_path)
    sieved_performance = load_json(os.path.join(sieved_checkpoint, "test_performance.json"))["jaccard"]

    if "jaccard_subset" in original_performance:

        # Already stored in the performance file by an earlier run; reuse it as is
        original_results_subset_jaccard = original_performance["jaccard_subset"]

    else:

        sieved_results = load_pickle(os.path.join(sieved_checkpoint, "test_results.pkl"))
        sieved_keys = sieved_results.keys()

        # Restrict the original model's test predictions to the samples that
        # are also present in the sieved model's test results
        original_results = load_pickle(os.path.join(original_checkpoint, "test_results.pkl"))
        original_results_subset = {k: v for k, v in original_results.items() if k in sieved_keys}

        # No sigmoid is applied to the predictions of MIST models
        sigmoid = "MIST" not in model

        original_results_subset_jaccard = get_jaccard_score(original_results_subset, sigmoid = sigmoid)

        # Add the performance on the subset into the original results file. This is
        # done only when the value was newly computed, so the file is not rewritten
        # on every run.
        original_performance["jaccard_subset"] = original_results_subset_jaccard
        write_json(original_performance, original_performance_path)

    results.append([dataset, model, split, round(sieved_performance,3), round(original_results_subset_jaccard,3)])


  0%|          | 0/2 [00:00<?, ?it/s]
100%|██████████| 3/3 [00:00<00:00, 280.46it/s]

100%|██████████| 3/3 [00:00<00:00, 319.89it/s]

100%|██████████| 3/3 [00:00<00:00, 309.66it/s]

100%|██████████| 3/3 [00:00<00:00, 409.75it/s]
100%|██████████| 4/4 [00:00<00:00, 75.55it/s]

100%|██████████| 3/3 [00:00<00:00, 269.00it/s]

100%|██████████| 3/3 [00:00<00:00, 284.95it/s]

100%|██████████| 3/3 [00:00<00:00, 293.75it/s]

100%|██████████| 3/3 [00:00<00:00, 397.73it/s]
100%|██████████| 4/4 [00:00<00:00, 78.62it/s]
100%|██████████| 2/2 [00:00<00:00, 18.05it/s]


In [20]:
# Turn the collected rows into a labelled table (one row per dataset/model/split)
results = pd.DataFrame(results)
results.columns = [
    "dataset",
    "model",
    "split",
    "Standard experiment conditions",
    "All experiment condition",
]

# Show the full table; to inspect a subset, filter `results` with `.loc` on the
# "model" or "dataset" column before assigning it to `current`.
current = results
current

Unnamed: 0,dataset,model,split,Standard experiment conditions,All experiment condition
0,massspecgym,binned_,scaffold_vanilla,0.21,0.22
1,massspecgym,binned_,inchikey_vanilla,0.21,0.227
2,massspecgym,binned_,random,0.48,0.615
3,massspecgym,MS_,scaffold_vanilla,0.206,0.215
4,massspecgym,MS_,inchikey_vanilla,0.208,0.224
5,massspecgym,MS_,random,0.461,0.588
6,massspecgym,formula_,scaffold_vanilla,0.207,0.236
7,massspecgym,formula_,inchikey_vanilla,0.208,0.245
8,massspecgym,formula_,random,0.471,0.561
9,massspecgym,MIST_,scaffold_vanilla,0.245,0.254
