Import libraries

In [2]:
import os
import json
import pickle
import numpy as np
import pandas as pd

Helper Functions

In [3]:
def load_json(path):

    with open(path, "r") as f:
        data = json.load(f)
    
    return data

def write_json(data, path):
    
    with open(path, "w", encoding = "UTF-8") as f:
        json.dump(data, f)

def load_pickle(path):

    with open(path, "rb") as f:
        return pickle.load(f)


def to_binary(FP, threshold):

    FP = (FP > threshold).astype(int)

    return FP 

def jaccard_index(FP_pred, FP):

    # Intersection = bitwise AND
    intersection = np.logical_and(FP, FP_pred).sum()

    # Union = bitwise OR
    union = np.logical_or(FP, FP_pred).sum()

    # Avoid division-by-zero by adding a small epsilon
    jaccard = intersection / (union + 1e-9)

    return jaccard

def get_jaccard_score(results):
        
    total = 0 

    for k, v in results.items():

        pred = np.array(v["pred"])
        GT = v["GT"]

        pred = to_binary(pred, 0.5)
        j = jaccard_index(pred, GT)

        total += j 

    mean_score = total / len(results)

    return mean_score

def get_unique_mol(ids, labels):

    mols = set([labels[i.replace(".pkl", "")] for i in ids if i.replace(".pkl", "") in labels])
    return len(mols)

`` Look at the selected experimental conditions``

In [18]:
for dataset in ["canopus", "massspecgym", "nist2023"]:

    frags_folder = f"/data/rbg/users/klingmin/projects/MS_processing/data/{dataset}/frags_preds"
    splits_folder = f"/data/rbg/users/klingmin/projects/MS_processing/data_splits/{dataset}/splits"

    split_file_cleaned = load_json(os.path.join(splits_folder, f"random_sieved.json"))
    sample = split_file_cleaned["test"][0]
    frags = load_pickle(os.path.join(frags_folder, f"{sample}"))

    print(dataset)
    print(frags["precursor_type"])
    print(frags["collision_energy"])

    if dataset == "nist2023":
        print(frags["instrument"])
    print(frags["instrument_type"])
    print() 



canopus
[M+H]+
35.0
Orbitrap (LCMS)

massspecgym
[M+H]+
30.0
Orbitrap

nist2023
[M+H]+
20
Thermo Finnigan Elite Orbitrap
HCD



`` Let us look at how much training data is left when we only train with clean subset of the data ``

In [None]:
table = [] 
ori_table = []

for dataset in ["canopus", "massspecgym", "nist2023"]:

    labels_file = f"/data/rbg/users/klingmin/projects/MS_processing/data_BM/mist/{dataset}/labels.tsv"
    labels = pd.read_csv(labels_file, sep = "\t").values
    labels = {str(r[1]): r[6][:14] for r in labels}

    splits_folder = f"/data/rbg/users/klingmin/projects/MS_processing/data_splits/{dataset}/splits"

    splits = ["inchikey_vanilla", "scaffold_vanilla", "random"]

    for split in splits: 

        split_file = load_json(os.path.join(splits_folder, f"{split}.json"))
        split_file_cleaned = load_json(os.path.join(splits_folder, f"{split}_sieved.json"))

        train_ori, val_ori, test_ori = len(split_file["train"]), len(split_file["val"]), len(split_file["test"])
        train_mol_ori = get_unique_mol(split_file["train"], labels)
        val_mol_ori = get_unique_mol(split_file["val"], labels)
        test_mol_ori = get_unique_mol(split_file["test"], labels)

        train_new, val_new, test_new = len(split_file_cleaned["train"]), len(split_file_cleaned["val"]), len(split_file_cleaned["test"])
        percent_drop_train = round((train_new) / train_ori * 100, 3)

        table.append([dataset, split, train_ori, train_new, percent_drop_train, test_ori, test_new])
        ori_table.append([dataset, split, train_ori, train_mol_ori, val_ori, val_mol_ori, test_ori, test_mol_ori])
        print()

table = pd.DataFrame(table)
table.columns = ["dataset", "split", "n_train (before)", "n_train (after)", "percentage selected", "n_test (before)", "n_test (after)"]
table












Unnamed: 0,dataset,split,n_train (before),n_train (after),percentage selected,n_test (before),n_test (after)
0,canopus,inchikey_vanilla,12908,121,0.937,2690,12
1,canopus,scaffold_vanilla,12909,114,0.883,2689,30
2,canopus,random,12800,110,0.859,2744,24
3,massspecgym,inchikey_vanilla,76242,19437,25.494,16043,3917
4,massspecgym,scaffold_vanilla,76243,18942,24.844,16042,4290
5,massspecgym,random,75828,19263,25.404,16250,4149
6,nist2023,inchikey_vanilla,643637,48893,7.596,137840,10399
7,nist2023,scaffold_vanilla,643637,48046,7.465,137840,10543
8,nist2023,random,643521,48845,7.59,137898,10519


`` Look at the original data statistics ``

In [4]:
ori_table = pd.DataFrame(ori_table)
ori_table.columns = ["dataset", "split", "n_train_spectra", "n_unique_train_mol", "n_val_spectra", "n_unique_val_mol", "n_test_spectra", "n_unique_test_mol"]
ori_table

Unnamed: 0,dataset,split,n_train_spectra,n_unique_train_mol,n_val_spectra,n_unique_val_mol,n_test_spectra,n_unique_test_mol
0,canopus,inchikey_vanilla,12908,5937,2688,1313,2690,1271
1,canopus,scaffold_vanilla,12909,5809,2688,1371,2689,1343
2,canopus,random,12800,6577,2742,2072,2744,2049
3,massspecgym,inchikey_vanilla,76242,11828,16041,2355,16043,2524
4,massspecgym,scaffold_vanilla,76243,11452,16041,2666,16042,2869
5,massspecgym,random,75828,15705,16248,7676,16250,7660
6,nist2023,inchikey_vanilla,643637,18522,137839,3924,137840,3946
7,nist2023,scaffold_vanilla,643637,18771,137839,3889,137840,3730
8,nist2023,random,643521,26198,137897,21779,137898,21718


`` Look at how the performance has changed based on the sampling strategy `` 

In [5]:
datasets = ["massspecgym", "nist2023"]
folders = ["../FP_prediction/mist/best_models", "../FP_prediction/baseline_models/best_models"]
results = []

for folder in folders: 
        
    for dataset in datasets:

        sieved_folder = os.path.join(folder, f"{dataset}_sieved")
        original_folder = os.path.join(folder, f"{dataset}")

        for checkpoint in os.listdir(sieved_folder):
            
            model = checkpoint.split("_")[1]
            original_checkpoint = [f for f in os.listdir(original_folder) if "_".join(checkpoint.split("_")[-3:-1]) in f]
            if len(original_checkpoint) == 0: continue
            original_checkpoint = original_checkpoint[0]
            
            original_performance = load_json(os.path.join(original_folder, original_checkpoint, "test_performance.json"))
            sieved_performance = load_json(os.path.join(sieved_folder, checkpoint, "test_performance.json"))["jaccard"]

            if "jaccard_subset" in original_performance: 
                original_results_subset_jaccard = original_performance["jaccard_subset"] 
            
            else:

                sieved_results = load_pickle(os.path.join(sieved_folder, checkpoint, "test_results.pkl"))
                sieved_keys = sieved_results.keys()
                
                original_results = load_pickle(os.path.join(original_folder, original_checkpoint, "test_results.pkl"))
            
                original_results_subset = {k: v for k,v in original_results.items() if k in sieved_keys}
                original_results_subset_jaccard = get_jaccard_score(original_results_subset)

                # Add in the performance of the subset into the original results 
                original_performance["jaccard_subset"] = original_results_subset_jaccard
                write_json(original_performance, os.path.join(original_folder, original_checkpoint, "test_performance.json"))

            split = " ".join(checkpoint.split("_")[-3:-1]).replace("4096", "")
            results.append([dataset, model, split, round(sieved_performance,3), round(original_results_subset_jaccard,3)])

In [6]:
results = pd.DataFrame(results)
results.columns = ['dataset',  'model', 'split', 'Standard experiment conditions', 'All experiment condition']
# current = results.loc[results.loc[:, "model"] == "MS", :]
current = results
current = current.loc[current.loc[:, "dataset"] == "nist2023", :]
current

Unnamed: 0,dataset,model,split,Standard experiment conditions,All experiment condition
3,nist2023,MIST,inchikey vanilla,0.304,0.302
4,nist2023,MIST,random,0.621,0.62
5,nist2023,MIST,scaffold vanilla,0.257,0.257
15,nist2023,formula,scaffold vanilla,0.211,0.21
16,nist2023,formula,random,0.64,0.466
17,nist2023,MS,random,0.638,0.466
18,nist2023,MS,inchikey vanilla,0.227,0.228
19,nist2023,binned,inchikey vanilla,0.23,0.228
20,nist2023,formula,inchikey vanilla,0.227,0.228
21,nist2023,binned,scaffold vanilla,0.207,0.21
