Import libraries

In [1]:
import os
import json
import pickle
import numpy as np
import pandas as pd

Helper Functions

In [2]:
def load_json(path):
    """Read the JSON file at `path` and return the parsed object.

    The file is decoded explicitly as UTF-8 so that reading does not depend
    on the platform's locale encoding and matches what `write_json` writes.
    """

    with open(path, "r", encoding = "UTF-8") as f:
        return json.load(f)

def write_json(data, path):
    """Serialise `data` as JSON into the file at `path`.

    The file is opened in write mode with UTF-8 encoding, so any existing
    content at `path` is replaced. Nothing is returned.
    """
    with open(path, "w", encoding="UTF-8") as handle:
        json.dump(data, handle)

def load_pickle(path):
    """Unpickle and return the object stored in the file at `path`.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    from a trusted source.
    """
    with open(path, "rb") as handle:
        obj = pickle.load(handle)
    return obj


def to_binary(FP, threshold):
    """Binarise `FP` elementwise: 1 where the value is strictly greater than
    `threshold`, 0 otherwise.

    Parameters
    ----------
    FP : array-like
        Values to threshold. Objects that already provide `astype` (e.g. a
        NumPy array) are used as they are; anything else, such as a plain
        list or tuple, is first converted with `np.asarray`.
    threshold : scalar
        Cut-off; values equal to the threshold map to 0.

    Returns
    -------
    The integer-typed result of `(FP > threshold).astype(int)`, with the same
    shape as the input.
    """

    # Plain Python sequences do not support elementwise comparison with a
    # scalar, so convert them; array types are left untouched.
    if not hasattr(FP, "astype"):
        FP = np.asarray(FP)

    return (FP > threshold).astype(int)

def jaccard_index(FP_pred, FP):
    """Jaccard index (intersection over union) of two fingerprints.

    Both inputs are interpreted through their truthiness: any non-zero entry
    counts as a set bit.

    Parameters
    ----------
    FP_pred : array-like
        Predicted fingerprint.
    FP : array-like
        Reference fingerprint.

    Returns
    -------
    np.float64
        |FP AND FP_pred| / |FP OR FP_pred|, or 0.0 when neither input has
        any set bit (empty union).
    """

    # Intersection = bitwise AND
    intersection = np.logical_and(FP, FP_pred).sum()

    # Union = bitwise OR
    union = np.logical_or(FP, FP_pred).sum()

    # Guard only the degenerate empty-union case. Adding an epsilon to every
    # denominator would bias all scores slightly downward (identical
    # fingerprints would score just below 1.0).
    if union == 0:
        return np.float64(0.0)

    return intersection / union

def get_jaccard_score(results, threshold = 0.5):
    """Mean Jaccard index over all entries of `results`.

    Parameters
    ----------
    results : dict
        Maps an identifier to a record with a "pred" entry (converted to a
        NumPy array and binarised with `to_binary`) and a "GT" entry (passed
        to `jaccard_index` unchanged; presumably already binary - verify
        against the code that writes the results).
    threshold : float, optional
        Cut-off handed to `to_binary` for the predictions. Defaults to 0.5.

    Returns
    -------
    The arithmetic mean of the per-entry Jaccard indices.

    Raises
    ------
    ValueError
        If `results` is empty, since the mean is undefined.
    """

    # Fail with a clear message instead of a bare ZeroDivisionError below.
    if len(results) == 0:
        raise ValueError("Cannot compute a mean Jaccard score: `results` is empty.")

    total = 0

    for v in results.values():

        pred = to_binary(np.array(v["pred"]), threshold)
        total += jaccard_index(pred, v["GT"])

    return total / len(results)

`` Let us look at how much training data is left when we train only on the clean subset of the data ``

In [3]:
# Compare the number of training examples before and after sieving for every
# dataset / split combination, and report the percentage that was dropped.
rows = []

for dataset in ["massspecgym", "nist2023"]:

    # NOTE(review): absolute, machine-specific path - this cell only runs where
    # this directory exists. Consider moving it to a configurable constant.
    splits_folder = f"/data/rbg/users/klingmin/projects/MS_processing/data_splits/{dataset}/splits"

    splits = ["inchikey_vanilla", "scaffold_vanilla", "random"]

    for split in splits:

        # "<split>.json" is the original split; "<split>_sieved.json" is the cleaned one.
        split_file = load_json(os.path.join(splits_folder, f"{split}.json"))
        split_file_cleaned = load_json(os.path.join(splits_folder, f"{split}_sieved.json"))

        # Only the training portion is reported in the table.
        train_ori = len(split_file["train"])
        train_new = len(split_file_cleaned["train"])

        percent_drop_train = round((train_ori - train_new) / train_ori * 100, 3)

        rows.append([dataset, split, train_ori, train_new, percent_drop_train])

table = pd.DataFrame(
    rows,
    columns = ["dataset", "split", "n_train (before)", "n_train (after)", "percentage drop"],
)
table









Unnamed: 0,dataset,split,n_train (before),n_train (after),percentage drop
0,massspecgym,inchikey_vanilla,76242,19437,74.506
1,massspecgym,scaffold_vanilla,76243,18942,75.156
2,massspecgym,random,75828,19263,74.596
3,nist2023,inchikey_vanilla,643637,48893,92.404
4,nist2023,scaffold_vanilla,643637,48046,92.535
5,nist2023,random,643521,48845,92.41


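`` Look at how the performance changes with the sampling strategy. For each sieved checkpoint, the first number printed is the `jaccard` value stored in its test performance file, and the second is the Jaccard score of the matching original checkpoint restricted to the test entries that are also present in the sieved results. ``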

In [5]:
# For every sieved checkpoint, find the matching original checkpoint and print
# the sieved model's stored Jaccard next to the original model's Jaccard
# computed only on the test entries that also appear in the sieved results.
datasets = ["massspecgym", "nist2023"]
folder = "../FP_prediction/mist/best_models"

for dataset in datasets:

    sieved_folder = os.path.join(folder, f"{dataset}_sieved")
    original_folder = os.path.join(folder, f"{dataset}")

    # os.listdir returns entries in arbitrary order; sort so that both the
    # printed order and the "first match" picked below are reproducible.
    original_checkpoints = sorted(os.listdir(original_folder))

    for checkpoint in sorted(os.listdir(sieved_folder)):

        # Match on the two underscore-separated tokens that precede the last
        # one in the sieved checkpoint name (e.g. "scaffold_vanilla", or
        # "4096_random" for the random split).
        split_tag = "_".join(checkpoint.split("_")[-3:-1])
        matches = [f for f in original_checkpoints if split_tag in f]
        if len(matches) == 0: continue  # no original counterpart: skip silently
        original_checkpoint = matches[0]

        original_performance_path = os.path.join(original_folder, original_checkpoint, "test_performance.json")

        original_performance = load_json(original_performance_path)
        sieved_performance = load_json(os.path.join(sieved_folder, checkpoint, "test_performance.json"))["jaccard"]

        if "jaccard_subset" in original_performance:
            # Reuse the value cached by a previous run of this cell.
            # NOTE(review): the cache is not invalidated if the sieved test
            # set changes - delete the key to force a recomputation.
            original_results_subset_jaccard = original_performance["jaccard_subset"]

        else:

            # NOTE: load_pickle unpickles these files; only use trusted results.
            sieved_results = load_pickle(os.path.join(sieved_folder, checkpoint, "test_results.pkl"))
            sieved_keys = sieved_results.keys()

            original_results = load_pickle(os.path.join(original_folder, original_checkpoint, "test_results.pkl"))

            # Keep only the original predictions whose key is also in the sieved results.
            original_results_subset = {k: v for k, v in original_results.items() if k in sieved_keys}
            original_results_subset_jaccard = get_jaccard_score(original_results_subset)

            # Add in the performance of the subset into the original results
            original_performance["jaccard_subset"] = original_results_subset_jaccard
            write_json(original_performance, original_performance_path)

        print(checkpoint)
        print(round(sieved_performance,3), round(original_results_subset_jaccard,3))
        print()

MSG_MIST_sieved_4096_scaffold_vanilla_sieved
0.245 0.254

MSG_MIST_sieved_4096_inchikey_vanilla_sieved
0.276 0.29

MSG_MIST_sieved_4096_random_sieved
0.423 0.603

NIST2023_MIST_sieved_4096_inchikey_vanilla_sieved
0.304 0.302

NIST2023_MIST_sieved_4096_random_sieved
0.621 0.62

NIST2023_MIST_sieved_4096_scaffold_vanilla_sieved
0.257 0.257



What are the influences? Are they the molecules that look the same, or are they those with the same experimental conditions? Or are they outside of our influence groups?


If our theory is right, we should see that the positive influences are those with the same conditions, and the negative influences are those with the wrong conditions.

We could think about which conditions are similar - we can look at the particular influences; red on red - but now red on yellow.

Question: maybe there are some differences in conditions that are not important - even if they are seemingly different, adding them to the training set would not harm the performance - it would in fact improve the performance.