`` This notebook builds several strategies for sampling training data: random sampling, CF-based sampling, and sampling based on influence function scores ``

Import libraries

In [2]:
import os
import random
import pickle
import numpy as np
from tqdm import tqdm 
import torch

from utils import load_pickle, pickle_data, load_json, write_json

Settings

In [1]:
# Fractions of the training ids that are kept when building each sampled split
ratio_list = [
    0.05, 0.10, 0.20, 0.30, 0.35,
    0.40, 0.45, 0.50, 0.55, 0.60,
    0.70, 0.75, 0.80, 0.85, 0.90,
]

# Root prefixes shared by the folders below
ms_processing_root = "/data/rbg/users/klingmin/projects/MS_processing"
fp_prediction_root = "/data/rbg/users/klingmin/projects/ML_MS_analysis/FP_prediction"

# All the required folders
splits_folder = f"{ms_processing_root}/data_splits/"
frags_folder = f"{ms_processing_root}/data/"
CF_folder = f"{ms_processing_root}/CFs"
baseline_folder = f"{fp_prediction_root}/baseline_models/best_models"
mist_folder = f"{fp_prediction_root}/mist/best_models"
results_folders = [baseline_folder, mist_folder]

# Datasets that every cell below iterates over
datasets = ["massspecgym", "nist2023"]

Helper Functions

In [2]:
def score_train_CF_oov(OOV_list, train_ids, frags_folder):
    """Count, per training id, how many of its distinct predicted fragments are in ``OOV_list``.

    For every id the pickle at ``os.path.join(frags_folder, id_)`` is loaded, the
    ``["comment"]["f_pred"]`` value of each entry under ``"peaks"`` is collected,
    empty strings and duplicates are dropped, and the remaining values that also
    appear in ``OOV_list`` are counted.

    Args:
        OOV_list: Collection of fragment values to match against. Despite the
            name, a set works as well as a list; items must be hashable.
        train_ids: Ids to score; each one is used as a file name inside ``frags_folder``.
        frags_folder: Folder holding one pickle per id.

    Returns:
        dict mapping each id to an integer count (not a fraction) of matching fragments.
    """

    # Build the lookup once: with a list, every membership test below would be a linear scan
    if isinstance(OOV_list, (set, frozenset)):
        oov_lookup = OOV_list
    else:
        oov_lookup = set(OOV_list)

    scores = {}
    for id_ in tqdm(train_ids):

        peaks = load_pickle(os.path.join(frags_folder, id_))["peaks"]

        # Distinct, non-empty predictions for this id
        frags = {peak["comment"]["f_pred"] for peak in peaks}
        frags = {f for f in frags if f != ""}

        scores[id_] = sum(f in oov_lookup for f in frags)

    return scores

`` Get upper bound performance ``

In [None]:
# UB 1: FT on all available FT data 
for dataset in tqdm(datasets):

    split_file = load_json(os.path.join(splits_folder, dataset, "splits", "sampling_split.json"))

    # This cell runs before the random-sampling cell that creates this folder,
    # so make sure it exists before anything is written into it
    output_split_folder = os.path.join(splits_folder, dataset, "splits_sampling", "random")
    os.makedirs(output_split_folder, exist_ok = True)

    # 100% of the sampling split's training ids, val / test unchanged
    output_path = os.path.join(output_split_folder, "sampled_random_100.json")
    new_split = {"train": split_file["train"], 
                 "val": split_file["val"],
                 "test": split_file["test"]}
    
    write_json(new_split, output_path)
        
    # Add full set: original training ids followed by the sampling split's training ids
    original_train = load_json(os.path.join(splits_folder, dataset, "splits", "scaffold_vanilla_sieved.json"))["train"]
    output_path = os.path.join(output_split_folder, "sampled_random_100_combined.json")
    full_set = {"train": original_train + split_file["train"],
                "val": split_file["val"], 
                "test": split_file["test"]}

    write_json(full_set, output_path)

100%|██████████| 2/2 [00:06<00:00,  3.45s/it]


`` Sampling strategy 1: sample randomly ``

In [24]:
# V1: sample randomly 
# Base seed for the random draws, so that newly written splits can be regenerated exactly.
# Files that already exist on disk are skipped below and are therefore not affected.
random_sampling_seed = 0

for dataset in tqdm(datasets):

    output_split_folder = os.path.join(splits_folder, dataset, "splits_sampling", "random")
    split_file = load_json(os.path.join(splits_folder, dataset, "splits", "sampling_split.json"))

    train_ids, val_ids, test_ids = split_file["train"], split_file["val"], split_file["test"]

    os.makedirs(output_split_folder, exist_ok = True)

    for ratio in tqdm(ratio_list):

        output_path = os.path.join(output_split_folder, f"sampled_random_{int(ratio*100)}.json")
        if os.path.exists(output_path): continue 

        # One dedicated generator per (dataset, ratio): the draw then depends neither on
        # which other files were skipped nor on earlier uses of the global `random` state
        rng = random.Random(f"{random_sampling_seed}-{dataset}-{ratio}")
        train_ids_sampled = rng.sample(train_ids, int(len(train_ids) * ratio))

        new_split = {"train": train_ids_sampled, 
                     "val": val_ids,
                     "test": test_ids}

        write_json(new_split, output_path)
    
# V2: sample randomly + add original data 
for dataset in tqdm(datasets):

    random_split_folder = os.path.join(splits_folder, dataset, "splits_sampling", "random")
    original_train = load_json(os.path.join(splits_folder, dataset, "splits", "scaffold_vanilla_sieved.json"))["train"]

    for ratio in tqdm(ratio_list):

        percent = int(ratio*100)
        output_path = os.path.join(random_split_folder, f"sampled_random_{percent}_combined.json")

        # Check before reading anything, so that an already-built split costs no JSON load
        if os.path.exists(output_path): continue

        sampled_split = load_json(os.path.join(random_split_folder, f"sampled_random_{percent}.json"))

        # Sampled ids first, then the original training ids; val / test are carried over
        new_split = {"train": sampled_split["train"] + original_train, 
                     "val": sampled_split["val"],
                     "test": sampled_split["test"]}
        
        write_json(new_split, output_path)

100%|██████████| 15/15 [00:00<00:00, 19418.07it/s]
100%|██████████| 15/15 [00:00<00:00, 16029.19it/s]
100%|██████████| 2/2 [00:00<00:00,  5.92it/s]
100%|██████████| 15/15 [00:01<00:00, 11.25it/s]
100%|██████████| 15/15 [00:02<00:00,  7.25it/s]
100%|██████████| 2/2 [00:03<00:00,  1.79s/it]


`` Sampling strategy 2: sample based on CF ``

In [26]:
# # V1: sample based on CF
# for dataset in tqdm(datasets):

#     output_split_folder = os.path.join(splits_folder, dataset, "splits_sampling", "CF")
#     split_file = load_json(os.path.join(splits_folder, dataset, "splits", "sampling_split.json"))
    
#     current_CF_folder = os.path.join(CF_folder, dataset, "scaffold_vanilla_sieved_split")
#     current_frags_folder = os.path.join(frags_folder, dataset, "frags_preds")
#     train_CFs, test_CFs = load_pickle(os.path.join(current_CF_folder, "train_CFs.pkl")), load_pickle(os.path.join(current_CF_folder, "test_CFs.pkl"))
#     train_mist_CFs, test_mist_CFs = load_pickle(os.path.join(current_CF_folder, "train_MIST_CFs.pkl")), load_pickle(os.path.join(current_CF_folder, "test_MIST_CFs.pkl"))

#     # Get the OOVs for this set 
#     OOV = test_CFs - train_CFs
#     OOV_mist = test_mist_CFs - train_mist_CFs

#     train_ids, val_ids, test_ids = split_file["train"], split_file["val"], split_file["test"]
#     if not os.path.exists(output_split_folder): os.makedirs(output_split_folder)

#     # Get the percent of peaks with CFs in the OOV list for train and sample based on that
#     train_scores = score_train_CF_oov(OOV, train_ids, current_frags_folder)
#     train_scores_mist = score_train_CF_oov(OOV_mist, train_ids, current_frags_folder)

#     train_scores = sorted(train_scores.items(), key = lambda x: x[1], reverse = True)
#     train_scores_mist = sorted(train_scores_mist.items(), key = lambda x: x[1], reverse = True)
    
#     for ratio in tqdm(ratio_list):

#         output_path = os.path.join(output_split_folder, f"sampled_CF_{int(ratio*100)}.json")
#         output_MIST_path = os.path.join(output_split_folder, f"sampled_CF_MIST_{int(ratio*100)}.json")

#         if os.path.exists(output_path): 
#             continue 
#         else:
#             train_ids_sampled = [t[0] for t in train_scores[:int(len(train_ids) * ratio)]]
#             new_split = {"train": train_ids_sampled, 
#                          "val": val_ids,
#                          "test": test_ids}

#             write_json(new_split, output_path)

#         if os.path.exists(output_MIST_path): 
#             continue 
#         else:
#             train_ids_sampled = [t[0] for t in train_scores_mist[:int(len(train_ids) * ratio)]]
#             new_split = {"train": train_ids_sampled, 
#                          "val": val_ids,
#                          "test": test_ids}

#             write_json(new_split, output_MIST_path)

# V2: sample based on CF + add original data 
for dataset in tqdm(datasets):

    CF_split_folder = os.path.join(splits_folder, dataset, "splits_sampling", "CF")
    original_train = load_json(os.path.join(splits_folder, dataset, "splits", "scaffold_vanilla_sieved.json"))["train"]

    # The plain and the MIST version only differ in the file-name prefix;
    # all ratios of the plain version are processed first, then the MIST version
    for prefix in ["sampled_CF", "sampled_CF_MIST"]:

        for ratio in tqdm(ratio_list):

            percent = int(ratio*100)
            output_path = os.path.join(CF_split_folder, f"{prefix}_{percent}_combined.json")

            # Check before reading anything, so that an already-built split costs no JSON load
            if os.path.exists(output_path): continue 

            sampled_split = load_json(os.path.join(CF_split_folder, f"{prefix}_{percent}.json"))

            # Sampled ids first, then the original training ids; val / test are carried over
            new_split = {"train": sampled_split["train"] + original_train, 
                         "val": sampled_split["val"],
                         "test": sampled_split["test"]}
            
            write_json(new_split, output_path)

100%|██████████| 15/15 [00:01<00:00, 10.12it/s]
100%|██████████| 15/15 [00:01<00:00,  7.93it/s]
100%|██████████| 15/15 [00:01<00:00,  8.10it/s]
100%|██████████| 15/15 [00:01<00:00,  8.72it/s]
100%|██████████| 2/2 [00:07<00:00,  3.57s/it]


`` Sampling strategy 3: sample based on influence score ``

`` 3a: Using a separate validation set to select training data ``

In [35]:
# # V1: sample based on IF (val) 
# for dataset in tqdm(datasets):
    
#     if dataset == "massspecgym": continue
    
#     output_split_folder = os.path.join(splits_folder, dataset, "splits_sampling", "IF_val")
#     split_file = load_json(os.path.join(splits_folder, dataset, "splits", "sampling_split.json"))
#     val_ids, test_ids = split_file["val"], split_file["test"]

#     if not os.path.exists(output_split_folder): os.makedirs(output_split_folder)

#     for folder in results_folders:

#         current_result_folder = os.path.join(folder, f"{dataset}_sieved")
#         all_checkpoints = [os.path.join(current_result_folder, f) for f in os.listdir(current_result_folder) if "scaffold" in f]

#         for checkpoint in all_checkpoints:

#             if "EK-FAC_scores_for_sampling_val.pkl" not in os.listdir(checkpoint): continue 

#             model_name = checkpoint.split("/")[-1].split("_")[1]
#             influence_scores = load_pickle(os.path.join(checkpoint, "EK-FAC_scores_for_sampling_val.pkl"))["all_modules"]
#             IF_scores_aggregated = torch.sum(influence_scores, dim = 0)
#             train_ids_FT = load_pickle(os.path.join(checkpoint, "train_ids_FT.pkl"))
            
#             for ratio in tqdm(ratio_list):

#                 output_path = os.path.join(output_split_folder, f"sampled_IF_val_{model_name}_{int(ratio*100)}.json")
#                 if os.path.exists(output_path): continue 
#                 print(f"Processing {output_path} now")
#                 train_idx_sampled = IF_scores_aggregated.topk(k = int(ratio * IF_scores_aggregated.shape[0])).indices
#                 train_ids_sampled = [train_ids_FT[i] for i in train_idx_sampled]

#                 new_split = {"train": train_ids_sampled, 
#                             "val": val_ids,
#                             "test": test_ids}

#                 write_json(new_split, output_path)

# V2: sample based on IF (val) + add original data 
for dataset in tqdm(datasets):

    dataset_split_root = os.path.join(splits_folder, dataset)
    IF_val_folder = os.path.join(dataset_split_root, "splits_sampling", "IF_val")
    original_train = load_json(os.path.join(dataset_split_root, "splits", "scaffold_vanilla_sieved.json"))["train"]

    for model_name in ["binned", "formula", "MS", "MIST"]:

        for ratio in tqdm(ratio_list):

            percent = int(ratio*100)
            sampled_path = os.path.join(IF_val_folder, f"sampled_IF_val_{model_name}_{percent}.json")
            combined_path = os.path.join(IF_val_folder, f"sampled_IF_val_{model_name}_{percent}_combined.json")

            # Only combinations whose sampled split exists on disk are processed.
            # There is no skip-if-exists check on the output, so an existing combined file is rewritten.
            if os.path.exists(sampled_path):

                sampled = load_json(sampled_path)

                # Sampled ids first, then the original training ids; val / test are carried over
                combined = {"train": sampled["train"] + original_train,
                            "val": sampled["val"],
                            "test": sampled["test"]}

                write_json(combined, combined_path)

100%|██████████| 15/15 [00:00<00:00, 21.96it/s]
100%|██████████| 15/15 [00:00<00:00, 26.51it/s]
100%|██████████| 15/15 [00:00<00:00, 23.34it/s]
100%|██████████| 15/15 [00:00<00:00, 21.39it/s]
100%|██████████| 15/15 [00:00<00:00, 15401.36it/s]
100%|██████████| 15/15 [00:00<00:00, 41971.02it/s]
100%|██████████| 15/15 [00:00<00:00, 42027.09it/s]
100%|██████████| 15/15 [00:01<00:00, 12.18it/s]
100%|██████████| 2/2 [00:03<00:00,  1.93s/it]


`` 3b: Using the test set to select training data directly ``

In [46]:
# for dataset in tqdm(datasets):

#     if dataset == "massspecgym": continue
    
#     output_split_folder = os.path.join(splits_folder, dataset, "splits_sampling", "IF_test")
#     split_file = load_json(os.path.join(splits_folder, dataset, "splits", "sampling_split.json"))
#     val_ids, test_ids = split_file["val"], split_file["test"]

#     if not os.path.exists(output_split_folder): os.makedirs(output_split_folder)

#     for folder in results_folders:

#         current_result_folder = os.path.join(folder, f"{dataset}_sieved")
#         all_checkpoints = [os.path.join(current_result_folder, f) for f in os.listdir(current_result_folder) if "scaffold" in f]

#         for checkpoint in all_checkpoints:

#             if "EK-FAC_scores_for_sampling_test.pkl" not in os.listdir(checkpoint): continue 

#             model_name = checkpoint.split("/")[-1].split("_")[1]
#             influence_scores = load_pickle(os.path.join(checkpoint, "EK-FAC_scores_for_sampling_test.pkl"))["all_modules"]
#             IF_scores_aggregated = torch.sum(influence_scores, dim = 0)
#             train_ids_FT = load_pickle(os.path.join(checkpoint, "train_ids_FT.pkl"))
                        
#             for ratio in tqdm(ratio_list):

#                 output_path = os.path.join(output_split_folder, f"sampled_IF_test_{model_name}_{int(ratio*100)}.json")
#                 if os.path.exists(output_path): continue 
#                 print(f"Processing {output_path} now")
                
#                 train_idx_sampled = IF_scores_aggregated.topk(k = int(ratio * IF_scores_aggregated.shape[0])).indices
#                 train_ids_sampled = [train_ids_FT[i] for i in train_idx_sampled]

#                 new_split = {"train": train_ids_sampled, 
#                             "val": val_ids,
#                             "test": test_ids}

#                 write_json(new_split, output_path)

# V2: sample based on IF (test) + add original data 
for dataset in tqdm(datasets):

    dataset_split_root = os.path.join(splits_folder, dataset)
    IF_test_folder = os.path.join(dataset_split_root, "splits_sampling", "IF_test")
    original_train = load_json(os.path.join(dataset_split_root, "splits", "scaffold_vanilla_sieved.json"))["train"]

    for model_name in ["binned", "formula", "MS", "MIST"]:

        for ratio in tqdm(ratio_list):

            percent = int(ratio*100)
            sampled_path = os.path.join(IF_test_folder, f"sampled_IF_test_{model_name}_{percent}.json")
            combined_path = os.path.join(IF_test_folder, f"sampled_IF_test_{model_name}_{percent}_combined.json")

            # Only combinations whose sampled split exists on disk are processed.
            # There is no skip-if-exists check on the output, so an existing combined file is rewritten.
            if os.path.exists(sampled_path):

                sampled = load_json(sampled_path)

                # Sampled ids first, then the original training ids; val / test are carried over
                combined = {"train": sampled["train"] + original_train,
                            "val": sampled["val"],
                            "test": sampled["test"]}

                write_json(combined, combined_path)

100%|██████████| 15/15 [00:05<00:00,  2.55it/s]
100%|██████████| 15/15 [00:00<00:00, 17.80it/s]
100%|██████████| 15/15 [00:00<00:00, 15.27it/s]
100%|██████████| 15/15 [00:01<00:00,  9.02it/s]
100%|██████████| 15/15 [00:00<00:00, 2757.48it/s]
100%|██████████| 15/15 [00:00<00:00, 3278.85it/s]
100%|██████████| 15/15 [00:00<00:00, 3504.21it/s]
100%|██████████| 15/15 [00:02<00:00,  7.19it/s]
100%|██████████| 2/2 [00:11<00:00,  5.85s/it]
