In [14]:
from pathlib import Path

import torch

from protera_stability.config.lazy import LazyCall as L
from protera_stability.config.common.mlp import mlp_esm
from protera_stability.train import get_cfg, setup_diversity, setup_data

# Baseline sampling settings; create_cfg forwards these as keyword
# arguments to setup_diversity.
exp_params = dict(
    diversity_cutoff=0.866,
    random_percent=0.15,
    sampling_method="diversity",
    experiment_name="base",
)

def create_cfg(exp_params):
    """Assemble the experiment config for one sampling setup.

    Starts from ``get_cfg(args={})``, applies ``setup_diversity`` with the
    entries of ``exp_params`` as keyword arguments, attaches the ESM MLP
    model config (2048 units, lazily-constructed GELU activation) and
    finally runs ``setup_data`` on the result.

    Args:
        exp_params: Mapping whose items are forwarded to ``setup_diversity``
            as keyword arguments.

    Returns:
        Whatever ``setup_data`` returns for the assembled config.
    """
    config = setup_diversity(get_cfg(args={}), **exp_params)

    # NOTE: these assignments modify the imported `mlp_esm` object itself,
    # so the change is visible to anything else holding a reference to it.
    mlp_esm.n_units = 2048
    mlp_esm.act = L(torch.nn.GELU)()
    config.model = mlp_esm

    return setup_data(config)

# Build the config for the baseline parameters and show its top-level keys
# as the cell output.
cfg = create_cfg(exp_params)
cfg.keys()

dict_keys(['trainer_params', 'output_dir', 'random_split', 'experiment', 'model', 'dataloader'])

In [10]:
from protera_stability.train import do_train
# Train with the config built above; the returned trainer is kept so the
# next cell can call its test() method.
trainer = do_train(cfg)

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  rank_zero_warn(f"No correct seed found, seed set to {seed}")
Global seed set to 634843590
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


=== USING diversity as Sampling Method ===
=== USING 6137 out of 8204 samples ===
=== SIZE WAS DETERMINED BY CUTOFF ===



  | Name     | Type       | Params
----------------------------------------
0 | model    | ProteinMLP | 4.7 M 
1 | train_r2 | R2Score    | 0     
2 | valid_r2 | R2Score    | 0     
3 | test_r2  | R2Score    | 0     
----------------------------------------
4.7 M     Trainable params
0         Non-trainable params
4.7 M     Total params
18.891    Total estimated model params size (MB)


                                                                      

Global seed set to 634843590




Global seed set to 634843590


Epoch 14: 100%|██████████| 28/28 [00:02<00:00, 14.17it/s, loss=0.246, v_num=2, train/r2=0.732, train/loss=0.254, valid/r2=0.695, valid/loss=0.279]


In [11]:
# Run the test loop on the trainer returned by do_train; the return value is
# displayed as the cell output.
trainer.test()

  rank_zero_deprecation(
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Testing:  20%|██        | 1/5 [00:00<00:01,  2.16it/s]--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test/loss': 0.3312093913555145,
 'test/r2': 0.6877349615097046,
 'test/r2_step': 0.6877349615097046}
--------------------------------------------------------------------------------
Testing: 100%|██████████| 5/5 [00:00<00:00,  8.12it/s]


<protera_stability.engine.default.DefaultTrainer at 0x7fb435418fa0>

In [13]:
# Repeat the train/test cycle with random sampling. A derived dict is built
# instead of assigning into `exp_params`, so the shared baseline parameters
# stay unchanged and re-running earlier cells gives the same result.
random_params = {**exp_params, "sampling_method": "random"}
cfg = create_cfg(random_params)

trainer = do_train(cfg)
trainer.test()

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name     | Type       | Params
----------------------------------------
0 | model    | ProteinMLP | 4.7 M 
1 | train_r2 | R2Score    | 0     
2 | valid_r2 | R2Score    | 0     
3 | test_r2  | R2Score    | 0     
----------------------------------------
4.7 M     Trainable params
0         Non-trainable params
4.7 M     Total params
18.891    Total estimated model params size (MB)


=== USING random as Sampling Method ===
=== USING 2461 out of 8204 samples ===
=== SIZE WAS DETERMINED BY RANDOM PERCENT OF 0.15 ===
                                                                      

Global seed set to 634843590


Epoch 8: 100%|██████████| 14/14 [00:02<00:00,  7.28it/s, loss=0.334, v_num=1, train/r2=0.674, train/loss=0.333, valid/r2=0.756, valid/loss=0.235]


  rank_zero_deprecation(
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Testing:  20%|██        | 1/5 [00:00<00:02,  1.72it/s]--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test/loss': 0.23753789067268372,
 'test/r2': 0.7481482028961182,
 'test/r2_step': 0.7481482028961182}
--------------------------------------------------------------------------------
Testing: 100%|██████████| 5/5 [00:00<00:00,  6.32it/s]


<protera_stability.engine.default.DefaultTrainer at 0x7fb423bff520>

In [None]:
from tqdm import tqdm
import pickle

def run_experiment(data_path, epochs, max_randoms, cutoffs):
    """Sweep sampling settings, training and testing one model per setting.

    Two sweeps run back to back: first random sampling, once per value in
    ``max_randoms`` (used as ``random_percent``); then diversity sampling,
    once per value in ``cutoffs`` (used as ``diversity_cutoff``). After each
    run the value returned by ``trainer.test()`` is pickled to
    ``../logs/stability_random_<value>/test.pkl`` or
    ``../logs/stability_cut_<value>/test.pkl`` respectively.

    Args:
        data_path: Not read by this function; kept so existing callers that
            pass it keep working.
        epochs: Not read by this function; kept so existing callers that
            pass it keep working.
        max_randoms: Iterable of ``random_percent`` values for the random
            sampling sweep.
        cutoffs: Iterable of ``diversity_cutoff`` values for the diversity
            sampling sweep.

    Returns:
        None. Results are written to disk.
    """
    exp_params = {
        "diversity_cutoff": 0.866,
        "random_percent": 0.15,
        "sampling_method": "diversity",
        "experiment_name": "base",
    }

    def _train_test_and_dump(run_name):
        """Train/test with the current exp_params and pickle the test results."""
        cfg = create_cfg(exp_params)
        trainer = do_train(cfg)
        test_results = trainer.test()

        # Create the run directory if needed so the dump cannot fail on a
        # missing folder, and close the file handle deterministically.
        out_dir = Path("../logs") / run_name
        out_dir.mkdir(parents=True, exist_ok=True)
        with open(out_dir / "test.pkl", "wb") as handle:
            pickle.dump(test_results, handle)

    print("=== RUNNING RANDOM SAMPLING ===")
    for random_percent in tqdm(max_randoms):
        exp_params["sampling_method"] = "random"
        exp_params["random_percent"] = random_percent
        _train_test_and_dump(f"stability_random_{random_percent}")

    print("=== RUNNING DIVERSITY SAMPLING ===")
    # NOTE(review): exp_params["random_percent"] still holds the last value
    # from the sweep above while the diversity runs execute; confirm whether
    # that carry-over is intended.
    for cut in tqdm(cutoffs):
        exp_params["sampling_method"] = "diversity"
        exp_params["diversity_cutoff"] = cut
        _train_test_and_dump(f"stability_cut_{cut}")

In [None]:
random_percents = [0.8, 0.5, 0.25, 0.15, 0.1]
cutoffs = [0.82, 0.83, 0.85, 0.86, 0.87, 0.875, 0.878]

# `data_path` is not defined in any visible cell, so referencing it here
# raised NameError on a fresh kernel. run_experiment does not read the
# argument (nor `epochs`), so None is passed explicitly.
run_experiment(
    data_path=None, epochs=150, max_randoms=random_percents, cutoffs=cutoffs
)