In [1]:
import torch
import torch.optim as opt
import numpy as np
from torch.utils.data import DataLoader
from vae.base import BaseVAE
from vae.datasets import SABRExFeatsDataset
from vae.cvae_with_mem import CVAEMem
from vae.utils import *
from ray import tune
from ray.tune.schedulers import ASHAScheduler
import flaml
import json, time, os

In [2]:
# Load the SABR volatility surfaces and the matching return series.
# Indexing an .npz archive reads the array into memory, so both arrays are
# materialised inside the `with` block and the archive's file handle is closed
# right afterwards instead of staying open for the life of the kernel.
with np.load("data/sabr_surface_with_ret.npz") as data:
    vol_surf_data = data["surface"]
    ret_data = data["ret"]

In [3]:
# perform model evaluation in terms of the accuracy and f1 score.
def model_eval_new(model: BaseVAE, dataloader):
    model.eval() # switch to eval model, will turn off randomness like dropout
    eval_loss = 0
    num_batches = 0
    for step, batch in enumerate(dataloader):
        try:
            batch.to(model.device)
        except:
            pass

        losses = model.test_step(batch)

        eval_loss += losses["loss"].item()
        num_batches += 1

    return eval_loss / num_batches

def train_new(model: BaseVAE, train_dataloader: DataLoader, valid_dataloader: DataLoader, 
          lr=1e-5, epochs=100, 
          model_dir="./", file_name="vanilla.pt"):
    """Train ``model`` with AdamW, yielding ``(train_loss, dev_loss)`` after each epoch.

    This is a generator: nothing runs until it is iterated. Each epoch runs
    ``model.train_step`` over ``train_dataloader``, evaluates on
    ``valid_dataloader`` via ``model_eval_new``, and saves the weights whenever
    the validation loss improves. Progress is written to
    ``<model_dir>/<prefix>-<epochs>-log.txt``, where ``<prefix>`` is the part of
    ``file_name`` before its first ``"."``.

    Args:
        model: model exposing ``parameters()``, ``train()``, ``config``,
            ``device``, ``train_step(batch, optimizer)`` and
            ``save_weights(optimizer, model_dir, file_prefix)``.
        train_dataloader: batches used for optimisation.
        valid_dataloader: batches used to compute the validation loss.
        lr: AdamW learning rate.
        epochs: number of passes over ``train_dataloader``.
        model_dir: directory for the log and saved weights (created if missing).
        file_name: base name for the saved weights and the log file.

    Yields:
        tuple: ``(train_loss, dev_loss)`` for the epoch that just finished, each
        an average of per-batch losses.

    Returns:
        The last yielded pair (exposed as ``StopIteration.value``), or
        ``(None, None)`` when ``epochs`` is 0.

    Raises:
        ValueError: if ``train_dataloader`` yields no batches.
    """
    model.train()
    optimizer = opt.AdamW(model.parameters(), lr)
    best_dev_loss = np.inf
    # Pre-bind so the final `return` is valid even when the loop body never runs.
    train_loss = dev_loss = None

    os.makedirs(model_dir, exist_ok=True)
    # split(".")[0] returns the whole name when it contains no dot.
    file_prefix = file_name.split(".")[0]
    log_path = os.path.join(model_dir, f"{file_prefix}-{epochs}-log.txt")

    # The `with` block closes the log even when the consumer stops iterating
    # early and the generator is closed at a `yield`.
    with open(log_path, "w", encoding="utf-8") as log_file:
        print("Model config: ", file=log_file)
        print(json.dumps(model.config, indent=True), file=log_file)
        print(f"LR: {lr}", file=log_file)
        print(f"Epochs: {epochs}", file=log_file)
        print("", file=log_file)

        start_time = time.time()
        ## run for the specified number of epochs
        for epoch in range(epochs):
            epoch_start_time = time.time()
            model.train()
            running_loss = 0.0
            num_batches = 0
            for batch in train_dataloader:
                # `.to()` returns the moved object rather than moving in place,
                # so rebind it. Batches without `.to` (e.g. dicts) pass through.
                move = getattr(batch, "to", None)
                if callable(move):
                    moved = move(model.device)
                    if moved is not None:
                        batch = moved

                losses = model.train_step(batch, optimizer)

                running_loss += losses["loss"].item()
                num_batches += 1

            if num_batches == 0:
                raise ValueError("train_dataloader yielded no batches; nothing to train on")
            train_loss = running_loss / num_batches

            dev_loss = model_eval_new(model, valid_dataloader)

            # Keep only the weights with the best validation loss so far.
            if dev_loss < best_dev_loss:
                best_dev_loss = dev_loss
                model.save_weights(optimizer, model_dir, file_prefix)

            print(f"epoch {epoch}: train loss :: {train_loss :.3f}, dev loss :: {dev_loss :.3f}, time elapsed :: {time.time() - epoch_start_time}", file=log_file)
            # Flush before handing control back so the epoch line is on disk
            # even if the caller never resumes this generator.
            log_file.flush()
            yield train_loss, dev_loss

        print(f"training finished, total time :: {time.time() - start_time}", file=log_file)
    return train_loss, dev_loss

In [4]:
def train_wrapper(config):
    """Ray Tune trainable: build a CVAEMem from ``config``, train it, report losses.

    Reads the module-level ``vol_surf_data`` / ``ret_data`` arrays, using rows
    0-4999 for training and rows 5000-5999 for validation. After every epoch
    the train and validation losses are reported to Tune as ``train_loss`` and
    ``dev_loss``.

    Args:
        config: sampled hyperparameters. Keys read here: ``seq_len``,
            ``ctx_len``, ``latent_dim``, ``kl_weight``, ``re_feat_weight``,
            ``surface_hidden``, ``ctx_surface_hidden``, ``mem_hidden``,
            ``mem_layers``, ``mem_dropout``, ``batch_size``, ``lr`` and
            ``num_epochs``.
    """
    seq_len = config["seq_len"]
    batch_size = config["batch_size"]

    model_config = {
        "seq_len": seq_len, 
        "feat_dim": (5, 5),
        "latent_dim": config["latent_dim"],
        # Use the GPU when there is one; otherwise fall back to CPU rather than
        # failing at model construction.
        "device": "cuda" if torch.cuda.is_available() else "cpu",
        "kl_weight": config["kl_weight"],
        "re_feat_weight": config["re_feat_weight"],
        # Cast layer sizes to plain ints in case the sampler produced numpy integers.
        "surface_hidden": [int(v) for v in config["surface_hidden"]],
        "ex_feats_dim": 1,
        "ex_feats_hidden": None,
        "mem_type": "lstm",
        "mem_hidden": config["mem_hidden"],
        "mem_layers": config["mem_layers"],
        "mem_dropout": config["mem_dropout"],
        "ctx_len": config["ctx_len"], 
        "ctx_surface_hidden": [int(v) for v in config["ctx_surface_hidden"]], 
        "ctx_ex_feats_hidden": None,
    }

    train_dataset = SABRExFeatsDataset(vol_surf_data[:5000], ret_data[:5000], seq_len, dtype=torch.float32)
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)

    # Validation is not shuffled: dev_loss is a mean of per-batch losses, so a
    # fixed batch composition keeps the tuning metric comparable across epochs.
    valid_dataset = SABRExFeatsDataset(vol_surf_data[5000:6000], ret_data[5000:6000], seq_len, dtype=torch.float32)
    valid_dataloader = DataLoader(valid_dataset, shuffle=False, batch_size=batch_size)

    model = CVAEMem(model_config)
    # NOTE(review): every trial writes to the same relative "models" path and
    # file name; confirm trials run in separate working directories so they do
    # not overwrite each other's checkpoints and logs.
    # num_epochs is rounded because a search algorithm may hand back a float.
    for train_loss, dev_loss in train_new(model, train_dataloader, valid_dataloader, 
                                 config["lr"], int(round(config["num_epochs"])), 
                                 "models", "cond_conv2d_lstm.pt"):
        tune.report(train_loss=train_loss, dev_loss=dev_loss)

In [7]:
def surface_hidden_size_sampler(config):
    """Sample a random stack of hidden-layer sizes.

    The depth is drawn first (1 to 5 layers), then one size per layer
    (1 to 10 each). ``config`` is accepted for the ``tune.sample_from``
    callback signature and is not used.

    Returns:
        tuple: the sampled layer sizes, in draw order.
    """
    depth = np.random.randint(1, 6)
    return tuple(np.random.randint(1, 11) for _ in range(depth))


def seq_len_sampler(config):
    """Draw a context length that is strictly shorter than the sampled ``seq_len``.

    Used as a ``tune.sample_from`` callback. Whatever this function returns is
    used as the hyperparameter value itself, so it must return a concrete
    integer, not another search-space object such as ``tune.randint(...)``.

    Args:
        config: the spec passed by Tune; the already-sampled hyperparameters
            are read from ``config["config"]``.

    Returns:
        int: uniform draw from ``[1, seq_len)``.

    Raises:
        ValueError: from numpy when ``seq_len`` is 1 or less (empty range).
    """
    # modify this for different choices
    seq_len = config["config"]["seq_len"]
    return int(np.random.randint(1, seq_len))

# Hyperparameter search space handed to the tuners below.
# Key order is kept as-is; `ctx_len` is derived from the sampled `seq_len`
# through seq_len_sampler, and both hidden-layer stacks share one sampler.
config = dict(
    # optimisation
    lr=tune.loguniform(1e-5, 1e-3),
    num_epochs=tune.choice([10, 25, 50, 100]),
    batch_size=tune.choice([16, 32, 64]),
    # sequence / context window
    seq_len=tune.choice([2, 5, 7, 10, 30]),
    ctx_len=tune.sample_from(seq_len_sampler),
    # latent space and loss weights
    latent_dim=tune.randint(1, 101),
    re_feat_weight=tune.loguniform(1, 1000),
    kl_weight=tune.uniform(0, 1),
    # encoder for the surface
    surface_hidden=tune.sample_from(surface_hidden_size_sampler),
    # memory module
    mem_hidden=tune.randint(25, 101),
    mem_layers=tune.randint(1, 11),
    mem_dropout=tune.uniform(0, 1),
    # encoder for the context surfaces
    ctx_surface_hidden=tune.sample_from(surface_hidden_size_sampler),
)

In [6]:
# Hyperparameter search driven by flaml on top of Ray, minimising "dev_loss".
# NOTE: the output recorded for this cell ends in
# "ValueError: Failed to evaluate expression: ('config', 'ctx_len')" — the
# `tune.sample_from` entry of the search space could not be evaluated here.
flaml_run_kwargs = dict(
    config=config,
    metric="dev_loss",
    mode="min",
    low_cost_partial_config={"num_epochs": 10},
    # Use asha scheduler to perform early stopping based on intermediate results reported
    scheduler="asha",
    resources_per_trial={"cpu": 1, "gpu": 1},
    local_dir="logs/",
    num_samples=15,
    use_ray=True,
)
result = flaml.tune.run(tune.with_parameters(train_wrapper), **flaml_run_kwargs)

Using CFO for search. To use BlendSearch, run: pip install flaml[blendsearch]
2023-05-08 23:17:20,130	ERROR services.py:1197 -- Failed to start the dashboard 
2023-05-08 23:17:20,131	ERROR services.py:1222 -- Error should be written to 'dashboard.log' or 'dashboard.err'. We are printing the last 20 lines for you. See 'https://docs.ray.io/en/master/ray-observability/ray-logging.html#logging-directory-structure' to find where the log file is.
2023-05-08 23:17:20,132	ERROR services.py:1232 -- Couldn't read dashboard.log file. Error: [Errno 2] No such file or directory: 'C:\\Users\\yunta\\AppData\\Local\\Temp\\ray\\session_2023-05-08_23-16-57_429752_15356\\logs\\dashboard.log'. It means the dashboard is broken even before it initializes the logger (mostly dependency issues). Reading the dashboard.err file which contains stdout/stderr.
2023-05-08 23:17:20,133	ERROR services.py:1266 -- Failed to read dashboard.err file: cannot mmap an empty file. It is unexpected. Please report an issue to R

ValueError: Failed to evaluate expression: ('config', 'ctx_len'): <ray.tune.search.sample.Function object at 0x00000203004EA080>

In [6]:
# Ray Tune search with ASHA early stopping, minimising the reported "dev_loss".
scheduler = ASHAScheduler(max_t=100, grace_period=1, reduction_factor=2)

# Each trial is given 2 CPUs and one full GPU.
trainable_with_resources = tune.with_resources(
    tune.with_parameters(train_wrapper),
    resources={"cpu": 2, "gpu": 1.0},
)

asha_tune_config = tune.TuneConfig(
    metric="dev_loss",
    mode="min",
    scheduler=scheduler,
    num_samples=15,
)

tuner = tune.Tuner(
    trainable_with_resources,
    tune_config=asha_tune_config,
    param_space=config,
)
results = tuner.fit()

2023-05-08 22:42:23,944	ERROR services.py:1197 -- Failed to start the dashboard 
2023-05-08 22:42:23,947	ERROR services.py:1222 -- Error should be written to 'dashboard.log' or 'dashboard.err'. We are printing the last 20 lines for you. See 'https://docs.ray.io/en/master/ray-observability/ray-logging.html#logging-directory-structure' to find where the log file is.
2023-05-08 22:42:23,949	ERROR services.py:1232 -- Couldn't read dashboard.log file. Error: [Errno 2] No such file or directory: 'C:\\Users\\yunta\\AppData\\Local\\Temp\\ray\\session_2023-05-08_22-42-01_358295_17328\\logs\\dashboard.log'. It means the dashboard is broken even before it initializes the logger (mostly dependency issues). Reading the dashboard.err file which contains stdout/stderr.
2023-05-08 22:42:23,952	ERROR services.py:1266 -- Failed to read dashboard.err file: cannot mmap an empty file. It is unexpected. Please report an issue to Ray github. https://github.com/ray-project/ray/issues
2023-05-08 22:42:25,683	I

2023-05-08 22:43:25,717	ERROR tune.py:941 -- Trials did not complete: [train_wrapper_23d90_00000, train_wrapper_23d90_00001, train_wrapper_23d90_00002, train_wrapper_23d90_00003, train_wrapper_23d90_00004, train_wrapper_23d90_00005, train_wrapper_23d90_00006, train_wrapper_23d90_00007, train_wrapper_23d90_00008, train_wrapper_23d90_00009, train_wrapper_23d90_00010, train_wrapper_23d90_00011, train_wrapper_23d90_00012, train_wrapper_23d90_00013, train_wrapper_23d90_00014]
2023-05-08 22:43:25,718	INFO tune.py:945 -- Total run time: 57.31 seconds (55.65 seconds for the tuning loop).
Continue running this experiment with: Tuner.restore(path="C:\Users\yunta\ray_results\train_wrapper_2023-05-08_22-41-57", trainable=...)
