In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch  
import torch.nn as nn
import torch.nn.functional as F
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler
from ray.tune import ExperimentAnalysis


  from .autonotebook import tqdm as notebook_tqdm


# Neural Net Class and Training Functions
Define Class and functions

In [2]:
def data_loaders():
   # Import data
    dir_X = '/Users/donokoye/Documents/Spring_23/ML_1.C51/Final_Project/ev_adoption_ml/Data/df_X_county.csv'
    dir_y = '/Users/donokoye/Documents/Spring_23/ML_1.C51/Final_Project/ev_adoption_ml/Data/df_y_county.csv'
    
    X = pd.read_csv(dir_X)
    #X['constant'] = 1
    y = pd.read_csv(dir_y)

    # check if any nan values
    nan_row_X = X[X.isna().any(axis=1)]
    #print(nan_row_X)
    nan_row_y = y[y.isna().any(axis=1)]
    #print(nan_row_y)

    X = X.to_numpy()
    y = y.to_numpy()

    # only predict PHEV
    #y = y[:,0]

    # split train/test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    

    # standardize X
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test

def train_model(nn_model, data_loaded, opt, batch_size=32):
    
    '''
    Trains neural network model on X_train, y_train data.
    
    Returns
    ----------
    nn_model: torch.nn.Module
        trained neural network model
    '''
    # convert to tensors (for Pytorch)
    X_train, X_eval, y_train, y_eval = data_loaded
    X_train_tensor = torch.tensor(X_train)
    y_train_tensor = torch.tensor(y_train)
    X_test_tensor = torch.tensor(X_eval)
    y_test_tensor = torch.tensor(y_eval)
    
    # train with (mini-batch) SGD; initialize optimizer
    n_samples, n_features = X_train_tensor.shape
    # initialize mse loss function
    mse_loss = torch.nn.MSELoss()
    nn_model.train()  # put model in train mode
    
    # loop through data in batches
    for batch_start in range(0, n_samples, batch_size):
    # reset gradients to zero
        opt.zero_grad()
        # form batch
        X_batch = X_train_tensor[batch_start:batch_start+batch_size]
        y_batch = y_train_tensor[batch_start:batch_start+batch_size]
        X_batch_test = X_test_tensor[batch_start:batch_start+batch_size]
        y_batch_test = y_test_tensor[batch_start:batch_start+batch_size]
        # pass batch through neural net to get prediction
        y_pred = nn_model(X_batch.float())
        y_pred = y_pred.unsqueeze(1)
        y_pred_test = nn_model(X_batch_test.float())
        y_pred_test = y_pred_test.unsqueeze(1)
        # compute MSE loss
        loss = mse_loss(y_pred, y_batch[:, None].float())
        loss_test = mse_loss(y_pred_test, y_batch_test[:, None].float())
        # back-propagate loss
        loss.backward()
        # update model parameters based on backpropogated gradients - clip values to avoid exploding gradients
        torch.nn.utils.clip_grad_value_(nn_model.parameters(), clip_value=1.5)
        opt.step()
        
        #print(f"Mean Train MSE: {epoch_loss}")
        
    return loss, loss_test

def evaluate_model(nn_model, X_eval, y_eval, batch_size=32):
    '''
    Evaluates trained neural network model on X_eval, y_eval data.

    Parameters
    ----------
    nn_model: torch.nn.Module
        trained neural network model
    X_eval: np.array
        matrix of training data features
    y_eval: np.array
        vector of training data labels
    batch_size: int
        batch size to looping over dataset to generate predictions

    Returns
    ----------
    mse: float
        MSE of trained model on X_eval, y_eval data
    '''
    # initialize mse loss function
    mse_loss = torch.nn.MSELoss()
    # convert to tensors (for Pytorch)
    X_eval_tensor = torch.tensor(X_eval)
    y_eval_tensor = torch.tensor(y_eval)
    n_samples = X_eval_tensor.shape[0]
    nn_model.eval() # put in eval mode
    # loop over data and generate predictions
    preds = []
    for batch_start in range(0, n_samples, batch_size):
        # form batch
        X_batch = X_eval_tensor[batch_start:batch_start+batch_size]
        y_batch = y_eval_tensor[batch_start:batch_start+batch_size]
        with torch.no_grad():  # no need to compute gradients during evaluation
            # pass batch through neural net to get prediction
            y_pred = nn_model(X_batch.float())
            y_pred = y_pred.unsqueeze(1)
            preds.append(y_pred)
    # compute MSE across all samples
    all_preds = torch.cat(preds)
    loss = mse_loss(all_preds, y_eval_tensor[:, None].float()).item()
    return loss

def train_and_validate(config):

    '''Parameters
    ----------
    X_train: np.array
        matrix of training data features
    y_train: np.array
        vector of training data labels
    max_iter: int
        maximum number of iterations to train for
    batch_size: int
        batch size to use when training w/ SGD
    '''
    # intialize neural network
    data_loaded = data_loaders()
    X_train, X_eval, y_train, y_eval = data_loaded
    print(y_eval)
    n_samples, n_features = X_train.shape
    nn_model = NN_configureable(n_features, config["n_hidden_dim"], config["n_layers"])

    opt = torch.optim.SGD(nn_model.parameters(), lr=config["lr"],  momentum=0.9)
    #opt = torch.optim.SGD(nn_model.parameters(), lr=config["lr"])
    batch_size = config["batch_size"]
    max_iter = config["train_iterations"]

    # Start the training.
    for it in range(max_iter):
        # save losses across all batches
        train_epoch_loss, test_epoch_loss = train_model(nn_model, data_loaded, opt, batch_size)
        valid_epoch_loss = evaluate_model(nn_model, X_eval, y_eval, batch_size)

        with tune.checkpoint_dir(it) as checkpoint_dir:
                path = os.path.join(checkpoint_dir, 'checkpoint')
                torch.save((nn_model.state_dict(), opt.state_dict()), path)
        tune.report(
            loss=valid_epoch_loss)

class NN(nn.Module):
    '''
    Class for fully connected neural net.
    '''
    def __init__(self, input_dim, hidden_dim):
        '''
        Parameters
        ----------
        input_dim: int
            input dimension (i.e., # of features in each example passed to the network)
        hidden_dim: int
            number of nodes in hidden layer
        '''
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.layers = nn.Sequential(
            # Network has a single hidden layer
            # Apply ReLU activation in between the hidden layer and output node
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
            nn.ReLU()
        )

    def forward(self, x):
        x = self.layers(x)
        return x


class NN_configureable(nn.Module):
    '''
    Class for fully connected neural net.
    '''
    def __init__(self, input_dim, hidden_dim, hidden_layers):
        '''
        Parameters
        ----------
        input_dim: int
            input dimension (i.e., # of features in each example passed to the network)
        hidden_dim: int
            number of nodes in hidden layer
        '''
        super().__init__()
        self.input_dim = input_dim
        self.hidden_layers = hidden_layers
        #self.layers = nn.ModuleDict()
        self.layers = nn.ModuleDict()
        
        # Define input layer
        self.layers["input"] = nn.Linear(in_features = input_dim, out_features = hidden_dim)
        # Define hidden layers
        for i in range(self.hidden_layers):
            self.layers[f"hidden_{i}"] = nn.Linear(in_features = hidden_dim, out_features = hidden_dim)
        # Define output layer
        self.layers["output"] = nn.Linear(in_features = hidden_dim, out_features = 1)

    def forward(self, x):
        x = self.layers["input"](x)
        for i in range(self.hidden_layers):
            x = F.relu(self.layers[f"hidden_{i}"](x))

        return self.layers["output"](x)        
        

# Search Function for Ray Tune
Hyperparameter search



# Import Data
Import combined data

In [5]:
def main():   
    
    # Search Function for Ray Tune - Hyperparameter search
     
    #X = pd.read_csv('./Data/df_X_county.csv')
    #print(X.head)
     
    X_train, X_eval, y_train, y_eval = data_loaders()
    n_samples, n_features = X_train.shape

    # Define the parameter search configuration.
    config = {
        "n_layers": 
            #tune.sample_from(lambda _: 2 ** np.random.randint(1, 5)),
            tune.grid_search([1, 2, 3, 4, 8]),
        "n_hidden_dim": 
            #tune.sample_from(lambda _: 2 ** np.random.randint(4, 8)),
            tune.grid_search([2, 4, 8, n_features]),
        "lr": tune.loguniform(1e-5, 1e-3),
        "batch_size": tune.choice([32, 64]),
        "train_iterations": tune.choice([50, 100, 200])
    }

    max_num_iter = 50
    grace_period = 1
    # Number of Ray Tune random search experiments to run.
    num_samples = 20
    
    # Schduler to stop bad performing trails.
    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t = max_num_iter,
        grace_period = grace_period,
        reduction_factor = 2 
    )

    # Reporter to show on command line/output window
    reporter = CLIReporter(
        metric_columns=["loss", "accuracy", "training_iteration"])


    # Start Ray Tune search
    result = tune.run(
        train_and_validate,
        resources_per_trial = {"cpu": 2, "gpu": 0},
        config = config,
        num_samples = num_samples,
        scheduler = scheduler,
        local_dir = '../outputs/raytune_result',
        keep_checkpoints_num = 1,
        checkpoint_score_attr = 'min-validation_loss',
        progress_reporter = reporter)

    # Extract the best trial run from the search.
    best_trial = result.get_best_trial('loss', 'min', 'last')
    print(f"Best trial config: {best_trial.config}")
    print(f"Best trial final validation loss: {best_trial.last_result['loss']}")
    
    #result.dataframe().csv("df_raytune_search.csv")

    '''
    plt.xlabel('Iteration Step')
    plt.ylabel('Test Error')
    plt.title("Model 1 - Hidden Layer - ReLU")
    plt.legend()
    plt.figure(figsize=(20,12))
    plt.show()
'''

In [6]:
if __name__ == '__main__':
    main()

== Status ==
Current time: 2023-05-13 10:05:23 (running for 00:00:00.06)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 32.000: None | Iter 16.000: None | Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 2.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/donokoye/Documents/Spring_23/ML_1.C51/Final_Project/outputs/raytune_result/train_and_validate_2023-05-13_10-05-23
Number of trials: 16/400 (15 PENDING, 1 RUNNING)
+--------------------------------+----------+----------------+--------------+-------------+----------------+------------+--------------------+
| Trial name                     | status   | loc            |   batch_size |          lr |   n_hidden_dim |   n_layers |   train_iterations |
|--------------------------------+----------+----------------+--------------+-------------+----------------+------------+--------------------|
| train_and_validate_3405a_00000 | RUNNING  | 127.0.0.1:4181 |           32 | 1.08674e-05 |              2 | 

Trial name,date,done,hostname,iterations_since_restore,loss,node_ip,pid,should_checkpoint,time_since_restore,time_this_iter_s,time_total_s,timestamp,training_iteration,trial_id
train_and_validate_3405a_00000,2023-05-13_10-05-29,True,Dons-MacBook-Pro.local,50,575827.0,127.0.0.1,4181,True,4.18271,0.0796368,4.18271,1683986729,50,3405a_00000
train_and_validate_3405a_00001,2023-05-13_10-05-30,True,Dons-MacBook-Pro.local,50,275717.0,127.0.0.1,4182,True,3.28376,0.066803,3.28376,1683986730,50,3405a_00001
train_and_validate_3405a_00002,2023-05-13_10-05-27,True,Dons-MacBook-Pro.local,1,657199.0,127.0.0.1,4183,True,0.092442,0.092442,0.092442,1683986727,1,3405a_00002
train_and_validate_3405a_00003,2023-05-13_10-05-27,True,Dons-MacBook-Pro.local,1,655789.0,127.0.0.1,4184,True,0.0522969,0.0522969,0.0522969,1683986727,1,3405a_00003
train_and_validate_3405a_00004,2023-05-13_10-05-27,True,Dons-MacBook-Pro.local,1,666613.0,127.0.0.1,4184,True,0.0993128,0.0993128,0.0993128,1683986727,1,3405a_00004
train_and_validate_3405a_00005,2023-05-13_10-05-27,True,Dons-MacBook-Pro.local,1,664488.0,127.0.0.1,4183,True,0.0560126,0.0560126,0.0560126,1683986727,1,3405a_00005
train_and_validate_3405a_00006,2023-05-13_10-05-27,True,Dons-MacBook-Pro.local,1,667616.0,127.0.0.1,4184,True,0.0516701,0.0516701,0.0516701,1683986727,1,3405a_00006
train_and_validate_3405a_00007,2023-05-13_10-05-34,True,Dons-MacBook-Pro.local,50,93325.1,127.0.0.1,4183,True,7.16021,0.130332,7.16021,1683986734,50,3405a_00007
train_and_validate_3405a_00008,2023-05-13_10-05-27,True,Dons-MacBook-Pro.local,1,667701.0,127.0.0.1,4184,True,0.154349,0.154349,0.154349,1683986727,1,3405a_00008
train_and_validate_3405a_00009,2023-05-13_10-05-32,True,Dons-MacBook-Pro.local,32,296522.0,127.0.0.1,4184,True,5.18435,0.16744,5.18435,1683986732,32,3405a_00009


2023-05-13 10:05:25,291	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/train_iterations']
2023-05-13 10:05:25,361	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/t

[2m[36m(train_and_validate pid=4181)[0m [[ 13.][32m [repeated 8x across cluster][0m
[2m[36m(train_and_validate pid=4181)[0m  [122.][32m [repeated 16x across cluster][0m
[2m[36m(train_and_validate pid=4181)[0m  [ 10.]][32m [repeated 24x across cluster][0m
[2m[36m(train_and_validate pid=4181)[0m  ...[32m [repeated 8x across cluster][0m


2023-05-13 10:05:25,432	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/train_iterations']
2023-05-13 10:05:25,433	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/t

== Status ==
Current time: 2023-05-13 10:05:28 (running for 00:00:05.07)
Using AsyncHyperBand: num_stopped=6
Bracket: Iter 32.000: -637892.9375 | Iter 16.000: -480404.140625 | Iter 8.000: -488468.140625 | Iter 4.000: -322882.109375 | Iter 2.000: -374520.296875 | Iter 1.000: -660843.125
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/donokoye/Documents/Spring_23/ML_1.C51/Final_Project/outputs/raytune_result/train_and_validate_2023-05-13_10-05-23
Number of trials: 26/400 (16 PENDING, 4 RUNNING, 6 TERMINATED)
+--------------------------------+------------+----------------+--------------+-------------+----------------+------------+--------------------+--------+----------------------+
| Trial name                     | status     | loc            |   batch_size |          lr |   n_hidden_dim |   n_layers |   train_iterations |   loss |   training_iteration |
|--------------------------------+------------+----------------+--------------+-------------+----------------+-----

2023-05-13 10:05:28,623	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/train_iterations']
2023-05-13 10:05:28,624	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/t

[2m[36m(train_and_validate pid=4181)[0m [[ 13.][32m [repeated 16x across cluster][0m
[2m[36m(train_and_validate pid=4181)[0m  [122.][32m [repeated 32x across cluster][0m
[2m[36m(train_and_validate pid=4181)[0m  [ 10.]][32m [repeated 48x across cluster][0m
[2m[36m(train_and_validate pid=4181)[0m  ...[32m [repeated 16x across cluster][0m


2023-05-13 10:05:30,479	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/train_iterations']
2023-05-13 10:05:30,527	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/t

== Status ==
Current time: 2023-05-13 10:05:33 (running for 00:00:10.08)
Using AsyncHyperBand: num_stopped=30
Bracket: Iter 32.000: -293375.25 | Iter 16.000: -297537.46875 | Iter 8.000: -300090.765625 | Iter 4.000: -331992.0 | Iter 2.000: -368753.0625 | Iter 1.000: -666796.90625
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/donokoye/Documents/Spring_23/ML_1.C51/Final_Project/outputs/raytune_result/train_and_validate_2023-05-13_10-05-23
Number of trials: 50/400 (16 PENDING, 4 RUNNING, 30 TERMINATED)
+--------------------------------+------------+----------------+--------------+-------------+----------------+------------+--------------------+--------+----------------------+
| Trial name                     | status     | loc            |   batch_size |          lr |   n_hidden_dim |   n_layers |   train_iterations |   loss |   training_iteration |
|--------------------------------+------------+----------------+--------------+-------------+----------------+-----------

2023-05-13 10:05:33,668	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/train_iterations']
2023-05-13 10:05:33,669	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/t

[2m[36m(train_and_validate pid=4183)[0m [[ 13.][32m [repeated 26x across cluster][0m
[2m[36m(train_and_validate pid=4183)[0m  [122.][32m [repeated 52x across cluster][0m
[2m[36m(train_and_validate pid=4183)[0m  [ 10.]][32m [repeated 78x across cluster][0m
[2m[36m(train_and_validate pid=4183)[0m  ...[32m [repeated 26x across cluster][0m


2023-05-13 10:05:35,587	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/train_iterations']
2023-05-13 10:05:35,595	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/t

== Status ==
Current time: 2023-05-13 10:05:38 (running for 00:00:15.08)
Using AsyncHyperBand: num_stopped=64
Bracket: Iter 32.000: -288973.46875 | Iter 16.000: -295342.9375 | Iter 8.000: -300722.84375 | Iter 4.000: -312892.8125 | Iter 2.000: -407617.484375 | Iter 1.000: -666980.6875
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/donokoye/Documents/Spring_23/ML_1.C51/Final_Project/outputs/raytune_result/train_and_validate_2023-05-13_10-05-23
Number of trials: 84/400 (16 PENDING, 4 RUNNING, 64 TERMINATED)
+--------------------------------+------------+----------------+--------------+-------------+----------------+------------+--------------------+----------+----------------------+
| Trial name                     | status     | loc            |   batch_size |          lr |   n_hidden_dim |   n_layers |   train_iterations |     loss |   training_iteration |
|--------------------------------+------------+----------------+--------------+-------------+----------------+--

2023-05-13 10:05:38,501	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/train_iterations']
2023-05-13 10:05:38,501	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/t

[2m[36m(train_and_validate pid=4184)[0m [[ 13.][32m [repeated 30x across cluster][0m
[2m[36m(train_and_validate pid=4184)[0m  [122.][32m [repeated 60x across cluster][0m
[2m[36m(train_and_validate pid=4184)[0m  [ 10.]][32m [repeated 90x across cluster][0m
[2m[36m(train_and_validate pid=4184)[0m  ...[32m [repeated 30x across cluster][0m


2023-05-13 10:05:40,687	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/train_iterations']
2023-05-13 10:05:40,715	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/t

== Status ==
Current time: 2023-05-13 10:05:43 (running for 00:00:20.09)
Using AsyncHyperBand: num_stopped=72
Bracket: Iter 32.000: -176116.390625 | Iter 16.000: -195472.5 | Iter 8.000: -299458.6875 | Iter 4.000: -312388.0 | Iter 2.000: -386188.828125 | Iter 1.000: -666974.65625
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/donokoye/Documents/Spring_23/ML_1.C51/Final_Project/outputs/raytune_result/train_and_validate_2023-05-13_10-05-23
Number of trials: 92/400 (16 PENDING, 4 RUNNING, 72 TERMINATED)
+--------------------------------+------------+----------------+--------------+-------------+----------------+------------+--------------------+----------+----------------------+
| Trial name                     | status     | loc            |   batch_size |          lr |   n_hidden_dim |   n_layers |   train_iterations |     loss |   training_iteration |
|--------------------------------+------------+----------------+--------------+-------------+----------------+-------

2023-05-13 10:05:43,478	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/train_iterations']
2023-05-13 10:05:43,480	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/t

[2m[36m(train_and_validate pid=4183)[0m [[ 13.][32m [repeated 17x across cluster][0m
[2m[36m(train_and_validate pid=4183)[0m  [122.][32m [repeated 34x across cluster][0m
[2m[36m(train_and_validate pid=4183)[0m  [ 10.]][32m [repeated 51x across cluster][0m
[2m[36m(train_and_validate pid=4183)[0m  ...[32m [repeated 17x across cluster][0m


2023-05-13 10:05:45,751	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/train_iterations']
2023-05-13 10:05:45,816	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/t

== Status ==
Current time: 2023-05-13 10:05:48 (running for 00:00:25.15)
Using AsyncHyperBand: num_stopped=96
Bracket: Iter 32.000: -174144.078125 | Iter 16.000: -190969.5078125 | Iter 8.000: -278603.03125 | Iter 4.000: -311750.4375 | Iter 2.000: -376374.3125 | Iter 1.000: -666990.71875
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/donokoye/Documents/Spring_23/ML_1.C51/Final_Project/outputs/raytune_result/train_and_validate_2023-05-13_10-05-23
Number of trials: 116/400 (16 PENDING, 4 RUNNING, 96 TERMINATED)
+--------------------------------+------------+----------------+--------------+-------------+----------------+------------+--------------------+----------+----------------------+
| Trial name                     | status     | loc            |   batch_size |          lr |   n_hidden_dim |   n_layers |   train_iterations |     loss |   training_iteration |
|--------------------------------+------------+----------------+--------------+-------------+---------------

2023-05-13 10:05:48,716	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/train_iterations']
2023-05-13 10:05:48,718	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/t

[2m[36m(train_and_validate pid=4184)[0m [[ 13.][32m [repeated 13x across cluster][0m
[2m[36m(train_and_validate pid=4184)[0m  [122.][32m [repeated 26x across cluster][0m
[2m[36m(train_and_validate pid=4184)[0m  [ 10.]][32m [repeated 39x across cluster][0m
[2m[36m(train_and_validate pid=4184)[0m  ...[32m [repeated 13x across cluster][0m


2023-05-13 10:05:50,790	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/train_iterations']
2023-05-13 10:05:50,791	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/t

== Status ==
Current time: 2023-05-13 10:05:53 (running for 00:00:30.20)
Using AsyncHyperBand: num_stopped=110
Bracket: Iter 32.000: -145827.42578125 | Iter 16.000: -178730.3046875 | Iter 8.000: -278266.65625 | Iter 4.000: -311750.4375 | Iter 2.000: -407617.484375 | Iter 1.000: -666957.6875
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/donokoye/Documents/Spring_23/ML_1.C51/Final_Project/outputs/raytune_result/train_and_validate_2023-05-13_10-05-23
Number of trials: 130/400 (16 PENDING, 4 RUNNING, 110 TERMINATED)
+--------------------------------+------------+----------------+--------------+-------------+----------------+------------+--------------------+----------+----------------------+
| Trial name                     | status     | loc            |   batch_size |          lr |   n_hidden_dim |   n_layers |   train_iterations |     loss |   training_iteration |
|--------------------------------+------------+----------------+--------------+-------------+----------

2023-05-13 10:05:53,660	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/train_iterations']
2023-05-13 10:05:53,661	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/t

[2m[36m(train_and_validate pid=4182)[0m [[ 13.][32m [repeated 24x across cluster][0m
[2m[36m(train_and_validate pid=4182)[0m  [122.][32m [repeated 48x across cluster][0m
[2m[36m(train_and_validate pid=4182)[0m  [ 10.]][32m [repeated 72x across cluster][0m
[2m[36m(train_and_validate pid=4182)[0m  ...[32m [repeated 24x across cluster][0m


2023-05-13 10:05:55,972	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/train_iterations']
2023-05-13 10:05:56,027	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/t

== Status ==
Current time: 2023-05-13 10:05:58 (running for 00:00:35.21)
Using AsyncHyperBand: num_stopped=141
Bracket: Iter 32.000: -119483.0859375 | Iter 16.000: -170994.09375 | Iter 8.000: -272068.421875 | Iter 4.000: -313209.40625 | Iter 2.000: -399594.3125 | Iter 1.000: -666974.65625
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/donokoye/Documents/Spring_23/ML_1.C51/Final_Project/outputs/raytune_result/train_and_validate_2023-05-13_10-05-23
Number of trials: 160/400 (16 PENDING, 4 RUNNING, 140 TERMINATED)
+--------------------------------+------------+----------------+--------------+-------------+----------------+------------+--------------------+----------+----------------------+
| Trial name                     | status     | loc            |   batch_size |          lr |   n_hidden_dim |   n_layers |   train_iterations |     loss |   training_iteration |
|--------------------------------+------------+----------------+--------------+-------------+------------

2023-05-13 10:05:58,712	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/train_iterations']
2023-05-13 10:05:58,713	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/t

[2m[36m(train_and_validate pid=4181)[0m [[ 13.][32m [repeated 24x across cluster][0m
[2m[36m(train_and_validate pid=4181)[0m  [122.][32m [repeated 48x across cluster][0m
[2m[36m(train_and_validate pid=4181)[0m  [ 10.]][32m [repeated 72x across cluster][0m
[2m[36m(train_and_validate pid=4181)[0m  ...[32m [repeated 24x across cluster][0m


2023-05-13 10:06:01,082	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/train_iterations']
2023-05-13 10:06:01,157	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/t

== Status ==
Current time: 2023-05-13 10:06:03 (running for 00:00:40.23)
Using AsyncHyperBand: num_stopped=155
Bracket: Iter 32.000: -114086.5546875 | Iter 16.000: -169327.8828125 | Iter 8.000: -263570.3359375 | Iter 4.000: -313568.9375 | Iter 2.000: -396003.34375 | Iter 1.000: -666946.75
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/donokoye/Documents/Spring_23/ML_1.C51/Final_Project/outputs/raytune_result/train_and_validate_2023-05-13_10-05-23
Number of trials: 175/400 (16 PENDING, 4 RUNNING, 155 TERMINATED)
+--------------------------------+------------+----------------+--------------+-------------+----------------+------------+--------------------+----------+----------------------+
| Trial name                     | status     | loc            |   batch_size |          lr |   n_hidden_dim |   n_layers |   train_iterations |     loss |   training_iteration |
|--------------------------------+------------+----------------+--------------+-------------+------------

2023-05-13 10:06:03,867	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/train_iterations']
2023-05-13 10:06:03,867	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/t

[2m[36m(train_and_validate pid=4181)[0m [[ 13.][32m [repeated 19x across cluster][0m
[2m[36m(train_and_validate pid=4181)[0m  [122.][32m [repeated 38x across cluster][0m
[2m[36m(train_and_validate pid=4181)[0m  [ 10.]][32m [repeated 57x across cluster][0m
[2m[36m(train_and_validate pid=4181)[0m  ...[32m [repeated 19x across cluster][0m


2023-05-13 10:06:06,149	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/train_iterations']
2023-05-13 10:06:06,149	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/t

== Status ==
Current time: 2023-05-13 10:06:08 (running for 00:00:45.24)
Using AsyncHyperBand: num_stopped=175
Bracket: Iter 32.000: -108690.0234375 | Iter 16.000: -169327.8828125 | Iter 8.000: -263570.3359375 | Iter 4.000: -313209.40625 | Iter 2.000: -396003.34375 | Iter 1.000: -666829.1875
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/donokoye/Documents/Spring_23/ML_1.C51/Final_Project/outputs/raytune_result/train_and_validate_2023-05-13_10-05-23
Number of trials: 195/400 (16 PENDING, 4 RUNNING, 175 TERMINATED)
+--------------------------------+------------+----------------+--------------+-------------+----------------+------------+--------------------+----------+----------------------+
| Trial name                     | status     | loc            |   batch_size |          lr |   n_hidden_dim |   n_layers |   train_iterations |     loss |   training_iteration |
|--------------------------------+------------+----------------+--------------+-------------+---------

2023-05-13 10:06:08,824	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/train_iterations']
2023-05-13 10:06:08,825	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/t

[2m[36m(train_and_validate pid=4182)[0m [[ 13.][32m [repeated 19x across cluster][0m
[2m[36m(train_and_validate pid=4182)[0m  [122.][32m [repeated 38x across cluster][0m
[2m[36m(train_and_validate pid=4182)[0m  [ 10.]][32m [repeated 57x across cluster][0m
[2m[36m(train_and_validate pid=4182)[0m  ...[32m [repeated 19x across cluster][0m


2023-05-13 10:06:11,383	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/train_iterations']
2023-05-13 10:06:11,438	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/t

== Status ==
Current time: 2023-05-13 10:06:13 (running for 00:00:50.25)
Using AsyncHyperBand: num_stopped=199
Bracket: Iter 32.000: -108690.0234375 | Iter 16.000: -167661.671875 | Iter 8.000: -265870.1875 | Iter 4.000: -315420.375 | Iter 2.000: -390472.25 | Iter 1.000: -666532.5625
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/donokoye/Documents/Spring_23/ML_1.C51/Final_Project/outputs/raytune_result/train_and_validate_2023-05-13_10-05-23
Number of trials: 219/400 (16 PENDING, 4 RUNNING, 199 TERMINATED)
+--------------------------------+------------+----------------+--------------+-------------+----------------+------------+--------------------+----------+----------------------+
| Trial name                     | status     | loc            |   batch_size |          lr |   n_hidden_dim |   n_layers |   train_iterations |     loss |   training_iteration |
|--------------------------------+------------+----------------+--------------+-------------+----------------+-

2023-05-13 10:06:13,687	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/train_iterations']
2023-05-13 10:06:13,688	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/t

[2m[36m(train_and_validate pid=4182)[0m [[ 13.][32m [repeated 29x across cluster][0m
[2m[36m(train_and_validate pid=4182)[0m  [122.][32m [repeated 58x across cluster][0m
[2m[36m(train_and_validate pid=4182)[0m  [ 10.]][32m [repeated 87x across cluster][0m
[2m[36m(train_and_validate pid=4182)[0m  ...[32m [repeated 29x across cluster][0m


2023-05-13 10:06:16,350	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/train_iterations']
2023-05-13 10:06:16,351	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/t

== Status ==
Current time: 2023-05-13 10:06:18 (running for 00:00:55.25)
Using AsyncHyperBand: num_stopped=224
Bracket: Iter 32.000: -108690.0234375 | Iter 16.000: -164946.71875 | Iter 8.000: -263570.3359375 | Iter 4.000: -314486.453125 | Iter 2.000: -393263.984375 | Iter 1.000: -666722.90625
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/donokoye/Documents/Spring_23/ML_1.C51/Final_Project/outputs/raytune_result/train_and_validate_2023-05-13_10-05-23
Number of trials: 244/400 (16 PENDING, 4 RUNNING, 224 TERMINATED)
+--------------------------------+------------+----------------+--------------+-------------+----------------+------------+--------------------+----------+----------------------+
| Trial name                     | status     | loc            |   batch_size |          lr |   n_hidden_dim |   n_layers |   train_iterations |     loss |   training_iteration |
|--------------------------------+------------+----------------+--------------+-------------+--------

2023-05-13 10:06:18,805	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/train_iterations']
2023-05-13 10:06:18,806	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/t

[2m[36m(train_and_validate pid=4182)[0m [[ 13.][32m [repeated 16x across cluster][0m
[2m[36m(train_and_validate pid=4182)[0m  [122.][32m [repeated 32x across cluster][0m
[2m[36m(train_and_validate pid=4182)[0m  [ 10.]][32m [repeated 48x across cluster][0m
[2m[36m(train_and_validate pid=4182)[0m  ...[32m [repeated 16x across cluster][0m


2023-05-13 10:06:21,552	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/train_iterations']
2023-05-13 10:06:21,634	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/t

== Status ==
Current time: 2023-05-13 10:06:23 (running for 00:01:00.26)
Using AsyncHyperBand: num_stopped=237
Bracket: Iter 32.000: -119483.0859375 | Iter 16.000: -164832.1484375 | Iter 8.000: -274694.75 | Iter 4.000: -313392.5 | Iter 2.000: -377980.75 | Iter 1.000: -666656.0
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/donokoye/Documents/Spring_23/ML_1.C51/Final_Project/outputs/raytune_result/train_and_validate_2023-05-13_10-05-23
Number of trials: 257/400 (16 PENDING, 4 RUNNING, 237 TERMINATED)
+--------------------------------+------------+----------------+--------------+-------------+----------------+------------+--------------------+----------+----------------------+
| Trial name                     | status     | loc            |   batch_size |          lr |   n_hidden_dim |   n_layers |   train_iterations |     loss |   training_iteration |
|--------------------------------+------------+----------------+--------------+-------------+----------------+-------

2023-05-13 10:06:23,732	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/train_iterations']
2023-05-13 10:06:23,733	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/t

[2m[36m(train_and_validate pid=4182)[0m [[ 13.][32m [repeated 16x across cluster][0m
[2m[36m(train_and_validate pid=4182)[0m  [122.][32m [repeated 32x across cluster][0m
[2m[36m(train_and_validate pid=4182)[0m  [ 10.]][32m [repeated 48x across cluster][0m
[2m[36m(train_and_validate pid=4182)[0m  ...[32m [repeated 16x across cluster][0m


2023-05-13 10:06:26,567	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/train_iterations']
2023-05-13 10:06:26,567	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/t

== Status ==
Current time: 2023-05-13 10:06:28 (running for 00:01:05.27)
Using AsyncHyperBand: num_stopped=254
Bracket: Iter 32.000: -114086.5546875 | Iter 16.000: -162835.2109375 | Iter 8.000: -265870.1875 | Iter 4.000: -313660.484375 | Iter 2.000: -380868.59375 | Iter 1.000: -666822.53125
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/donokoye/Documents/Spring_23/ML_1.C51/Final_Project/outputs/raytune_result/train_and_validate_2023-05-13_10-05-23
Number of trials: 274/400 (16 PENDING, 4 RUNNING, 254 TERMINATED)
+--------------------------------+------------+----------------+--------------+-------------+----------------+------------+--------------------+----------+----------------------+
| Trial name                     | status     | loc            |   batch_size |          lr |   n_hidden_dim |   n_layers |   train_iterations |     loss |   training_iteration |
|--------------------------------+------------+----------------+--------------+-------------+----------

2023-05-13 10:06:28,699	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/train_iterations']
2023-05-13 10:06:28,700	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/t

[2m[36m(train_and_validate pid=4181)[0m [[ 13.][32m [repeated 11x across cluster][0m
[2m[36m(train_and_validate pid=4181)[0m  [122.][32m [repeated 22x across cluster][0m
[2m[36m(train_and_validate pid=4181)[0m  [ 10.]][32m [repeated 33x across cluster][0m
[2m[36m(train_and_validate pid=4181)[0m  ...[32m [repeated 11x across cluster][0m


2023-05-13 10:06:31,895	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/train_iterations']
2023-05-13 10:06:31,901	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/t

== Status ==
Current time: 2023-05-13 10:06:33 (running for 00:01:10.27)
Using AsyncHyperBand: num_stopped=272
Bracket: Iter 32.000: -108162.921875 | Iter 16.000: -160952.84375 | Iter 8.000: -263570.3359375 | Iter 4.000: -313660.484375 | Iter 2.000: -390472.25 | Iter 1.000: -666829.1875
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/donokoye/Documents/Spring_23/ML_1.C51/Final_Project/outputs/raytune_result/train_and_validate_2023-05-13_10-05-23
Number of trials: 289/400 (16 PENDING, 4 RUNNING, 269 TERMINATED)
+--------------------------------+------------+----------------+--------------+-------------+----------------+------------+--------------------+----------+----------------------+
| Trial name                     | status     | loc            |   batch_size |          lr |   n_hidden_dim |   n_layers |   train_iterations |     loss |   training_iteration |
|--------------------------------+------------+----------------+--------------+-------------+--------------

2023-05-13 10:06:33,872	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/train_iterations']
2023-05-13 10:06:34,035	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/t

[2m[36m(train_and_validate pid=4183)[0m [[ 13.][32m [repeated 32x across cluster][0m
[2m[36m(train_and_validate pid=4183)[0m  [122.][32m [repeated 64x across cluster][0m
[2m[36m(train_and_validate pid=4183)[0m  [ 10.]][32m [repeated 96x across cluster][0m
[2m[36m(train_and_validate pid=4183)[0m  ...[32m [repeated 32x across cluster][0m


2023-05-13 10:06:37,059	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/train_iterations']
2023-05-13 10:06:37,059	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/t

== Status ==
Current time: 2023-05-13 10:06:38 (running for 00:01:15.29)
Using AsyncHyperBand: num_stopped=299
Bracket: Iter 32.000: -108690.0234375 | Iter 16.000: -160952.84375 | Iter 8.000: -263570.3359375 | Iter 4.000: -313392.5 | Iter 2.000: -396003.34375 | Iter 1.000: -666831.5
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/donokoye/Documents/Spring_23/ML_1.C51/Final_Project/outputs/raytune_result/train_and_validate_2023-05-13_10-05-23
Number of trials: 319/400 (16 PENDING, 4 RUNNING, 299 TERMINATED)
+--------------------------------+------------+----------------+--------------+-------------+----------------+------------+--------------------+----------+----------------------+
| Trial name                     | status     | loc            |   batch_size |          lr |   n_hidden_dim |   n_layers |   train_iterations |     loss |   training_iteration |
|--------------------------------+------------+----------------+--------------+-------------+----------------+-

2023-05-13 10:06:38,748	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/train_iterations']
2023-05-13 10:06:38,769	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/t

[2m[36m(train_and_validate pid=4182)[0m [[ 13.][32m [repeated 18x across cluster][0m
[2m[36m(train_and_validate pid=4182)[0m  [122.][32m [repeated 36x across cluster][0m
[2m[36m(train_and_validate pid=4182)[0m  [ 10.]][32m [repeated 54x across cluster][0m
[2m[36m(train_and_validate pid=4182)[0m  ...[32m [repeated 18x across cluster][0m


2023-05-13 10:06:42,503	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/train_iterations']
2023-05-13 10:06:42,565	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/t

== Status ==
Current time: 2023-05-13 10:06:43 (running for 00:01:20.30)
Using AsyncHyperBand: num_stopped=310
Bracket: Iter 32.000: -108162.921875 | Iter 16.000: -155389.5 | Iter 8.000: -263570.3359375 | Iter 4.000: -312892.8125 | Iter 2.000: -398633.15625 | Iter 1.000: -666829.1875
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/donokoye/Documents/Spring_23/ML_1.C51/Final_Project/outputs/raytune_result/train_and_validate_2023-05-13_10-05-23
Number of trials: 330/400 (16 PENDING, 4 RUNNING, 310 TERMINATED)
+--------------------------------+------------+----------------+--------------+-------------+----------------+------------+--------------------+----------+----------------------+
| Trial name                     | status     | loc            |   batch_size |          lr |   n_hidden_dim |   n_layers |   train_iterations |     loss |   training_iteration |
|--------------------------------+------------+----------------+--------------+-------------+----------------+

2023-05-13 10:06:43,730	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/train_iterations']
2023-05-13 10:06:43,751	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/t

[2m[36m(train_and_validate pid=4181)[0m [[ 13.][32m [repeated 10x across cluster][0m
[2m[36m(train_and_validate pid=4181)[0m  [122.][32m [repeated 20x across cluster][0m
[2m[36m(train_and_validate pid=4181)[0m  [ 10.]][32m [repeated 30x across cluster][0m
[2m[36m(train_and_validate pid=4181)[0m  ...[32m [repeated 10x across cluster][0m


2023-05-13 10:06:47,489	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/train_iterations']
2023-05-13 10:06:47,490	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/t

== Status ==
Current time: 2023-05-13 10:06:48 (running for 00:01:25.31)
Using AsyncHyperBand: num_stopped=321
Bracket: Iter 32.000: -108162.921875 | Iter 16.000: -155389.5 | Iter 8.000: -261270.484375 | Iter 4.000: -313209.40625 | Iter 2.000: -397318.25 | Iter 1.000: -666789.8125
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/donokoye/Documents/Spring_23/ML_1.C51/Final_Project/outputs/raytune_result/train_and_validate_2023-05-13_10-05-23
Number of trials: 341/400 (16 PENDING, 4 RUNNING, 321 TERMINATED)
+--------------------------------+------------+----------------+--------------+-------------+----------------+------------+--------------------+----------+----------------------+
| Trial name                     | status     | loc            |   batch_size |          lr |   n_hidden_dim |   n_layers |   train_iterations |     loss |   training_iteration |
|--------------------------------+------------+----------------+--------------+-------------+----------------+---

2023-05-13 10:06:48,740	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/train_iterations']
2023-05-13 10:06:48,740	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/t

[2m[36m(train_and_validate pid=4183)[0m [[ 13.][32m [repeated 24x across cluster][0m
[2m[36m(train_and_validate pid=4183)[0m  [122.][32m [repeated 48x across cluster][0m
[2m[36m(train_and_validate pid=4183)[0m  [ 10.]][32m [repeated 72x across cluster][0m
[2m[36m(train_and_validate pid=4183)[0m  ...[32m [repeated 24x across cluster][0m


2023-05-13 10:06:52,705	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/train_iterations']
2023-05-13 10:06:52,706	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/t

== Status ==
Current time: 2023-05-13 10:06:53 (running for 00:01:30.32)
Using AsyncHyperBand: num_stopped=343
Bracket: Iter 32.000: -108162.921875 | Iter 16.000: -152048.65625 | Iter 8.000: -261270.484375 | Iter 4.000: -313209.40625 | Iter 2.000: -390524.625 | Iter 1.000: -666826.4375
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/donokoye/Documents/Spring_23/ML_1.C51/Final_Project/outputs/raytune_result/train_and_validate_2023-05-13_10-05-23
Number of trials: 362/400 (16 PENDING, 4 RUNNING, 342 TERMINATED)
+--------------------------------+------------+----------------+--------------+-------------+----------------+------------+--------------------+----------+----------------------+
| Trial name                     | status     | loc            |   batch_size |          lr |   n_hidden_dim |   n_layers |   train_iterations |     loss |   training_iteration |
|--------------------------------+------------+----------------+--------------+-------------+---------------

2023-05-13 10:06:53,869	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/train_iterations']
2023-05-13 10:06:53,889	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/t

[2m[36m(train_and_validate pid=4182)[0m [[ 13.][32m [repeated 23x across cluster][0m
[2m[36m(train_and_validate pid=4182)[0m  [122.][32m [repeated 46x across cluster][0m
[2m[36m(train_and_validate pid=4182)[0m  [ 10.]][32m [repeated 69x across cluster][0m
[2m[36m(train_and_validate pid=4182)[0m  ...[32m [repeated 23x across cluster][0m


2023-05-13 10:06:57,906	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/train_iterations']
2023-05-13 10:06:58,061	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/t

== Status ==
Current time: 2023-05-13 10:06:58 (running for 00:01:35.34)
Using AsyncHyperBand: num_stopped=370
Bracket: Iter 32.000: -108690.0234375 | Iter 16.000: -155389.5 | Iter 8.000: -274694.75 | Iter 4.000: -312640.40625 | Iter 2.000: -393263.984375 | Iter 1.000: -666829.1875
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/donokoye/Documents/Spring_23/ML_1.C51/Final_Project/outputs/raytune_result/train_and_validate_2023-05-13_10-05-23
Number of trials: 389/400 (16 PENDING, 4 RUNNING, 369 TERMINATED)
+--------------------------------+------------+----------------+--------------+-------------+----------------+------------+--------------------+----------+----------------------+
| Trial name                     | status     | loc            |   batch_size |          lr |   n_hidden_dim |   n_layers |   train_iterations |     loss |   training_iteration |
|--------------------------------+------------+----------------+--------------+-------------+----------------+--

2023-05-13 10:06:58,948	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/train_iterations']
2023-05-13 10:06:59,010	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/t

[2m[36m(train_and_validate pid=4181)[0m [[ 13.][32m [repeated 21x across cluster][0m
[2m[36m(train_and_validate pid=4181)[0m  [122.][32m [repeated 42x across cluster][0m
[2m[36m(train_and_validate pid=4181)[0m  [ 10.]][32m [repeated 63x across cluster][0m
[2m[36m(train_and_validate pid=4181)[0m  ...[32m [repeated 21x across cluster][0m


2023-05-13 10:07:03,497	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/train_iterations']
2023-05-13 10:07:03,587	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/t

== Status ==
Current time: 2023-05-13 10:07:03 (running for 00:01:40.34)
Using AsyncHyperBand: num_stopped=387
Bracket: Iter 32.000: -108690.0234375 | Iter 16.000: -158730.34375 | Iter 8.000: -274694.75 | Iter 4.000: -311112.875 | Iter 2.000: -390472.25 | Iter 1.000: -666831.5
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/donokoye/Documents/Spring_23/ML_1.C51/Final_Project/outputs/raytune_result/train_and_validate_2023-05-13_10-05-23
Number of trials: 400/400 (9 PENDING, 4 RUNNING, 387 TERMINATED)
+--------------------------------+------------+----------------+--------------+-------------+----------------+------------+--------------------+----------+----------------------+
| Trial name                     | status     | loc            |   batch_size |          lr |   n_hidden_dim |   n_layers |   train_iterations |     loss |   training_iteration |
|--------------------------------+------------+----------------+--------------+-------------+----------------+--------

2023-05-13 10:07:03,790	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/train_iterations']
2023-05-13 10:07:03,839	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/t

== Status ==
Current time: 2023-05-13 10:07:08 (running for 00:01:45.40)
Using AsyncHyperBand: num_stopped=399
Bracket: Iter 32.000: -110037.31640625 | Iter 16.000: -159619.828125 | Iter 8.000: -274082.390625 | Iter 4.000: -310897.8125 | Iter 2.000: -390498.4375 | Iter 1.000: -666802.84375
Logical resource usage: 2.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/donokoye/Documents/Spring_23/ML_1.C51/Final_Project/outputs/raytune_result/train_and_validate_2023-05-13_10-05-23
Number of trials: 400/400 (1 RUNNING, 399 TERMINATED)
+--------------------------------+------------+----------------+--------------+-------------+----------------+------------+--------------------+----------+----------------------+
| Trial name                     | status     | loc            |   batch_size |          lr |   n_hidden_dim |   n_layers |   train_iterations |     loss |   training_iteration |
|--------------------------------+------------+----------------+--------------+-------------+----------------+------

2023-05-13 10:07:08,987	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/train_iterations']
2023-05-13 10:07:08,988	ERROR checkpoint_manager.py:361 -- Result dict has no key: validation_loss. checkpoint_score_attr must be set to a key in the result dict. Valid keys are: ['loss', 'time_this_iter_s', 'should_checkpoint', 'done', 'training_iteration', 'trial_id', 'date', 'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore', 'iterations_since_restore', 'experiment_tag', 'config/n_layers', 'config/n_hidden_dim', 'config/lr', 'config/batch_size', 'config/t

== Status ==
Current time: 2023-05-13 10:07:09 (running for 00:01:45.97)
Using AsyncHyperBand: num_stopped=400
Bracket: Iter 32.000: -110037.31640625 | Iter 16.000: -160509.3125 | Iter 8.000: -274082.390625 | Iter 4.000: -310897.8125 | Iter 2.000: -390498.4375 | Iter 1.000: -666802.84375
Logical resource usage: 0/8 CPUs, 0/0 GPUs
Result logdir: /Users/donokoye/Documents/Spring_23/ML_1.C51/Final_Project/outputs/raytune_result/train_and_validate_2023-05-13_10-05-23
Number of trials: 400/400 (400 TERMINATED)
+--------------------------------+------------+----------------+--------------+-------------+----------------+------------+--------------------+----------+----------------------+
| Trial name                     | status     | loc            |   batch_size |          lr |   n_hidden_dim |   n_layers |   train_iterations |     loss |   training_iteration |
|--------------------------------+------------+----------------+--------------+-------------+----------------+------------+--------