<!-- ---
title: Hyperparameter Tuning in Ignite
date: 2021-09-28
downloads: true
sidebar: true
tags:
  - hyperparameter tuning
  - ray tune
  - optuna
  - ax
--- -->

# Hyperparameter Tuning in Ignite

In this tutorial, we will see how [Ray Tune](https://docs.ray.io/en/stable/tune.html) can be used with Ignite for hyperparameter tuning. We will also compare it with other frameworks like [Optuna](https://optuna.org/) and [Ax](https://ax.dev/) for hyperparameter optimization.

<!--more-->

We will follow [this PyTorch tutorial](https://pytorch.org/tutorials/beginner/hyperparameter_tuning_tutorial.html) for reference.

In this example, we will train a small convolutional network on the [CIFAR10](https://pytorch.org/vision/stable/datasets.html#torchvision.datasets.CIFAR10) dataset, the same model and data used in the PyTorch tutorial linked above. The Ignite training loop follows the same pattern as the [Getting Started Guide](https://pytorch-ignite.ai/tutorials/getting-started/).

## Required Dependencies

In [None]:
!pip install pytorch-ignite
!pip install ray

## Imports

In [2]:
import os
from functools import partial
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import random_split

from torch.utils.data import DataLoader, SubsetRandomSampler
from torchvision.datasets import CIFAR10
from torchvision.models import resnet18
from torchvision.transforms import Compose, Normalize, ToTensor

from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator
from ignite.metrics import Accuracy, Loss, RunningAverage
import ignite.distributed as idist

from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler

In [3]:
def load_data(data_dir="./data"):
    """Build the CIFAR10 training and test datasets rooted at ``data_dir``.

    Both splits are requested with ``download=True`` and share one
    preprocessing pipeline: conversion to a tensor followed by per-channel
    normalization with mean 0.5 and standard deviation 0.5.

    Returns:
        A ``(trainset, testset)`` tuple.
    """
    channel_mean = (0.5, 0.5, 0.5)
    channel_std = (0.5, 0.5, 0.5)
    preprocessing = Compose([ToTensor(), Normalize(channel_mean, channel_std)])

    # Training split first, then the held-out test split.
    trainset, testset = (
        CIFAR10(root=data_dir, train=is_train, download=True, transform=preprocessing)
        for is_train in (True, False)
    )
    return trainset, testset

In [4]:
class Net(nn.Module):
    """Small CNN: two conv + max-pool stages followed by three linear layers.

    ``l1`` and ``l2`` are the widths of the two hidden fully connected
    layers (the values being tuned); the last layer always emits 10 outputs.
    """

    def __init__(self, l1=120, l2=84):
        super().__init__()
        # Size of the flattened feature map fed to the first linear layer.
        flat_features = 16 * 5 * 5

        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(flat_features, l1)
        self.fc2 = nn.Linear(l1, l2)
        self.fc3 = nn.Linear(l2, 10)

    def forward(self, x):
        # Feature extractor: conv -> ReLU -> pool, applied twice.
        for conv in (self.conv1, self.conv2):
            x = self.pool(F.relu(conv(x)))

        # Flatten to (-1, 16 * 5 * 5) before the fully connected head.
        x = x.view(-1, self.fc1.in_features)

        for hidden in (self.fc1, self.fc2):
            x = F.relu(hidden(x))
        return self.fc3(x)

In [13]:
def train_cifar(config, checkpoint_dir=None, data_dir=None):
    """Train ``Net`` on CIFAR10 for a single Ray Tune trial.

    After every epoch the model is evaluated on a held-out 20% of the
    training set, a checkpoint is written through ``tune.checkpoint_dir``
    and the validation loss/accuracy are reported with ``tune.report``.

    Args:
        config: Trial hyperparameters; the keys ``"l1"``, ``"l2"``, ``"lr"``
            and ``"batch_size"`` are read.
        checkpoint_dir: Directory holding a ``"checkpoint"`` file with a
            ``(model_state, optimizer_state)`` tuple to restore from, or a
            falsy value to start from scratch.
        data_dir: Dataset root forwarded to ``load_data``. When ``None``,
            ``load_data`` is called with its own default root.
    """
    net = idist.auto_model(Net(config["l1"], config["l2"]))

    device = idist.device()

    criterion = nn.CrossEntropyLoss()
    optimizer = idist.auto_optim(optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9))

    trainer = create_supervised_trainer(net, optimizer, criterion, device=device, non_blocking=True)
    val_evaluator = create_supervised_evaluator(
        net,
        metrics={"accuracy": Accuracy(), "loss": Loss(criterion)},
        device=device,
        non_blocking=True,
    )

    if checkpoint_dir:
        # map_location lets a checkpoint written on one device be restored on
        # whatever device this trial is currently running on.
        model_state, optimizer_state = torch.load(
            os.path.join(checkpoint_dir, "checkpoint"), map_location=device)
        net.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    # Forwarding data_dir=None would override load_data's default root, so
    # fall back to the default explicitly. The test split is not used here.
    trainset, _ = load_data(data_dir) if data_dir is not None else load_data()

    # 80/20 train/validation split of the training set.
    test_abs = int(len(trainset) * 0.8)
    train_subset, val_subset = random_split(
        trainset, [test_abs, len(trainset) - test_abs])

    trainloader = idist.auto_dataloader(
        train_subset,
        batch_size=int(config["batch_size"]),
        shuffle=True,
        num_workers=8)
    valloader = idist.auto_dataloader(
        val_subset,
        batch_size=int(config["batch_size"]),
        shuffle=True,
        num_workers=8)

    # The trainer's output is fed to the running average unchanged.
    avg_output = RunningAverage(output_transform=lambda x: x)
    avg_output.attach(trainer, 'running_avg_loss')

    @trainer.on(Events.ITERATION_COMPLETED(every=2000))
    def log_training_loss(engine):
        print(f"Epoch[{engine.state.epoch}], Iter[{engine.state.iteration}] Loss: {engine.state.output:.2f} Running Avg Loss: {engine.state.metrics['running_avg_loss']:.2f}")

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(trainer):
        val_evaluator.run(valloader)
        metrics = val_evaluator.state.metrics
        print(f"Validation Results - Epoch[{trainer.state.epoch}] Avg accuracy: {metrics['accuracy']:.2f} Avg loss: {metrics['loss']:.2f}")

        # Save the checkpoint before reporting the metrics for this epoch.
        with tune.checkpoint_dir(trainer.state.epoch) as epoch_checkpoint_dir:
            path = os.path.join(epoch_checkpoint_dir, "checkpoint")
            torch.save((net.state_dict(), optimizer.state_dict()), path)
        tune.report(loss=metrics['loss'], accuracy=metrics['accuracy'])

    trainer.run(trainloader, max_epochs=10)

    print("Finished Training")

In [14]:
def main(num_samples=10, max_num_epochs=10, gpus_per_trial=1):
    """Run the Ray Tune search and evaluate the best trial on the test set.

    Args:
        num_samples: Number of hyperparameter configurations to sample.
        max_num_epochs: ``max_t`` given to the ASHA scheduler.
        gpus_per_trial: GPUs requested for each trial.
    """
    data_dir = os.path.abspath("./data")
    # Loading here triggers the download before any trial starts; the test
    # split is kept for the final evaluation below.
    _, testset = load_data(data_dir)

    config = {
        "l1": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        "l2": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([2, 4, 8, 16])
    }
    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2)
    reporter = CLIReporter(
        # parameter_columns=["l1", "l2", "lr", "batch_size"],
        metric_columns=["loss", "accuracy", "training_iteration"])
    result = tune.run(
        partial(train_cifar, data_dir=data_dir),
        resources_per_trial={"cpu": 2, "gpu": gpus_per_trial},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter)

    # Best trial = lowest validation loss at its last reported result.
    best_trial = result.get_best_trial("loss", "min", "last")
    print(f"Best trial config: {best_trial.config}")
    print(f"Best trial final validation loss: {best_trial.last_result['loss']}")
    print(f"Best trial final validation accuracy: {best_trial.last_result['accuracy']}")

    best_trained_model = idist.auto_model(Net(best_trial.config["l1"], best_trial.config["l2"]))
    device = idist.device()

    best_checkpoint_dir = best_trial.checkpoint.value
    # The checkpoint was written inside a trial process; map_location makes
    # the tensors load onto the device used by this process. Only the model
    # weights are needed for evaluation, so the optimizer state is dropped.
    model_state, _ = torch.load(
        os.path.join(best_checkpoint_dir, "checkpoint"), map_location=device)
    best_trained_model.load_state_dict(model_state)

    test_evaluator = create_supervised_evaluator(
        best_trained_model, metrics={"Accuracy": Accuracy()}, device=device, non_blocking=True)

    testloader = idist.auto_dataloader(testset, batch_size=4, shuffle=False, num_workers=2)

    test_evaluator.run(testloader)
    print(test_evaluator.state.metrics)

In [15]:
# Short demo run: sample 3 configurations, cap the ASHA scheduler at 3 epochs
# and request 1 GPU per trial.
main(num_samples=3, max_num_epochs=3, gpus_per_trial=1)

Files already downloaded and verified
Files already downloaded and verified


2021-10-01 11:50:00,379	INFO registry.py:67 -- Detected unknown callable for trainable. Converting to class.


== Status ==
Memory usage on this node: 2.2/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 2.000: None | Iter 1.000: None
Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/7.33 GiB heap, 0.0/3.67 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/DEFAULT_2021-10-01_11-50-00
Number of trials: 3/3 (3 PENDING)
+---------------------+----------+-------+--------------+------+------+------------+
| Trial name          | status   | loc   |   batch_size |   l1 |   l2 |         lr |
|---------------------+----------+-------+--------------+------+------+------------|
| DEFAULT_b50ea_00000 | PENDING  |       |            8 |   16 |    4 | 0.00059636 |
| DEFAULT_b50ea_00001 | PENDING  |       |            2 |   32 |   64 | 0.00074678 |
| DEFAULT_b50ea_00002 | PENDING  |       |            2 |   64 |   32 | 0.00122324 |
+---------------------+----------+-------+--------------+------+------+------------+


[2m[36m(pid=1853)[0m Files already downloaded and verified


[2m[36m(pid=1853)[0m 2021-10-01 11:50:06,682 ignite.distributed.auto.auto_dataloader INFO: Use data loader kwargs for dataset '<torch.utils.data.da': 
[2m[36m(pid=1853)[0m 	{'batch_size': 8, 'shuffle': True, 'num_workers': 8, 'pin_memory': True}
[2m[36m(pid=1853)[0m   cpuset_checked))
[2m[36m(pid=1853)[0m 2021-10-01 11:50:06,683 ignite.distributed.auto.auto_dataloader INFO: Use data loader kwargs for dataset '<torch.utils.data.da': 
[2m[36m(pid=1853)[0m 	{'batch_size': 8, 'shuffle': True, 'num_workers': 8, 'pin_memory': True}
[2m[36m(pid=1853)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


[2m[36m(pid=1853)[0m Epoch[1], Iter[2000] Loss: 2.31 Running Avg Loss: 2.31
[2m[36m(pid=1853)[0m Epoch[1], Iter[4000] Loss: 2.31 Running Avg Loss: 2.30




Result for DEFAULT_b50ea_00000:
  accuracy: 0.1312
  date: 2021-10-01_11-50-46
  done: false
  experiment_id: e32295dd189a4a47a6018a23aa66c38e
  hostname: ac982acc27c2
  iterations_since_restore: 1
  loss: 2.3001421875
  node_ip: 172.28.0.2
  pid: 1853
  should_checkpoint: true
  time_since_restore: 44.10174560546875
  time_this_iter_s: 44.10174560546875
  time_total_s: 44.10174560546875
  timestamp: 1633089046
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: b50ea_00000
  
== Status ==
Memory usage on this node: 3.4/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 2.000: None | Iter 1.000: -2.3001421875
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.33 GiB heap, 0.0/3.67 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/DEFAULT_2021-10-01_11-50-00
Number of trials: 3/3 (2 PENDING, 1 RUNNING)
+---------------------+----------+-----------------+--------------+------+------+------------+---------+------------+----------------------+



[2m[36m(pid=1853)[0m Epoch[2], Iter[6000] Loss: 2.30 Running Avg Loss: 2.29
[2m[36m(pid=1853)[0m Epoch[2], Iter[8000] Loss: 2.26 Running Avg Loss: 2.21
[2m[36m(pid=1853)[0m Epoch[2], Iter[10000] Loss: 2.52 Running Avg Loss: 2.08




Result for DEFAULT_b50ea_00000:
  accuracy: 0.2316
  date: 2021-10-01_11-51-27
  done: false
  experiment_id: e32295dd189a4a47a6018a23aa66c38e
  hostname: ac982acc27c2
  iterations_since_restore: 2
  loss: 2.0590619140625
  node_ip: 172.28.0.2
  pid: 1853
  should_checkpoint: true
  time_since_restore: 84.69665455818176
  time_this_iter_s: 40.59490895271301
  time_total_s: 84.69665455818176
  timestamp: 1633089087
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: b50ea_00000
  
== Status ==
Memory usage on this node: 3.4/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 2.000: -2.0590619140625 | Iter 1.000: -2.3001421875
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.33 GiB heap, 0.0/3.67 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/DEFAULT_2021-10-01_11-50-00
Number of trials: 3/3 (2 PENDING, 1 RUNNING)
+---------------------+----------+-----------------+--------------+------+------+------------+---------+------------+--------



[2m[36m(pid=1853)[0m Epoch[3], Iter[12000] Loss: 1.76 Running Avg Loss: 1.82
[2m[36m(pid=1853)[0m Epoch[3], Iter[14000] Loss: 1.76 Running Avg Loss: 1.83




Result for DEFAULT_b50ea_00000:
  accuracy: 0.3351
  date: 2021-10-01_11-52-08
  done: true
  experiment_id: e32295dd189a4a47a6018a23aa66c38e
  hostname: ac982acc27c2
  iterations_since_restore: 3
  loss: 1.724516796875
  node_ip: 172.28.0.2
  pid: 1853
  should_checkpoint: true
  time_since_restore: 125.52060723304749
  time_this_iter_s: 40.82395267486572
  time_total_s: 125.52060723304749
  timestamp: 1633089128
  timesteps_since_restore: 0
  training_iteration: 3
  trial_id: b50ea_00000
  
== Status ==
Memory usage on this node: 3.4/12.7 GiB
Using AsyncHyperBand: num_stopped=1
Bracket: Iter 2.000: -2.0590619140625 | Iter 1.000: -2.3001421875
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.33 GiB heap, 0.0/3.67 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/DEFAULT_2021-10-01_11-50-00
Number of trials: 3/3 (2 PENDING, 1 RUNNING)
+---------------------+----------+-----------------+--------------+------+------+------------+---------+------------+--------

[2m[36m(pid=2098)[0m 2021-10-01 11:52:14,024 ignite.distributed.auto.auto_dataloader INFO: Use data loader kwargs for dataset '<torch.utils.data.da': 
[2m[36m(pid=2098)[0m 	{'batch_size': 2, 'shuffle': True, 'num_workers': 8, 'pin_memory': True}
[2m[36m(pid=2098)[0m   cpuset_checked))
[2m[36m(pid=2098)[0m 2021-10-01 11:52:14,025 ignite.distributed.auto.auto_dataloader INFO: Use data loader kwargs for dataset '<torch.utils.data.da': 
[2m[36m(pid=2098)[0m 	{'batch_size': 2, 'shuffle': True, 'num_workers': 8, 'pin_memory': True}
[2m[36m(pid=2098)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


[2m[36m(pid=2098)[0m Epoch[1], Iter[2000] Loss: 2.23 Running Avg Loss: 2.19
[2m[36m(pid=2098)[0m Epoch[1], Iter[4000] Loss: 1.13 Running Avg Loss: 1.89
[2m[36m(pid=2098)[0m Epoch[1], Iter[6000] Loss: 1.89 Running Avg Loss: 1.75
[2m[36m(pid=2098)[0m Epoch[1], Iter[8000] Loss: 2.85 Running Avg Loss: 1.65
[2m[36m(pid=2098)[0m Epoch[1], Iter[10000] Loss: 1.28 Running Avg Loss: 1.58
[2m[36m(pid=2098)[0m Epoch[1], Iter[12000] Loss: 3.34 Running Avg Loss: 1.71
[2m[36m(pid=2098)[0m Epoch[1], Iter[14000] Loss: 1.81 Running Avg Loss: 1.57
[2m[36m(pid=2098)[0m Epoch[1], Iter[16000] Loss: 1.26 Running Avg Loss: 1.40
[2m[36m(pid=2098)[0m Epoch[1], Iter[18000] Loss: 0.86 Running Avg Loss: 1.56
[2m[36m(pid=2098)[0m Epoch[1], Iter[20000] Loss: 1.49 Running Avg Loss: 1.38




Result for DEFAULT_b50ea_00001:
  accuracy: 0.4689
  date: 2021-10-01_11-54-20
  done: false
  experiment_id: 036f604837ca47d3bd6fde124228b865
  hostname: ac982acc27c2
  iterations_since_restore: 1
  loss: 1.45748916015625
  node_ip: 172.28.0.2
  pid: 2098
  should_checkpoint: true
  time_since_restore: 130.6066439151764
  time_this_iter_s: 130.6066439151764
  time_total_s: 130.6066439151764
  timestamp: 1633089260
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: b50ea_00001
  
== Status ==
Memory usage on this node: 3.5/12.7 GiB
Using AsyncHyperBand: num_stopped=1
Bracket: Iter 2.000: -2.0590619140625 | Iter 1.000: -1.8788156738281252
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.33 GiB heap, 0.0/3.67 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/DEFAULT_2021-10-01_11-50-00
Number of trials: 3/3 (1 PENDING, 1 RUNNING, 1 TERMINATED)
+---------------------+------------+-----------------+--------------+------+------+------------+--------



[2m[36m(pid=2098)[0m Epoch[2], Iter[22000] Loss: 1.37 Running Avg Loss: 1.46
[2m[36m(pid=2098)[0m Epoch[2], Iter[24000] Loss: 0.96 Running Avg Loss: 1.35
[2m[36m(pid=2098)[0m Epoch[2], Iter[26000] Loss: 1.13 Running Avg Loss: 1.40
[2m[36m(pid=2098)[0m Epoch[2], Iter[28000] Loss: 1.63 Running Avg Loss: 1.45
[2m[36m(pid=2098)[0m Epoch[2], Iter[30000] Loss: 1.68 Running Avg Loss: 1.51
[2m[36m(pid=2098)[0m Epoch[2], Iter[32000] Loss: 2.97 Running Avg Loss: 1.41
[2m[36m(pid=2098)[0m Epoch[2], Iter[34000] Loss: 1.36 Running Avg Loss: 1.42
[2m[36m(pid=2098)[0m Epoch[2], Iter[36000] Loss: 2.18 Running Avg Loss: 1.39
[2m[36m(pid=2098)[0m Epoch[2], Iter[38000] Loss: 1.48 Running Avg Loss: 1.22
[2m[36m(pid=2098)[0m Epoch[2], Iter[40000] Loss: 1.10 Running Avg Loss: 1.29




Result for DEFAULT_b50ea_00001:
  accuracy: 0.5089
  date: 2021-10-01_11-56-27
  done: false
  experiment_id: 036f604837ca47d3bd6fde124228b865
  hostname: ac982acc27c2
  iterations_since_restore: 2
  loss: 1.36539609375
  node_ip: 172.28.0.2
  pid: 2098
  should_checkpoint: true
  time_since_restore: 257.20395398139954
  time_this_iter_s: 126.59731006622314
  time_total_s: 257.20395398139954
  timestamp: 1633089387
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: b50ea_00001
  
== Status ==
Memory usage on this node: 3.5/12.7 GiB
Using AsyncHyperBand: num_stopped=1
Bracket: Iter 2.000: -1.7122290039062502 | Iter 1.000: -1.8788156738281252
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.33 GiB heap, 0.0/3.67 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/DEFAULT_2021-10-01_11-50-00
Number of trials: 3/3 (1 PENDING, 1 RUNNING, 1 TERMINATED)
+---------------------+------------+-----------------+--------------+------+------+------------+-----



[2m[36m(pid=2098)[0m Epoch[3], Iter[42000] Loss: 2.02 Running Avg Loss: 1.34
[2m[36m(pid=2098)[0m Epoch[3], Iter[44000] Loss: 1.41 Running Avg Loss: 1.26
[2m[36m(pid=2098)[0m Epoch[3], Iter[46000] Loss: 0.31 Running Avg Loss: 1.31
[2m[36m(pid=2098)[0m Epoch[3], Iter[48000] Loss: 1.21 Running Avg Loss: 1.43
[2m[36m(pid=2098)[0m Epoch[3], Iter[50000] Loss: 1.30 Running Avg Loss: 1.40
[2m[36m(pid=2098)[0m Epoch[3], Iter[52000] Loss: 0.17 Running Avg Loss: 1.29
[2m[36m(pid=2098)[0m Epoch[3], Iter[54000] Loss: 1.23 Running Avg Loss: 1.21
[2m[36m(pid=2098)[0m Epoch[3], Iter[56000] Loss: 0.61 Running Avg Loss: 1.47
[2m[36m(pid=2098)[0m Epoch[3], Iter[58000] Loss: 1.67 Running Avg Loss: 1.35
[2m[36m(pid=2098)[0m Epoch[3], Iter[60000] Loss: 1.83 Running Avg Loss: 1.30




Result for DEFAULT_b50ea_00001:
  accuracy: 0.5269
  date: 2021-10-01_11-58-34
  done: true
  experiment_id: 036f604837ca47d3bd6fde124228b865
  hostname: ac982acc27c2
  iterations_since_restore: 3
  loss: 1.315916015625
  node_ip: 172.28.0.2
  pid: 2098
  should_checkpoint: true
  time_since_restore: 384.62691617012024
  time_this_iter_s: 127.4229621887207
  time_total_s: 384.62691617012024
  timestamp: 1633089514
  timesteps_since_restore: 0
  training_iteration: 3
  trial_id: b50ea_00001
  
== Status ==
Memory usage on this node: 3.4/12.7 GiB
Using AsyncHyperBand: num_stopped=2
Bracket: Iter 2.000: -1.7122290039062502 | Iter 1.000: -1.8788156738281252
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.33 GiB heap, 0.0/3.67 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/DEFAULT_2021-10-01_11-50-00
Number of trials: 3/3 (1 PENDING, 1 RUNNING, 1 TERMINATED)
+---------------------+------------+-----------------+--------------+------+------+------------+------

[2m[36m(pid=2361)[0m 2021-10-01 11:58:40,529 ignite.distributed.auto.auto_dataloader INFO: Use data loader kwargs for dataset '<torch.utils.data.da': 
[2m[36m(pid=2361)[0m 	{'batch_size': 2, 'shuffle': True, 'num_workers': 8, 'pin_memory': True}
[2m[36m(pid=2361)[0m   cpuset_checked))
[2m[36m(pid=2361)[0m 2021-10-01 11:58:40,530 ignite.distributed.auto.auto_dataloader INFO: Use data loader kwargs for dataset '<torch.utils.data.da': 
[2m[36m(pid=2361)[0m 	{'batch_size': 2, 'shuffle': True, 'num_workers': 8, 'pin_memory': True}
[2m[36m(pid=2361)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


[2m[36m(pid=2361)[0m Epoch[1], Iter[2000] Loss: 1.55 Running Avg Loss: 2.09
[2m[36m(pid=2361)[0m Epoch[1], Iter[4000] Loss: 1.92 Running Avg Loss: 1.97
[2m[36m(pid=2361)[0m Epoch[1], Iter[6000] Loss: 2.98 Running Avg Loss: 1.67
[2m[36m(pid=2361)[0m Epoch[1], Iter[8000] Loss: 2.05 Running Avg Loss: 1.60
[2m[36m(pid=2361)[0m Epoch[1], Iter[10000] Loss: 1.80 Running Avg Loss: 1.75
[2m[36m(pid=2361)[0m Epoch[1], Iter[12000] Loss: 2.18 Running Avg Loss: 1.62
[2m[36m(pid=2361)[0m Epoch[1], Iter[14000] Loss: 1.39 Running Avg Loss: 1.54
[2m[36m(pid=2361)[0m Epoch[1], Iter[16000] Loss: 1.25 Running Avg Loss: 1.55
[2m[36m(pid=2361)[0m Epoch[1], Iter[18000] Loss: 1.56 Running Avg Loss: 1.63
[2m[36m(pid=2361)[0m Epoch[1], Iter[20000] Loss: 2.59 Running Avg Loss: 1.51




Result for DEFAULT_b50ea_00002:
  accuracy: 0.4689
  date: 2021-10-01_12-00-48
  done: false
  experiment_id: 1c5539c7b0e24aef998eaca10497f948
  hostname: ac982acc27c2
  iterations_since_restore: 1
  loss: 1.46265712890625
  node_ip: 172.28.0.2
  pid: 2361
  should_checkpoint: true
  time_since_restore: 132.09271001815796
  time_this_iter_s: 132.09271001815796
  time_total_s: 132.09271001815796
  timestamp: 1633089648
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: b50ea_00002
  
== Status ==
Memory usage on this node: 3.4/12.7 GiB
Using AsyncHyperBand: num_stopped=2
Bracket: Iter 2.000: -1.7122290039062502 | Iter 1.000: -1.46265712890625
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.33 GiB heap, 0.0/3.67 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/DEFAULT_2021-10-01_11-50-00
Number of trials: 3/3 (1 RUNNING, 2 TERMINATED)
+---------------------+------------+-----------------+--------------+------+------+------------+---------+-----



[2m[36m(pid=2361)[0m Epoch[2], Iter[22000] Loss: 1.63 Running Avg Loss: 1.37
[2m[36m(pid=2361)[0m Epoch[2], Iter[24000] Loss: 1.70 Running Avg Loss: 1.43
[2m[36m(pid=2361)[0m Epoch[2], Iter[26000] Loss: 1.49 Running Avg Loss: 1.42
[2m[36m(pid=2361)[0m Epoch[2], Iter[28000] Loss: 1.42 Running Avg Loss: 1.58
[2m[36m(pid=2361)[0m Epoch[2], Iter[30000] Loss: 2.66 Running Avg Loss: 1.47
[2m[36m(pid=2361)[0m Epoch[2], Iter[32000] Loss: 1.87 Running Avg Loss: 1.50
[2m[36m(pid=2361)[0m Epoch[2], Iter[34000] Loss: 1.32 Running Avg Loss: 1.41
[2m[36m(pid=2361)[0m Epoch[2], Iter[36000] Loss: 0.78 Running Avg Loss: 1.49
[2m[36m(pid=2361)[0m Epoch[2], Iter[38000] Loss: 1.86 Running Avg Loss: 1.37
[2m[36m(pid=2361)[0m Epoch[2], Iter[40000] Loss: 3.33 Running Avg Loss: 1.53




Result for DEFAULT_b50ea_00002:
  accuracy: 0.4984
  date: 2021-10-01_12-02-57
  done: false
  experiment_id: 1c5539c7b0e24aef998eaca10497f948
  hostname: ac982acc27c2
  iterations_since_restore: 2
  loss: 1.4066556640625
  node_ip: 172.28.0.2
  pid: 2361
  should_checkpoint: true
  time_since_restore: 260.71564173698425
  time_this_iter_s: 128.6229317188263
  time_total_s: 260.71564173698425
  timestamp: 1633089777
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: b50ea_00002
  
== Status ==
Memory usage on this node: 3.4/12.7 GiB
Using AsyncHyperBand: num_stopped=2
Bracket: Iter 2.000: -1.4066556640625 | Iter 1.000: -1.46265712890625
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.33 GiB heap, 0.0/3.67 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/DEFAULT_2021-10-01_11-50-00
Number of trials: 3/3 (1 RUNNING, 2 TERMINATED)
+---------------------+------------+-----------------+--------------+------+------+------------+---------+----------



[2m[36m(pid=2361)[0m Epoch[3], Iter[42000] Loss: 2.52 Running Avg Loss: 1.34
[2m[36m(pid=2361)[0m Epoch[3], Iter[44000] Loss: 1.35 Running Avg Loss: 1.32
[2m[36m(pid=2361)[0m Epoch[3], Iter[46000] Loss: 1.74 Running Avg Loss: 1.36
[2m[36m(pid=2361)[0m Epoch[3], Iter[48000] Loss: 1.13 Running Avg Loss: 1.45
[2m[36m(pid=2361)[0m Epoch[3], Iter[50000] Loss: 1.30 Running Avg Loss: 1.40
[2m[36m(pid=2361)[0m Epoch[3], Iter[52000] Loss: 1.87 Running Avg Loss: 1.24
[2m[36m(pid=2361)[0m Epoch[3], Iter[54000] Loss: 0.37 Running Avg Loss: 1.19
[2m[36m(pid=2361)[0m Epoch[3], Iter[56000] Loss: 0.92 Running Avg Loss: 1.37
[2m[36m(pid=2361)[0m Epoch[3], Iter[58000] Loss: 0.80 Running Avg Loss: 1.36
[2m[36m(pid=2361)[0m Epoch[3], Iter[60000] Loss: 1.18 Running Avg Loss: 1.38


2021-10-01 12:05:05,896	INFO tune.py:561 -- Total run time: 905.52 seconds (905.37 seconds for the tuning loop).


Result for DEFAULT_b50ea_00002:
  accuracy: 0.5291
  date: 2021-10-01_12-05-05
  done: true
  experiment_id: 1c5539c7b0e24aef998eaca10497f948
  hostname: ac982acc27c2
  iterations_since_restore: 3
  loss: 1.33420810546875
  node_ip: 172.28.0.2
  pid: 2361
  should_checkpoint: true
  time_since_restore: 389.19638776779175
  time_this_iter_s: 128.4807460308075
  time_total_s: 389.19638776779175
  timestamp: 1633089905
  timesteps_since_restore: 0
  training_iteration: 3
  trial_id: b50ea_00002
  
== Status ==
Memory usage on this node: 3.4/12.7 GiB
Using AsyncHyperBand: num_stopped=3
Bracket: Iter 2.000: -1.4066556640625 | Iter 1.000: -1.46265712890625
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.33 GiB heap, 0.0/3.67 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/DEFAULT_2021-10-01_11-50-00
Number of trials: 3/3 (1 RUNNING, 2 TERMINATED)
+---------------------+------------+-----------------+--------------+------+------+------------+---------+----------

2021-10-01 12:05:07,794 ignite.distributed.auto.auto_dataloader INFO: Use data loader kwargs for dataset 'Dataset CIFAR10': 
	{'batch_size': 4, 'shuffle': False, 'num_workers': 2, 'pin_memory': True}
  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


{'Accuracy': 0.5379}
