# Install packages and import

In [None]:
# Colab setup: installs the dependencies this notebook needs (Ray with the
# `debug` and `tune` extras, and lightning-bolts). Skip this cell when the
# environment already provides them.
# NOTE(review): the reason pyarrow is uninstalled first is not recorded here —
# presumably a version conflict with Ray; confirm before removing that line.

print("Setting up colab environment")
!pip uninstall -y -q pyarrow
!pip install -q ray[debug] lightning-bolts
!pip install -U -q ray[tune]
# Optional TPU support (disabled):
# !pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.8-cp37-cp37m-linux_x86_64.whl

# Optional (disabled): force the Colab runtime to restart so that the freshly
# installed packages are picked up.
# # A hack to force the runtime to restart, needed to include the above dependencies.
# print("Done installing! Restarting via forced crash (this is not an issue).")
# import os
# os._exit(0)

Setting up colab environment
[K     |████████████████████████████████| 51.6MB 59kB/s 
[K     |████████████████████████████████| 256kB 56.8MB/s 
[K     |████████████████████████████████| 133kB 57.5MB/s 
[K     |████████████████████████████████| 1.3MB 47.4MB/s 
[K     |████████████████████████████████| 1.0MB 52.3MB/s 
[K     |████████████████████████████████| 81kB 11.4MB/s 
[K     |████████████████████████████████| 71kB 10.5MB/s 
[K     |████████████████████████████████| 10.1MB 40.1MB/s 
[K     |████████████████████████████████| 3.1MB 32.2MB/s 
[K     |████████████████████████████████| 81kB 11.9MB/s 
[K     |████████████████████████████████| 819kB 54.4MB/s 
[K     |████████████████████████████████| 235kB 58.3MB/s 
[K     |████████████████████████████████| 143kB 59.7MB/s 
[K     |████████████████████████████████| 296kB 57.0MB/s 
[K     |████████████████████████████████| 92kB 11.8MB/s 
[K     |████████████████████████████████| 10.6MB 42.1MB/s 
[K     |█████████████████████

In [None]:
# On Google Colab, select TensorFlow 2.x through the Colab-only line magic.
# Any exception raised by the magic is swallowed, so the cell is best-effort.
# NOTE(review): nothing in the visible part of this notebook imports
# TensorFlow directly — this cell may be a leftover; confirm before removing.

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass

In [None]:
# __import_lightning_begin__
import math
import gdown, tarfile           #
from zipfile import ZipFile
import shutil
import numpy as np              #
from matplotlib import pyplot as plt
from itertools import cycle
import torch
from torch import nn
import pytorch_lightning as pl
from filelock import FileLock
from torch.utils.data import DataLoader, random_split
from torch.nn import functional as F
import torchvision.datasets as datasets
from torchvision.models import resnet18
from pl_bolts.models.self_supervised.resnets import BasicBlock                  # problem with resnet18
from pl_bolts.models.gans import DCGAN
from pl_bolts.models.gans.dcgan.components import DCGANDiscriminator, DCGANGenerator
import torchmetrics as tm
from torchvision import transforms
import os
from os.path import basename
# __import_lightning_end__

# __import_tune_begin__
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.utilities.cloud_io import load as pl_load
from ray import tune
from ray.tune import CLIReporter, JupyterNotebookReporter
from ray.tune.schedulers import ASHAScheduler, PopulationBasedTraining
from ray.tune.integration.pytorch_lightning import TuneReportCallback, \
    TuneReportCheckpointCallback
# __import_tune_end__



# Download and extract data

In [None]:
# Mount Google Drive at /content/drive (Colab only). Other cells in this
# notebook read the normalisation statistics from, and write logs to, paths
# under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Google Drive file IDs, keyed by a letter 'a'..'l'; no ID is recorded for 'd' and 'k'.
# NOTE(review): presumably one archive per lens model — confirm the meaning of the letters.
# 'a': 1Cjcw2EWorhdhJSGoWOdxsEUDxvl943dt, 'b': 15yXXC4h5VsytP3Ak1jfUSjQhdgP2s23K, 'c': 1vuQ-pLzoKT4Hd_V7949r9eND9E2fB_u_,
# 'd': , 'e': 1wFuasvb7PthxXtMUlsD13uzYHWlWt06H, 'f': 17l6H61tLAu26zGuei38r_T5ssjbYUeaJ,
# 'g': 1SxQVosWeEjY3Pyn8LRXA11rLnZ9HK_7B, 'h': 1Atau0RH4oyLAiYReW-G9a8l9pUNltglF, 'i': 15lEgsR1p00KSHieaT9a1nkbJ86pDxwgp,
# 'j': 1m0EQUbqZZeyl76XsQIKWU5Qd7jGmmWhB, 'k': , 'l': 1meTDi4aeWfdChOiXeLtUOGhjVDVu000e

# Download the archive for ID 'f' and unpack it into the current directory.
# Uncomment the `rm` line to discard a previously extracted `images` folder first.
# !rm -rf images
!gdown --id 17l6H61tLAu26zGuei38r_T5ssjbYUeaJ
!tar zxf ./model_f.tgz

# Pure-Python alternative to the two shell commands above (disabled):
# def prepare_data(data_dir: str = '/content'):
#     gdown.download('https://drive.google.com/uc?id=17l6H61tLAu26zGuei38r_T5ssjbYUeaJ', data_dir+'/model_f.tgz', quiet=True)
    
#     temp = tarfile.open(data_dir+'/model_f.tgz', 'r|gz')
#     temp.extractall()
#     temp.close()

Downloading...
From: https://drive.google.com/uc?id=17l6H61tLAu26zGuei38r_T5ssjbYUeaJ
To: /content/model_f.tgz
2.34GB [00:27, 85.1MB/s]


# DataModule
The data module below builds the dataloaders that are passed to the trainer to train, validate, and test the model.

In [None]:
class NpyDataModule(pl.LightningDataModule):
    """Data module serving single-channel ``.npy`` images stored in class folders.

    The training data is read from ``<data_dir>/train`` and the test data from
    ``<data_dir>/val``, both through ``torchvision.datasets.DatasetFolder``.
    The training folder is split randomly into a training and a validation
    subset.

    Args:
        config: Mapping that must contain ``'batch_size'``.
        data_dir: Root folder holding the ``train`` and ``val`` sub-folders.
        img_width: Target size passed to ``transforms.Resize``.
        stats_path: ``.npz`` file whose ``'VALS'`` entry holds the global mean
            (index 0) and standard deviation (index 1) used for normalisation.
        val_fraction: Share of the training folder held out for validation.
            The default of 0.2 reproduces the previous fixed 60000/15000 split
            of a 75000-sample training folder.
    """

    def __init__(self, config, data_dir: str = '/content/images/', img_width: int = 150,
                 stats_path: str = '/content/drive/MyDrive/git_repos/forging_new_worlds/GLOBAL_VALS_F.npz',
                 val_fraction: float = 0.2):
        super().__init__()
        # This method is not implemented
        # self.save_hyperparameters()
        self.batch_size = config['batch_size']
        self.data_dir = os.path.expanduser(data_dir)
        self.val_fraction = val_fraction

        # Read the statistics inside a context manager so the .npz file handle
        # is released immediately; the extracted array stays valid afterwards.
        with np.load(stats_path) as stats:
            global_vals = stats['VALS']

        # NOTE(review): the same transform (including the random flips) is
        # applied to the validation and test sets as well — confirm that
        # evaluation-time augmentation is intended.
        self.transform = transforms.Compose([
            # transforms.ConvertImageDtype(torch.float32),
            # Can't use this, divides values by dtype.max, use float() in npyloader instead
            transforms.RandomHorizontalFlip(),
            transforms.RandomVerticalFlip(),
            # Shift-scale the pixel values, N(mu, sigma) -> N(0, 1)
            transforms.Normalize(mean=(global_vals[0],), std=(global_vals[1],)),
            transforms.Resize(img_width, transforms.InterpolationMode.NEAREST),
        ])

    @staticmethod
    def npy_loader(path):
        """Load one ``.npy`` file as a float32 tensor with a leading channel axis."""
        # Convert to a tensor first and to float afterwards; otherwise the final
        # dtype would be float64, which raises errors in the conv layers.
        return torch.from_numpy(np.load(path)).unsqueeze(0).float()

    def _make_folder_dataset(self, subdir: str):
        """Build a DatasetFolder over ``<data_dir>/<subdir>`` for ``.npy`` files."""
        return datasets.DatasetFolder(
            os.path.join(self.data_dir, subdir),
            loader=self.npy_loader,
            # A real one-element tuple; ('.npy') is just a parenthesised string.
            extensions=('.npy',),
            transform=self.transform,
        )

    def setup(self, stage: str = None):
        """Create the datasets needed for ``stage`` ('fit', 'test', or None for both)."""
        if stage in ('fit', None):
            self.full_set = self._make_folder_dataset('train')
            # Derive the split sizes from the dataset instead of hard-coding them,
            # so a training folder of any size can be used.
            n_total = len(self.full_set)
            n_val = int(round(n_total * self.val_fraction))
            self.train_set, self.val_set = random_split(self.full_set, [n_total - n_val, n_val])
            self.dims = tuple(self.train_set[0][0].shape)

        if stage in ('test', None):
            self.test_set = self._make_folder_dataset('val')
            # Only fall back to the test sample's shape when no shape was recorded
            # during 'fit'. A plain getattr(..., default) would both load a sample
            # unconditionally and keep an empty `dims` that the Lightning base
            # class may already define.
            if not getattr(self, 'dims', None):
                self.dims = tuple(self.test_set[0][0].shape)

    def train_dataloader(self):
        """Shuffled loader over the training subset."""
        return DataLoader(self.train_set, self.batch_size, shuffle=True)

    def val_dataloader(self):
        """Loader over the validation subset (not shuffled, as Lightning recommends)."""
        return DataLoader(self.val_set, self.batch_size, shuffle=False)

    def test_dataloader(self):
        """Loader over the test set (not shuffled, as Lightning recommends)."""
        return DataLoader(self.test_set, self.batch_size, shuffle=False)


# ResNet:
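We adapt a torchvision ResNet-18 for our purpose: the first convolution is replaced so that it accepts single-channel images, and a dropout layer is placed in front of the final fully connected layer.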

In [None]:
class LensResnet(pl.LightningModule):
    """ResNet-18 classifier with a configurable number of input channels.

    Logs cross-entropy loss and AUROC for training and validation, and at the
    end of every validation epoch draws a per-class ROC figure that is written
    both to the logger and to a PDF file.

    Args:
        config: Mapping that must contain ``'learning_rate'``.
        data_dir: Accepted for interface compatibility; not used by this class.
        image_channels: Number of channels of the input images.
        num_classes: Number of output classes.
        **kwargs: Extra keyword arguments; not used directly here.
    """

    def __init__(self, config, data_dir: str = '/content/images/', image_channels: int = 1, 
                 num_classes: int = 3, **kwargs):
        super().__init__()
        # `ignore` expects the *name* of the argument to skip. Passing the dict
        # itself made Lightning compare against the dict's keys, so `config`
        # was never actually excluded from the saved hyperparameters.
        self.save_hyperparameters(ignore='config')
        self.learning_rate = config['learning_rate']

        # Randomly initialised ResNet-18 (no pretrained weights are requested).
        self.backbone = resnet18(num_classes=self.hparams.num_classes)
        # Replace the whole first convolution: merely changing in_channels is
        # not enough because the weights have to change shape as well.
        self.backbone.conv1 = nn.Conv2d(self.hparams.image_channels, 64, kernel_size=(7, 7),
                                        stride=(2, 2), padding=(3, 3), bias=False)
        # Put dropout in front of the original classification head.
        self.backbone.fc = nn.Sequential(
            nn.Dropout(0.5),
            self.backbone.fc
        )

    def configure_optimizers(self):
        """Adam over the backbone parameters with the configured learning rate."""
        return torch.optim.Adam(self.backbone.parameters(), self.learning_rate)

    def forward(self, x):
        """Return class probabilities (softmax over the backbone logits)."""
        return F.softmax(self.backbone(x), 1)

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss and weighted AUROC; return the loss."""
        imgs, labels = batch
        # One forward pass serves both the metric and the loss. Running the
        # backbone twice doubled the compute, updated the BatchNorm statistics
        # twice per step, and scored the metric on a different dropout mask.
        logits = self.backbone(imgs)
        probs = F.softmax(logits.detach(), 1)
        self.log('ResNet/train/auroc',
                 tm.functional.auroc(probs, labels, average='weighted',
                                     num_classes=self.hparams.num_classes))
        loss = F.cross_entropy(logits, labels)
        self.log('ResNet/train/loss', loss)
        # keep only scalars here, for no errors
        return loss

    def validation_step(self, batch, batch_idx):
        """Log the validation loss and return predictions/targets for the epoch end."""
        imgs, labels = batch
        # Single forward pass shared by the loss and the returned probabilities.
        logits = self.backbone(imgs)
        self.log('ResNet/val/loss', F.cross_entropy(logits, labels))
        # keep only scalars here, for no errors
        return {'pred': F.softmax(logits, 1), 'target': labels}

    def validation_epoch_end(self, Listofdicts):
        """Log the minimum per-class AUROC and save a multi-class ROC figure.

        Args:
            Listofdicts: The outputs of ``validation_step`` for the whole epoch.
        """
        prediction = torch.cat([x["pred"] for x in Listofdicts])
        target = torch.cat([x["target"] for x in Listofdicts])
        aurocTensor = tm.functional.auroc(prediction, target,
                                          num_classes=self.hparams.num_classes, average=None)
        # The reported score is the AUROC of the worst class.
        self.log('ResNet/val/auroc', aurocTensor.min())
        fprList, tprList, _ = tm.functional.roc(prediction, target,
                                                num_classes=self.hparams.num_classes)

        fig, ax = plt.subplots()
        colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
        for i, color in zip(range(self.hparams.num_classes), colors):
            ax.plot(fprList[i].cpu(), tprList[i].cpu(), color=color,
                    label='ROC curve of class {0} (area = {1:0.2f})'
                    ''.format(i, aurocTensor[i].item()))
        ax.plot([0, 1], [0, 1], 'k--')
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title('Multi-class ROC')
        ax.legend(loc="lower right")

        # Save before handing the figure to the logger. Build the path with
        # os.path.join and fall back to the working directory when no Tune
        # trial is active, instead of concatenating onto the string 'None'.
        out_dir = tune.get_trial_dir() or os.getcwd()
        fig.savefig(os.path.join(out_dir, 'ROC_epoch_' + str(self.current_epoch) + '.pdf'))
        if self.logger is not None:
            # Tag the figure with the epoch so earlier ROC curves stay visible.
            self.logger.experiment.add_figure('ResNet/val/ROC', fig,
                                              global_step=self.current_epoch)
        # Release the figure so repeated validation epochs do not accumulate them.
        plt.close(fig)

# Trying out Auto Tuning of learning rate

In [None]:
# Learning-rate finder experiment.
# Author's note: the finder can't work with multiple optimizers.
# NOTE(review): `StackGAN` is not defined in the visible part of this notebook
# and the recorded run of this cell ended in an AttributeError — confirm where
# StackGAN comes from, or whether this cell is a leftover experiment.
config = {
    'learning_rate': 1e-4, 'batch_size': 128, 'feature_maps': 64,
}
dm = NpyDataModule(config)
generator = StackGAN(config)

# Trainer with the LR finder enabled; the commented-out keyword arguments are
# options that were left unset.
trainer = pl.Trainer(
    # logger=,
    # checkpoint_callback=,
    default_root_dir='./drive/MyDrive/Logs/', 
    gpus=1,
    auto_select_gpus=True, 
    # tpu_cores=
    progress_bar_refresh_rate=1,
    # fast_dev_run=,
    max_epochs=5,
    # max_time=,
    # limit_train_batches=,
    # flush_logs_every_n_steps=,
    # log_every_n_steps=,
    # resume_from_checkpoint='./drive/MyDrive/Logs/lr_find_temp_model.ckpt',
    auto_lr_find = True,
    # auto_scale_batch_size=True,
    # prepare_data_per_node=,
    )

# Run learning rate finder
lr_finder = trainer.tuner.lr_find(generator, dm)

# # Results can be found in
# # lr_finder.results

# Plot the finder's results, with the suggested learning rate marked
fig = lr_finder.plot(suggest=True)
fig.show()

# Pick point based on plot, or get suggestion
new_lr = lr_finder.suggestion()

# Follow-up steps (disabled): apply the suggestion and train.
# # update hparams of the model
# model.hparams.lr = new_lr

# # Fit model
# trainer.fit(model)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


AttributeError: ignored

# Tune ResNet hyperparameters:
Here we tune hyperparameters as we train our modified ResNet.

In [None]:
# __tune_train_checkpoint_begin
def train_LensResnet_tune_checkpoint(config,
                                    checkpoint_dir=None,
                                    num_epochs=10,
                                    num_gpus=1):
    """Ray Tune trainable: fit a LensResnet and report/checkpoint every validation.

    Args:
        config: Hyperparameters of the trial; forwarded to the data module and
            the model.
        checkpoint_dir: Directory holding a file named "checkpoint" to resume
            from, or a falsy value to start from scratch.
        num_epochs: Value passed to the trainer as ``max_epochs``.
        num_gpus: GPUs for the trial; may be fractional.
    """
    image_root = os.path.expanduser("/content/images/")

    # Write TensorBoard logs straight into the trial directory.
    tb_logger = TensorBoardLogger(
        save_dir=tune.get_trial_dir(), name="", version=".")

    # Map the metrics logged by the model onto the names reported to Tune, and
    # store a checkpoint named "checkpoint" alongside each report.
    reported_metrics = {
        "loss": "ResNet/val/loss",
        "auroc": "ResNet/val/auroc",
    }
    report_and_checkpoint = TuneReportCheckpointCallback(
        metrics=reported_metrics,
        filename="checkpoint",
    )

    trainer = pl.Trainer(
        max_epochs=num_epochs,
        prepare_data_per_node=False,
        num_sanity_val_steps=0,
        # A fractional GPU share is rounded up to a whole device count.
        gpus=math.ceil(num_gpus),
        logger=tb_logger,
        callbacks=[report_and_checkpoint],
        stochastic_weight_avg=True,
    )

    dm = NpyDataModule(config, image_root)

    if not checkpoint_dir:
        model = LensResnet(config)
    else:
        # LensResnet.load_from_checkpoint currently leads to errors here, so the
        # checkpoint is loaded manually and the model state restored from it.
        checkpoint_file = os.path.join(checkpoint_dir, "checkpoint")
        ckpt = pl_load(
            checkpoint_file,
            map_location=lambda storage, loc: storage)
        model = LensResnet._load_model_state(ckpt, config=config)
        trainer.current_epoch = ckpt["epoch"]

    trainer.fit(model, dm)

# __tune_train_checkpoint_end__


# __tune_asha_begin__
def tune_LensResnet_asha(num_samples=10, num_epochs=10, gpus_per_trial=1):
    """Search learning rate and batch size for LensResnet with the ASHA scheduler.

    Runs ``train_LensResnet_tune_checkpoint`` under Ray Tune, maximising the
    reported "auroc", and prints the best configuration found.

    Args:
        num_samples: Number of configurations sampled from the search space.
        num_epochs: Epochs per trial; also the scheduler's ``max_t``.
        gpus_per_trial: GPUs reserved for each trial.
    """
    config = {
        "learning_rate": tune.choice([1e-5, 1e-4, 1e-3, 1e-2]),
        "batch_size": tune.choice([128, 64, 32]),
    }

    scheduler = ASHAScheduler(
        max_t=num_epochs,
        grace_period=1,
        reduction_factor=2)

    reporter = CLIReporter(
        # overwrite=True,
        parameter_columns=["learning_rate", "batch_size"],
        metric_columns=["loss", "auroc", "training_iteration"])

    analysis = tune.run(
        tune.with_parameters(
            train_LensResnet_tune_checkpoint,
            num_epochs=num_epochs,
            num_gpus=gpus_per_trial),
        name="LensResNet_F",
        metric="auroc",
        mode="max",
        config=config,
        resources_per_trial={
            "cpu": 2,
            "gpu": gpus_per_trial,
            # "tpu": 8,
        },
        # Honour the caller's sample count; this argument had been commented
        # out, so the `num_samples` parameter was silently ignored.
        num_samples=num_samples,
        local_dir='./drive/MyDrive/Logs',
        scheduler=scheduler,
        progress_reporter=reporter,
        fail_fast = True,
        # To continue from one specific checkpoint instead, pass
        # `restore=<path to the checkpoint>`.
        resume='PROMPT',
        )

    print("Best hyperparameters found were: ", analysis.best_config)
# __tune_asha_end__


# __tune_pbt_begin__
def tune_LensResnet_pbt(num_samples=10, num_epochs=10, gpus_per_trial=1):
    """Tune LensResnet with Population Based Training.

    Every trial starts from the same learning rate and batch size; the
    scheduler then mutates both among the listed candidate values. The best
    configuration by "auroc" is printed at the end.

    Args:
        num_samples: Number of trials in the population.
        num_epochs: Epochs per trial.
        gpus_per_trial: GPUs reserved for each trial.
    """
    # Starting point shared by the whole population.
    initial_config = {"learning_rate": 1e-3, "batch_size": 64}

    # Candidate values the scheduler may switch to when perturbing a trial.
    mutations = {
        "learning_rate": [1e-5, 1e-4, 1e-3, 1e-2],
        "batch_size": [32, 64, 128],
    }
    pbt_scheduler = PopulationBasedTraining(
        perturbation_interval=4,
        hyperparam_mutations=mutations)

    progress = CLIReporter(
        parameter_columns=["learning_rate", "batch_size"],
        metric_columns=["loss", "auroc", "training_iteration"])

    trainable = tune.with_parameters(
        train_LensResnet_tune_checkpoint,
        num_epochs=num_epochs,
        num_gpus=gpus_per_trial)
    trial_resources = {"cpu": 2, "gpu": gpus_per_trial}

    analysis = tune.run(
        trainable,
        metric="auroc",
        mode="max",
        resources_per_trial=trial_resources,
        fail_fast=True,
        config=initial_config,
        num_samples=num_samples,
        scheduler=pbt_scheduler,
        progress_reporter=progress,
        local_dir='./drive/MyDrive/Logs',
        name="tune_LensResnet_pbt")

    print("Best hyperparameters found were: ", analysis.best_config)

# __tune_pbt_end__


if __name__ == "__main__":
    import argparse

    # Unknown arguments (such as those injected by the notebook kernel) are
    # tolerated by using parse_known_args.
    parser = argparse.ArgumentParser()
    parser.add_argument("--smoke-test",
                        action="store_true",
                        help="Finish quickly for testing")
    args, _ = parser.parse_known_args()

    if not args.smoke_test:
        # Full run: ASHA scheduler only.
        tune_LensResnet_asha(num_samples=12, num_epochs=3, gpus_per_trial=1)
        # Population based training (disabled):
        # tune_LensResnet_pbt(num_samples=10, num_epochs=10, gpus_per_trial=1)
    else:
        # Smoke test: a single sample through each scheduler.
        tune_LensResnet_asha(num_samples=1, num_epochs=6, gpus_per_trial=1)
        tune_LensResnet_pbt(num_samples=1, num_epochs=6, gpus_per_trial=1)

== Status ==
Memory usage on this node: 1.5/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 2.000: None | Iter 1.000: None
Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/7.48 GiB heap, 0.0/3.74 GiB objects (0.0/1.0 accelerator_type:T4)
Result logdir: /content/drive/MyDrive/Logs/LensResNet_F
Number of trials: 12/12 (12 PENDING)
+----------------------------------------------+----------+-------+-----------------+--------------+
| Trial name                                   | status   | loc   |   learning_rate |   batch_size |
|----------------------------------------------+----------+-------+-----------------+--------------|
| train_LensResnet_tune_checkpoint_83150_00000 | PENDING  |       |          0.01   |           64 |
| train_LensResnet_tune_checkpoint_83150_00001 | PENDING  |       |          0.001  |           64 |
| train_LensResnet_tune_checkpoint_83150_00002 | PENDING  |       |          0.001  |          128 |
| train_LensResnet_tune_checkpoint_83150_00003 | PENDING

[2m[36m(pid=623)[0m GPU available: True, used: True
[2m[36m(pid=623)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=623)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


== Status ==
Memory usage on this node: 1.9/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 2.000: None | Iter 1.000: None
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.48 GiB heap, 0.0/3.74 GiB objects (0.0/1.0 accelerator_type:T4)
Result logdir: /content/drive/MyDrive/Logs/LensResNet_F
Number of trials: 12/12 (11 PENDING, 1 RUNNING)
+----------------------------------------------+----------+-------+-----------------+--------------+
| Trial name                                   | status   | loc   |   learning_rate |   batch_size |
|----------------------------------------------+----------+-------+-----------------+--------------|
| train_LensResnet_tune_checkpoint_83150_00000 | RUNNING  |       |          0.01   |           64 |
| train_LensResnet_tune_checkpoint_83150_00001 | PENDING  |       |          0.001  |           64 |
| train_LensResnet_tune_checkpoint_83150_00002 | PENDING  |       |          0.001  |          128 |
| train_LensResnet_tune_checkpoint_83150_

[2m[36m(pid=623)[0m 2021-07-08 04:40:01.526809: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
[2m[36m(pid=623)[0m 
[2m[36m(pid=623)[0m   | Name     | Type   | Params
[2m[36m(pid=623)[0m ------------------------------------
[2m[36m(pid=623)[0m 0 | backbone | ResNet | 11.2 M
[2m[36m(pid=623)[0m ------------------------------------
[2m[36m(pid=623)[0m 11.2 M    Trainable params
[2m[36m(pid=623)[0m 0         Non-trainable params
[2m[36m(pid=623)[0m 11.2 M    Total params
[2m[36m(pid=623)[0m 44.687    Total estimated model params size (MB)
[2m[36m(pid=623)[0m   f'Your {mode}_dataloader has `shuffle=True`, it is best practice to turn'


[2m[36m(pid=623)[0m Training: 0it [00:00, ?it/s]Training:   0%|          | 0/1173 [00:00<?, ?it/s]Epoch 0:   0%|          | 0/1173 [00:00<?, ?it/s] 


[2m[36m(pid=623)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


Epoch 0:   2%|▏         | 20/1173 [00:09<09:23,  2.05it/s, loss=1.4, v_num=.]
Epoch 0:   3%|▎         | 40/1173 [00:18<08:49,  2.14it/s, loss=1.26, v_num=.]
Epoch 0:   5%|▌         | 60/1173 [00:27<08:34,  2.16it/s, loss=1.21, v_num=.]
Epoch 0:   7%|▋         | 80/1173 [00:36<08:21,  2.18it/s, loss=1.2, v_num=.] 
Epoch 0:   9%|▊         | 100/1173 [00:45<08:09,  2.19it/s, loss=1.21, v_num=.]
Epoch 0:  10%|█         | 120/1173 [00:54<07:58,  2.20it/s, loss=1.14, v_num=.]
Epoch 0:  12%|█▏        | 140/1173 [01:03<07:48,  2.21it/s, loss=1.18, v_num=.]
Epoch 0:  14%|█▎        | 160/1173 [01:12<07:38,  2.21it/s, loss=1.14, v_num=.]
Epoch 0:  15%|█▌        | 180/1173 [01:21<07:29,  2.21it/s, loss=1.13, v_num=.]
Epoch 0:  17%|█▋        | 200/1173 [01:30<07:19,  2.21it/s, loss=1.12, v_num=.]
Epoch 0:  19%|█▉        | 220/1173 [01:39<07:10,  2.22it/s, loss=1.12, v_num=.]
Epoch 0:  20%|██        | 240/1173 [01:48<07:01,  2.21it/s, loss=1.12, v_num=.]
Epoch 0:  22%|██▏       | 260/1173 [01:57<06:

[2m[36m(pid=623)[0m   "The signature of `Callback.on_train_epoch_end` has changed in v1.3."


[2m[36m(pid=623)[0m Epoch 0:  80%|████████  | 940/1173 [07:09<01:46,  2.19it/s, loss=1.1, v_num=.]
[2m[36m(pid=623)[0m Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/235 [00:00<?, ?it/s][A
[2m[36m(pid=623)[0m 
Epoch 0:  82%|████████▏ | 960/1173 [07:17<01:37,  2.19it/s, loss=1.1, v_num=.]
[2m[36m(pid=623)[0m 
Epoch 0:  84%|████████▎ | 980/1173 [07:26<01:28,  2.19it/s, loss=1.1, v_num=.]
[2m[36m(pid=623)[0m 
Epoch 0:  85%|████████▌ | 1000/1173 [07:35<01:18,  2.19it/s, loss=1.1, v_num=.]
[2m[36m(pid=623)[0m 
Epoch 0:  87%|████████▋ | 1020/1173 [07:45<01:09,  2.19it/s, loss=1.1, v_num=.]
[2m[36m(pid=623)[0m 
Epoch 0:  89%|████████▊ | 1040/1173 [07:54<01:00,  2.19it/s, loss=1.1, v_num=.]
[2m[36m(pid=623)[0m 
Epoch 0:  90%|█████████ | 1060/1173 [08:03<00:51,  2.19it/s, loss=1.1, v_num=.]
[2m[36m(pid=623)[0m 
Epoch 0:  92%|█████████▏| 1080/1173 [08:12<00:42,  2.19it/s, loss=1.1, v_num=.]
[2m[36m(pid=623)[0m 
Epoch 0:  94%|█████████▍| 1100/117

[2m[36m(pid=623)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_LensResnet_tune_checkpoint_83150_00000:
  auroc: 0.5039865970611572
  date: 2021-07-08_04-49-03
  done: false
  experiment_id: a2aedbaa924043a5b44630f25b707c69
  hostname: 0b93bd12a956
  iterations_since_restore: 1
  loss: 1.11496901512146
  node_ip: 172.28.0.2
  pid: 623
  should_checkpoint: true
  time_since_restore: 557.7166030406952
  time_this_iter_s: 557.7166030406952
  time_total_s: 557.7166030406952
  timestamp: 1625719743
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: '83150_00000'
  
== Status ==
Memory usage on this node: 4.2/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 2.000: None | Iter 1.000: 0.5039865970611572
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.48 GiB heap, 0.0/3.74 GiB objects (0.0/1.0 accelerator_type:T4, 0.0/1.0 GPU_group_7647334c6aa32c88cd055440d9a148be, 0.0/1.0 GPU_group_0_7647334c6aa32c88cd055440d9a148be, 0.0/2.0 CPU_group_0_7647334c6aa32c88cd055440d9a148be, 0.0/2.0 CPU_group_7647334c6aa32c88cd055440d

[2m[36m(pid=623)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_LensResnet_tune_checkpoint_83150_00000:
  auroc: 0.547535240650177
  date: 2021-07-08_04-58-02
  done: false
  experiment_id: a2aedbaa924043a5b44630f25b707c69
  hostname: 0b93bd12a956
  iterations_since_restore: 2
  loss: 1.2411631345748901
  node_ip: 172.28.0.2
  pid: 623
  should_checkpoint: true
  time_since_restore: 1096.8282725811005
  time_this_iter_s: 539.1116695404053
  time_total_s: 1096.8282725811005
  timestamp: 1625720282
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: '83150_00000'
  
== Status ==
Memory usage on this node: 4.3/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 2.000: 0.547535240650177 | Iter 1.000: 0.5039865970611572
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.48 GiB heap, 0.0/3.74 GiB objects (0.0/2.0 CPU_group_0_7647334c6aa32c88cd055440d9a148be, 0.0/1.0 GPU_group_0_7647334c6aa32c88cd055440d9a148be, 0.0/1.0 GPU_group_7647334c6aa32c88cd055440d9a148be, 0.0/1.0 accelerator_type:T4, 0.0/2.0 CPU_group_7647334c6

[2m[36m(pid=623)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_LensResnet_tune_checkpoint_83150_00000:
  auroc: 0.6731033325195312
  date: 2021-07-08_05-06-57
  done: true
  experiment_id: a2aedbaa924043a5b44630f25b707c69
  hostname: 0b93bd12a956
  iterations_since_restore: 3
  loss: 9.628469467163086
  node_ip: 172.28.0.2
  pid: 623
  should_checkpoint: true
  time_since_restore: 1632.327488899231
  time_this_iter_s: 535.4992163181305
  time_total_s: 1632.327488899231
  timestamp: 1625720817
  timesteps_since_restore: 0
  training_iteration: 3
  trial_id: '83150_00000'
  
== Status ==
Memory usage on this node: 4.3/12.7 GiB
Using AsyncHyperBand: num_stopped=1
Bracket: Iter 2.000: 0.547535240650177 | Iter 1.000: 0.5039865970611572
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.48 GiB heap, 0.0/3.74 GiB objects (0.0/2.0 CPU_group_7647334c6aa32c88cd055440d9a148be, 0.0/1.0 accelerator_type:T4, 0.0/2.0 CPU_group_0_7647334c6aa32c88cd055440d9a148be, 0.0/1.0 GPU_group_0_7647334c6aa32c88cd055440d9a148be, 0.0/1.0 GPU_group_7647334c6aa3

[2m[36m(pid=624)[0m GPU available: True, used: True
[2m[36m(pid=624)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=624)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2m[36m(pid=624)[0m 2021-07-08 05:07:11.346080: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
[2m[36m(pid=624)[0m 
[2m[36m(pid=624)[0m   | Name     | Type   | Params
[2m[36m(pid=624)[0m ------------------------------------
[2m[36m(pid=624)[0m 0 | backbone | ResNet | 11.2 M
[2m[36m(pid=624)[0m ------------------------------------
[2m[36m(pid=624)[0m 11.2 M    Trainable params
[2m[36m(pid=624)[0m 0         Non-trainable params
[2m[36m(pid=624)[0m 11.2 M    Total params
[2m[36m(pid=624)[0m 44.687    Total estimated model params size (MB)
[2m[36m(pid=624)[0m   f'Your {mode}_dataloader has `shuffle=True`, it is best practice to turn'


[2m[36m(pid=624)[0m Training: 0it [00:00, ?it/s]Training:   0%|          | 0/1173 [00:00<?, ?it/s]Epoch 0:   0%|          | 0/1173 [00:00<?, ?it/s] 


[2m[36m(pid=624)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


Epoch 0:   2%|▏         | 20/1173 [00:08<07:56,  2.42it/s, loss=1.25, v_num=.]
Epoch 0:   3%|▎         | 40/1173 [00:16<07:41,  2.46it/s, loss=1.17, v_num=.]
Epoch 0:   5%|▌         | 60/1173 [00:24<07:30,  2.47it/s, loss=1.16, v_num=.]
Epoch 0:   7%|▋         | 80/1173 [00:32<07:21,  2.48it/s, loss=1.17, v_num=.]
Epoch 0:   9%|▊         | 100/1173 [00:40<07:12,  2.48it/s, loss=1.15, v_num=.]
Epoch 0:  10%|█         | 120/1173 [00:48<07:04,  2.48it/s, loss=1.15, v_num=.]
Epoch 0:  12%|█▏        | 140/1173 [00:56<06:58,  2.47it/s, loss=1.15, v_num=.]
Epoch 0:  14%|█▎        | 160/1173 [01:05<06:52,  2.45it/s, loss=1.13, v_num=.]
Epoch 0:  15%|█▌        | 180/1173 [01:14<06:48,  2.43it/s, loss=1.12, v_num=.]
Epoch 0:  17%|█▋        | 200/1173 [01:23<06:43,  2.41it/s, loss=1.13, v_num=.]
Epoch 0:  19%|█▉        | 220/1173 [01:32<06:38,  2.39it/s, loss=1.11, v_num=.]
Epoch 0:  20%|██        | 240/1173 [01:41<06:32,  2.38it/s, loss=1.11, v_num=.]
Epoch 0:  22%|██▏       | 260/1173 [01:50<06

[2m[36m(pid=624)[0m   "The signature of `Callback.on_train_epoch_end` has changed in v1.3."


[2m[36m(pid=624)[0m Epoch 0:  80%|████████  | 940/1173 [06:57<01:43,  2.25it/s, loss=1.09, v_num=.]
[2m[36m(pid=624)[0m Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/235 [00:00<?, ?it/s][A
[2m[36m(pid=624)[0m 
Epoch 0:  82%|████████▏ | 960/1173 [07:05<01:34,  2.26it/s, loss=1.09, v_num=.]
[2m[36m(pid=624)[0m 
Epoch 0:  84%|████████▎ | 980/1173 [07:14<01:25,  2.26it/s, loss=1.09, v_num=.]
[2m[36m(pid=624)[0m 
Epoch 0:  85%|████████▌ | 1000/1173 [07:22<01:16,  2.26it/s, loss=1.09, v_num=.]
[2m[36m(pid=624)[0m 
Epoch 0:  87%|████████▋ | 1020/1173 [07:31<01:07,  2.26it/s, loss=1.09, v_num=.]
[2m[36m(pid=624)[0m 
Epoch 0:  89%|████████▊ | 1040/1173 [07:40<00:58,  2.26it/s, loss=1.09, v_num=.]
[2m[36m(pid=624)[0m 
Epoch 0:  90%|█████████ | 1060/1173 [07:49<00:50,  2.26it/s, loss=1.09, v_num=.]
[2m[36m(pid=624)[0m 
Epoch 0:  92%|█████████▏| 1080/1173 [07:58<00:41,  2.26it/s, loss=1.09, v_num=.]
[2m[36m(pid=624)[0m 
Epoch 0:  94%|█████████▍| 

[2m[36m(pid=624)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_LensResnet_tune_checkpoint_83150_00001:
  auroc: 0.5439504981040955
  date: 2021-07-08_05-15-56
  done: false
  experiment_id: df7e1d89950f49d7aa8323e28e9e0044
  hostname: 0b93bd12a956
  iterations_since_restore: 1
  loss: 1.1251014471054077
  node_ip: 172.28.0.2
  pid: 624
  should_checkpoint: true
  time_since_restore: 533.1734511852264
  time_this_iter_s: 533.1734511852264
  time_total_s: 533.1734511852264
  timestamp: 1625721356
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: '83150_00001'
  
== Status ==
Memory usage on this node: 4.2/12.7 GiB
Using AsyncHyperBand: num_stopped=1
Bracket: Iter 2.000: 0.547535240650177 | Iter 1.000: 0.5239685475826263
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.48 GiB heap, 0.0/3.74 GiB objects (0.0/2.0 CPU_group_7647334c6aa32c88cd055440d9a148be, 0.0/1.0 accelerator_type:T4, 0.0/1.0 GPU_group_0_7647334c6aa32c88cd055440d9a148be, 0.0/1.0 GPU_group_7647334c6aa32c88cd055440d9a148be, 0.0/2.0 CPU_group_0_7647334c6a

[2m[36m(pid=624)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_LensResnet_tune_checkpoint_83150_00001:
  auroc: 0.7151361107826233
  date: 2021-07-08_05-24-45
  done: false
  experiment_id: df7e1d89950f49d7aa8323e28e9e0044
  hostname: 0b93bd12a956
  iterations_since_restore: 2
  loss: 1.335142970085144
  node_ip: 172.28.0.2
  pid: 624
  should_checkpoint: true
  time_since_restore: 1062.2367935180664
  time_this_iter_s: 529.06334233284
  time_total_s: 1062.2367935180664
  timestamp: 1625721885
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: '83150_00001'
  
== Status ==
Memory usage on this node: 4.2/12.7 GiB
Using AsyncHyperBand: num_stopped=1
Bracket: Iter 2.000: 0.6313356757164001 | Iter 1.000: 0.5239685475826263
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.48 GiB heap, 0.0/3.74 GiB objects (0.0/1.0 GPU_group_0_7647334c6aa32c88cd055440d9a148be, 0.0/1.0 accelerator_type:T4, 0.0/2.0 CPU_group_7647334c6aa32c88cd055440d9a148be, 0.0/2.0 CPU_group_0_7647334c6aa32c88cd055440d9a148be, 0.0/1.0 GPU_group_7647334c6a

[2m[36m(pid=624)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_LensResnet_tune_checkpoint_83150_00001:
  auroc: 0.8491126894950867
  date: 2021-07-08_05-33-34
  done: true
  experiment_id: df7e1d89950f49d7aa8323e28e9e0044
  hostname: 0b93bd12a956
  iterations_since_restore: 3
  loss: 2.214468240737915
  node_ip: 172.28.0.2
  pid: 624
  should_checkpoint: true
  time_since_restore: 1590.5424454212189
  time_this_iter_s: 528.3056519031525
  time_total_s: 1590.5424454212189
  timestamp: 1625722414
  timesteps_since_restore: 0
  training_iteration: 3
  trial_id: '83150_00001'
  
== Status ==
Memory usage on this node: 4.2/12.7 GiB
Using AsyncHyperBand: num_stopped=2
Bracket: Iter 2.000: 0.6313356757164001 | Iter 1.000: 0.5239685475826263
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.48 GiB heap, 0.0/3.74 GiB objects (0.0/1.0 GPU_group_0_7647334c6aa32c88cd055440d9a148be, 0.0/1.0 GPU_group_7647334c6aa32c88cd055440d9a148be, 0.0/1.0 accelerator_type:T4, 0.0/2.0 CPU_group_7647334c6aa32c88cd055440d9a148be, 0.0/2.0 CPU_group_0_7647334c6

[2m[36m(pid=1088)[0m GPU available: True, used: True
[2m[36m(pid=1088)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=1088)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2m[36m(pid=1088)[0m 2021-07-08 05:33:52.631326: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
[2m[36m(pid=1088)[0m 
[2m[36m(pid=1088)[0m   | Name     | Type   | Params
[2m[36m(pid=1088)[0m ------------------------------------
[2m[36m(pid=1088)[0m 0 | backbone | ResNet | 11.2 M
[2m[36m(pid=1088)[0m ------------------------------------
[2m[36m(pid=1088)[0m 11.2 M    Trainable params
[2m[36m(pid=1088)[0m 0         Non-trainable params
[2m[36m(pid=1088)[0m 11.2 M    Total params
[2m[36m(pid=1088)[0m 44.687    Total estimated model params size (MB)
[2m[36m(pid=1088)[0m   f'Your {mode}_dataloader has `shuffle=True`, it is best practice to turn'


[2m[36m(pid=1088)[0m Training: 0it [00:00, ?it/s]Training:   0%|          | 0/587 [00:00<?, ?it/s]Epoch 0:   0%|          | 0/587 [00:00<?, ?it/s] 


[2m[36m(pid=1088)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


Epoch 0:   3%|▎         | 20/587 [00:16<07:56,  1.19it/s, loss=1.23, v_num=.]
Epoch 0:   7%|▋         | 40/587 [00:33<07:38,  1.19it/s, loss=1.18, v_num=.]
Epoch 0:  10%|█         | 60/587 [00:50<07:25,  1.18it/s, loss=1.16, v_num=.]
Epoch 0:  14%|█▎        | 80/587 [01:08<07:13,  1.17it/s, loss=1.14, v_num=.]
Epoch 0:  17%|█▋        | 100/587 [01:26<07:01,  1.16it/s, loss=1.13, v_num=.]
Epoch 0:  20%|██        | 120/587 [01:44<06:48,  1.14it/s, loss=1.12, v_num=.]
Epoch 0:  24%|██▍       | 140/587 [02:03<06:33,  1.14it/s, loss=1.12, v_num=.]
Epoch 0:  27%|██▋       | 160/587 [02:21<06:18,  1.13it/s, loss=1.12, v_num=.]
Epoch 0:  31%|███       | 180/587 [02:40<06:02,  1.12it/s, loss=1.12, v_num=.]
Epoch 0:  34%|███▍      | 200/587 [02:58<05:45,  1.12it/s, loss=1.11, v_num=.]
Epoch 0:  37%|███▋      | 220/587 [03:17<05:28,  1.12it/s, loss=1.12, v_num=.]
Epoch 0:  41%|████      | 240/587 [03:35<05:11,  1.11it/s, loss=1.12, v_num=.]
Epoch 0:  44%|████▍     | 260/587 [03:53<04:53,  1.11it/

[2m[36m(pid=1088)[0m   "The signature of `Callback.on_train_epoch_end` has changed in v1.3."


[2m[36m(pid=1088)[0m Epoch 0:  82%|████████▏ | 480/587 [07:05<01:34,  1.13it/s, loss=1.02, v_num=.]
[2m[36m(pid=1088)[0m Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/118 [00:00<?, ?it/s][A
[2m[36m(pid=1088)[0m 
Epoch 0:  85%|████████▌ | 500/587 [07:18<01:16,  1.14it/s, loss=1.02, v_num=.]
[2m[36m(pid=1088)[0m 
Epoch 0:  89%|████████▊ | 520/587 [07:36<00:58,  1.14it/s, loss=1.02, v_num=.]
[2m[36m(pid=1088)[0m 
Epoch 0:  92%|█████████▏| 540/587 [07:54<00:41,  1.14it/s, loss=1.02, v_num=.]
[2m[36m(pid=1088)[0m 
Epoch 0:  95%|█████████▌| 560/587 [08:12<00:23,  1.14it/s, loss=1.02, v_num=.]
[2m[36m(pid=1088)[0m 
Epoch 0:  99%|█████████▉| 580/587 [08:30<00:06,  1.14it/s, loss=1.02, v_num=.]
[2m[36m(pid=1088)[0m 
Epoch 0: 100%|██████████| 587/587 [08:46<00:00,  1.12it/s, loss=1.02, v_num=.]


[2m[36m(pid=1088)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_LensResnet_tune_checkpoint_83150_00002:
  auroc: 0.6530711054801941
  date: 2021-07-08_05-42-45
  done: false
  experiment_id: caca53553b4b411d896dfef803623f9a
  hostname: 0b93bd12a956
  iterations_since_restore: 1
  loss: 3.1327438354492188
  node_ip: 172.28.0.2
  pid: 1088
  should_checkpoint: true
  time_since_restore: 543.5132093429565
  time_this_iter_s: 543.5132093429565
  time_total_s: 543.5132093429565
  timestamp: 1625722965
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: '83150_00002'
  
== Status ==
Memory usage on this node: 4.2/12.7 GiB
Using AsyncHyperBand: num_stopped=2
Bracket: Iter 2.000: 0.6313356757164001 | Iter 1.000: 0.5439504981040955
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.48 GiB heap, 0.0/3.74 GiB objects (0.0/1.0 GPU_group_0_7647334c6aa32c88cd055440d9a148be, 0.0/2.0 CPU_group_7647334c6aa32c88cd055440d9a148be, 0.0/2.0 CPU_group_0_7647334c6aa32c88cd055440d9a148be, 0.0/1.0 GPU_group_7647334c6aa32c88cd055440d9a148be, 0.0

[2m[36m(pid=1088)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_LensResnet_tune_checkpoint_83150_00002:
  auroc: 0.8292022347450256
  date: 2021-07-08_05-51-38
  done: false
  experiment_id: caca53553b4b411d896dfef803623f9a
  hostname: 0b93bd12a956
  iterations_since_restore: 2
  loss: 0.904776930809021
  node_ip: 172.28.0.2
  pid: 1088
  should_checkpoint: true
  time_since_restore: 1076.3702998161316
  time_this_iter_s: 532.857090473175
  time_total_s: 1076.3702998161316
  timestamp: 1625723498
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: '83150_00002'
  
== Status ==
Memory usage on this node: 4.2/12.7 GiB
Using AsyncHyperBand: num_stopped=2
Bracket: Iter 2.000: 0.7151361107826233 | Iter 1.000: 0.5439504981040955
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.48 GiB heap, 0.0/3.74 GiB objects (0.0/2.0 CPU_group_7647334c6aa32c88cd055440d9a148be, 0.0/1.0 GPU_group_0_7647334c6aa32c88cd055440d9a148be, 0.0/1.0 accelerator_type:T4, 0.0/1.0 GPU_group_7647334c6aa32c88cd055440d9a148be, 0.0/2.0 CPU_group_0_7647334c

[2m[36m(pid=1088)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_LensResnet_tune_checkpoint_83150_00002:
  auroc: 0.8051936030387878
  date: 2021-07-08_06-00-27
  done: true
  experiment_id: caca53553b4b411d896dfef803623f9a
  hostname: 0b93bd12a956
  iterations_since_restore: 3
  loss: 2.065347671508789
  node_ip: 172.28.0.2
  pid: 1088
  should_checkpoint: true
  time_since_restore: 1605.9440941810608
  time_this_iter_s: 529.5737943649292
  time_total_s: 1605.9440941810608
  timestamp: 1625724027
  timesteps_since_restore: 0
  training_iteration: 3
  trial_id: '83150_00002'
  
== Status ==
Memory usage on this node: 4.2/12.7 GiB
Using AsyncHyperBand: num_stopped=3
Bracket: Iter 2.000: 0.7151361107826233 | Iter 1.000: 0.5439504981040955
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.48 GiB heap, 0.0/3.74 GiB objects (0.0/1.0 GPU_group_0_7647334c6aa32c88cd055440d9a148be, 0.0/1.0 GPU_group_7647334c6aa32c88cd055440d9a148be, 0.0/1.0 accelerator_type:T4, 0.0/2.0 CPU_group_0_7647334c6aa32c88cd055440d9a148be, 0.0/2.0 CPU_group_7647334c

[2m[36m(pid=1340)[0m GPU available: True, used: True
[2m[36m(pid=1340)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=1340)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2m[36m(pid=1340)[0m 2021-07-08 06:00:48.020660: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
[2m[36m(pid=1340)[0m 
[2m[36m(pid=1340)[0m   | Name     | Type   | Params
[2m[36m(pid=1340)[0m ------------------------------------
[2m[36m(pid=1340)[0m 0 | backbone | ResNet | 11.2 M
[2m[36m(pid=1340)[0m ------------------------------------
[2m[36m(pid=1340)[0m 11.2 M    Trainable params
[2m[36m(pid=1340)[0m 0         Non-trainable params
[2m[36m(pid=1340)[0m 11.2 M    Total params
[2m[36m(pid=1340)[0m 44.687    Total estimated model params size (MB)
[2m[36m(pid=1340)[0m   f'Your {mode}_dataloader has `shuffle=True`, it is best practice to turn'


[2m[36m(pid=1340)[0m Training: 0it [00:00, ?it/s]Training:   0%|          | 0/2344 [00:00<?, ?it/s]Epoch 0:   0%|          | 0/2344 [00:00<?, ?it/s] 


[2m[36m(pid=1340)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


Epoch 0:   1%|          | 20/2344 [00:04<08:02,  4.81it/s, loss=1.7, v_num=.]
Epoch 0:   2%|▏         | 40/2344 [00:07<07:37,  5.04it/s, loss=1.27, v_num=.]
Epoch 0:   3%|▎         | 60/2344 [00:11<07:26,  5.12it/s, loss=1.25, v_num=.]
Epoch 0:   3%|▎         | 80/2344 [00:15<07:22,  5.12it/s, loss=1.2, v_num=.] 
Epoch 0:   4%|▍         | 100/2344 [00:19<07:18,  5.12it/s, loss=1.18, v_num=.]
Epoch 0:   5%|▌         | 120/2344 [00:23<07:16,  5.09it/s, loss=1.18, v_num=.]
Epoch 0:   5%|▌         | 120/2344 [00:23<07:16,  5.09it/s, loss=1.22, v_num=.]
Epoch 0:   6%|▌         | 140/2344 [00:27<07:13,  5.08it/s, loss=1.19, v_num=.]
Epoch 0:   7%|▋         | 160/2344 [00:31<07:10,  5.07it/s, loss=1.15, v_num=.]
Epoch 0:   8%|▊         | 180/2344 [00:35<07:08,  5.05it/s, loss=1.13, v_num=.]
Epoch 0:   9%|▊         | 200/2344 [00:39<07:05,  5.04it/s, loss=1.14, v_num=.]
Epoch 0:   9%|▉         | 220/2344 [00:43<07:01,  5.04it/s, loss=1.17, v_num=.]
Epoch 0:  10%|█         | 240/2344 [00:47<06:

[2m[36m(pid=1340)[0m   "The signature of `Callback.on_train_epoch_end` has changed in v1.3."


[2m[36m(pid=1340)[0m Epoch 0:  80%|████████  | 1880/2344 [07:00<01:43,  4.47it/s, loss=1.1, v_num=.]
[2m[36m(pid=1340)[0m Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/469 [00:00<?, ?it/s][A
[2m[36m(pid=1340)[0m 
Epoch 0:  81%|████████  | 1900/2344 [07:04<01:39,  4.47it/s, loss=1.1, v_num=.]
[2m[36m(pid=1340)[0m 
Epoch 0:  82%|████████▏ | 1920/2344 [07:09<01:34,  4.47it/s, loss=1.1, v_num=.]
[2m[36m(pid=1340)[0m 
Epoch 0:  83%|████████▎ | 1940/2344 [07:13<01:30,  4.47it/s, loss=1.1, v_num=.]
[2m[36m(pid=1340)[0m 
Epoch 0:  84%|████████▎ | 1960/2344 [07:18<01:25,  4.47it/s, loss=1.1, v_num=.]
[2m[36m(pid=1340)[0m 
Epoch 0:  84%|████████▍ | 1980/2344 [07:22<01:21,  4.47it/s, loss=1.1, v_num=.]
[2m[36m(pid=1340)[0m 
Epoch 0:  85%|████████▌ | 2000/2344 [07:27<01:16,  4.47it/s, loss=1.1, v_num=.]
[2m[36m(pid=1340)[0m 
Epoch 0:  86%|████████▌ | 2020/2344 [07:31<01:12,  4.47it/s, loss=1.1, v_num=.]
[2m[36m(pid=1340)[0m 
Epoch 0:  87%|███████

[2m[36m(pid=1340)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_LensResnet_tune_checkpoint_83150_00003:
  auroc: 0.4955345690250397
  date: 2021-07-08_06-09-38
  done: true
  experiment_id: e7850442bd644aaeb5aa062274aa8010
  hostname: 0b93bd12a956
  iterations_since_restore: 1
  loss: 1.1013730764389038
  node_ip: 172.28.0.2
  pid: 1340
  should_checkpoint: true
  time_since_restore: 539.6207184791565
  time_this_iter_s: 539.6207184791565
  time_total_s: 539.6207184791565
  timestamp: 1625724578
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: '83150_00003'
  
== Status ==
Memory usage on this node: 4.3/12.7 GiB
Using AsyncHyperBand: num_stopped=4
Bracket: Iter 2.000: 0.7151361107826233 | Iter 1.000: 0.5239685475826263
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.48 GiB heap, 0.0/3.74 GiB objects (0.0/2.0 CPU_group_7647334c6aa32c88cd055440d9a148be, 0.0/2.0 CPU_group_0_7647334c6aa32c88cd055440d9a148be, 0.0/1.0 accelerator_type:T4, 0.0/1.0 GPU_group_7647334c6aa32c88cd055440d9a148be, 0.0/1.0 GPU_group_0_7647334c6

[2m[36m(pid=1357)[0m GPU available: True, used: True
[2m[36m(pid=1357)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=1357)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2m[36m(pid=1357)[0m 2021-07-08 06:09:53.000776: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
[2m[36m(pid=1357)[0m 
[2m[36m(pid=1357)[0m   | Name     | Type   | Params
[2m[36m(pid=1357)[0m ------------------------------------
[2m[36m(pid=1357)[0m 0 | backbone | ResNet | 11.2 M
[2m[36m(pid=1357)[0m ------------------------------------
[2m[36m(pid=1357)[0m 11.2 M    Trainable params
[2m[36m(pid=1357)[0m 0         Non-trainable params
[2m[36m(pid=1357)[0m 11.2 M    Total params
[2m[36m(pid=1357)[0m 44.687    Total estimated model params size (MB)
[2m[36m(pid=1357)[0m   f'Your {mode}_dataloader has `shuffle=True`, it is best practice to turn'


[2m[36m(pid=1357)[0m Training: 0it [00:00, ?it/s]Training:   0%|          | 0/587 [00:00<?, ?it/s]Epoch 0:   0%|          | 0/587 [00:00<?, ?it/s] 


[2m[36m(pid=1357)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


Epoch 0:   3%|▎         | 20/587 [00:16<07:39,  1.23it/s, loss=1.21, v_num=.]
Epoch 0:   7%|▋         | 40/587 [00:32<07:24,  1.23it/s, loss=1.18, v_num=.]
Epoch 0:  10%|█         | 60/587 [00:49<07:11,  1.22it/s, loss=1.16, v_num=.]
Epoch 0:  14%|█▎        | 80/587 [01:06<06:59,  1.21it/s, loss=1.17, v_num=.]
Epoch 0:  17%|█▋        | 100/587 [01:24<06:49,  1.19it/s, loss=1.16, v_num=.]
Epoch 0:  20%|██        | 120/587 [01:42<06:37,  1.18it/s, loss=1.16, v_num=.]
Epoch 0:  24%|██▍       | 140/587 [02:00<06:23,  1.16it/s, loss=1.16, v_num=.]
Epoch 0:  27%|██▋       | 160/587 [02:18<06:09,  1.16it/s, loss=1.16, v_num=.]
Epoch 0:  31%|███       | 180/587 [02:36<05:54,  1.15it/s, loss=1.16, v_num=.]
Epoch 0:  31%|███       | 180/587 [02:36<05:54,  1.15it/s, loss=1.15, v_num=.]
Epoch 0:  34%|███▍      | 200/587 [02:55<05:39,  1.14it/s, loss=1.16, v_num=.]
Epoch 0:  37%|███▋      | 220/587 [03:13<05:22,  1.14it/s, loss=1.14, v_num=.]
Epoch 0:  41%|████      | 240/587 [03:31<05:06,  1.13it/

[2m[36m(pid=1357)[0m   "The signature of `Callback.on_train_epoch_end` has changed in v1.3."


[2m[36m(pid=1357)[0m Epoch 0:  82%|████████▏ | 480/587 [06:57<01:33,  1.15it/s, loss=1.13, v_num=.]
[2m[36m(pid=1357)[0m Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/118 [00:00<?, ?it/s][A
Epoch 0:  82%|████████▏ | 480/587 [07:10<01:35,  1.12it/s, loss=1.13, v_num=.]
[2m[36m(pid=1357)[0m 
Epoch 0:  85%|████████▌ | 500/587 [07:14<01:15,  1.15it/s, loss=1.13, v_num=.]
[2m[36m(pid=1357)[0m 
Epoch 0:  89%|████████▊ | 520/587 [07:31<00:58,  1.15it/s, loss=1.13, v_num=.]
[2m[36m(pid=1357)[0m 
Epoch 0:  92%|█████████▏| 540/587 [07:49<00:40,  1.15it/s, loss=1.13, v_num=.]
[2m[36m(pid=1357)[0m 
Epoch 0:  95%|█████████▌| 560/587 [08:07<00:23,  1.15it/s, loss=1.13, v_num=.]
[2m[36m(pid=1357)[0m 
Epoch 0:  99%|█████████▉| 580/587 [08:25<00:06,  1.15it/s, loss=1.13, v_num=.]
[2m[36m(pid=1357)[0m 
Epoch 0: 100%|██████████| 587/587 [08:40<00:00,  1.13it/s, loss=1.13, v_num=.]


[2m[36m(pid=1357)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_LensResnet_tune_checkpoint_83150_00004:
  auroc: 0.5865452885627747
  date: 2021-07-08_06-18-39
  done: false
  experiment_id: 478ced1d31b442d68df9fefb9ffdca01
  hostname: 0b93bd12a956
  iterations_since_restore: 1
  loss: 1.079414963722229
  node_ip: 172.28.0.2
  pid: 1357
  should_checkpoint: true
  time_since_restore: 535.4767372608185
  time_this_iter_s: 535.4767372608185
  time_total_s: 535.4767372608185
  timestamp: 1625725119
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: '83150_00004'
  
== Status ==
Memory usage on this node: 4.2/12.7 GiB
Using AsyncHyperBand: num_stopped=4
Bracket: Iter 2.000: 0.7151361107826233 | Iter 1.000: 0.5439504981040955
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.48 GiB heap, 0.0/3.74 GiB objects (0.0/1.0 accelerator_type:T4, 0.0/2.0 CPU_group_7647334c6aa32c88cd055440d9a148be, 0.0/1.0 GPU_group_0_7647334c6aa32c88cd055440d9a148be, 0.0/1.0 GPU_group_7647334c6aa32c88cd055440d9a148be, 0.0/2.0 CPU_group_0_7647334c6

[2m[36m(pid=1357)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_LensResnet_tune_checkpoint_83150_00004:
  auroc: 0.6422694325447083
  date: 2021-07-08_06-27-32
  done: true
  experiment_id: 478ced1d31b442d68df9fefb9ffdca01
  hostname: 0b93bd12a956
  iterations_since_restore: 2
  loss: 1.0285533666610718
  node_ip: 172.28.0.2
  pid: 1357
  should_checkpoint: true
  time_since_restore: 1068.1362552642822
  time_this_iter_s: 532.6595180034637
  time_total_s: 1068.1362552642822
  timestamp: 1625725652
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: '83150_00004'
  
== Status ==
Memory usage on this node: 4.2/12.7 GiB
Using AsyncHyperBand: num_stopped=5
Bracket: Iter 2.000: 0.6787027716636658 | Iter 1.000: 0.5439504981040955
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.48 GiB heap, 0.0/3.74 GiB objects (0.0/2.0 CPU_group_7647334c6aa32c88cd055440d9a148be, 0.0/1.0 GPU_group_7647334c6aa32c88cd055440d9a148be, 0.0/1.0 GPU_group_0_7647334c6aa32c88cd055440d9a148be, 0.0/2.0 CPU_group_0_7647334c6aa32c88cd055440d9a148be, 0.

[2m[36m(pid=1549)[0m GPU available: True, used: True
[2m[36m(pid=1549)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=1549)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2m[36m(pid=1549)[0m 2021-07-08 06:27:50.999165: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
[2m[36m(pid=1549)[0m 
[2m[36m(pid=1549)[0m   | Name     | Type   | Params
[2m[36m(pid=1549)[0m ------------------------------------
[2m[36m(pid=1549)[0m 0 | backbone | ResNet | 11.2 M
[2m[36m(pid=1549)[0m ------------------------------------
[2m[36m(pid=1549)[0m 11.2 M    Trainable params
[2m[36m(pid=1549)[0m 0         Non-trainable params
[2m[36m(pid=1549)[0m 11.2 M    Total params
[2m[36m(pid=1549)[0m 44.687    Total estimated model params size (MB)
[2m[36m(pid=1549)[0m   f'Your {mode}_dataloader has `shuffle=True`, it is best practice to turn'


[2m[36m(pid=1549)[0m Training: 0it [00:00, ?it/s]Training:   0%|          | 0/1173 [00:00<?, ?it/s]Epoch 0:   0%|          | 0/1173 [00:00<?, ?it/s] 


[2m[36m(pid=1549)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


Epoch 0:   2%|▏         | 20/1173 [00:08<07:49,  2.46it/s, loss=1.17, v_num=.]
Epoch 0:   3%|▎         | 40/1173 [00:16<07:37,  2.47it/s, loss=1.18, v_num=.]
Epoch 0:   5%|▌         | 60/1173 [00:24<07:28,  2.48it/s, loss=1.19, v_num=.]
Epoch 0:   7%|▋         | 80/1173 [00:32<07:19,  2.49it/s, loss=1.16, v_num=.]
Epoch 0:   9%|▊         | 100/1173 [00:40<07:13,  2.48it/s, loss=1.15, v_num=.]
Epoch 0:  10%|█         | 120/1173 [00:48<07:09,  2.45it/s, loss=1.15, v_num=.]
Epoch 0:  12%|█▏        | 140/1173 [00:57<07:04,  2.43it/s, loss=1.14, v_num=.]
Epoch 0:  14%|█▎        | 160/1173 [01:06<07:01,  2.40it/s, loss=1.14, v_num=.]
Epoch 0:  15%|█▌        | 180/1173 [01:15<06:58,  2.38it/s, loss=1.14, v_num=.]
Epoch 0:  17%|█▋        | 200/1173 [01:25<06:54,  2.35it/s, loss=1.16, v_num=.]
Epoch 0:  19%|█▉        | 220/1173 [01:34<06:50,  2.32it/s, loss=1.13, v_num=.]
Epoch 0:  20%|██        | 240/1173 [01:44<06:45,  2.30it/s, loss=1.15, v_num=.]
Epoch 0:  22%|██▏       | 260/1173 [01:53<06

[2m[36m(pid=1549)[0m   "The signature of `Callback.on_train_epoch_end` has changed in v1.3."


[2m[36m(pid=1549)[0m Epoch 0:  80%|████████  | 940/1173 [07:05<01:45,  2.21it/s, loss=0.999, v_num=.]
[2m[36m(pid=1549)[0m Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/235 [00:00<?, ?it/s][A
[2m[36m(pid=1549)[0m 
Epoch 0:  82%|████████▏ | 960/1173 [07:11<01:35,  2.22it/s, loss=0.999, v_num=.]
[2m[36m(pid=1549)[0m 
Epoch 0:  84%|████████▎ | 980/1173 [07:18<01:26,  2.24it/s, loss=0.999, v_num=.]
[2m[36m(pid=1549)[0m 
Epoch 0:  85%|████████▌ | 1000/1173 [07:26<01:17,  2.24it/s, loss=0.999, v_num=.]
[2m[36m(pid=1549)[0m 
Epoch 0:  87%|████████▋ | 1020/1173 [07:35<01:08,  2.24it/s, loss=0.999, v_num=.]
[2m[36m(pid=1549)[0m 
Epoch 0:  89%|████████▊ | 1040/1173 [07:44<00:59,  2.24it/s, loss=0.999, v_num=.]
[2m[36m(pid=1549)[0m 
Epoch 0:  90%|█████████ | 1060/1173 [07:53<00:50,  2.24it/s, loss=0.999, v_num=.]
[2m[36m(pid=1549)[0m 
Epoch 0:  92%|█████████▏| 1080/1173 [08:01<00:41,  2.24it/s, loss=0.999, v_num=.]
[2m[36m(pid=1549)[0m 
Epoch 0:

[2m[36m(pid=1549)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_LensResnet_tune_checkpoint_83150_00005:
  auroc: 0.685188353061676
  date: 2021-07-08_06-36-41
  done: false
  experiment_id: 6b7b8f2f646b4f9eac9c418edeba319d
  hostname: 0b93bd12a956
  iterations_since_restore: 1
  loss: 0.9885507822036743
  node_ip: 172.28.0.2
  pid: 1549
  should_checkpoint: true
  time_since_restore: 538.3944439888
  time_this_iter_s: 538.3944439888
  time_total_s: 538.3944439888
  timestamp: 1625726201
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: '83150_00005'
  
== Status ==
Memory usage on this node: 4.2/12.7 GiB
Using AsyncHyperBand: num_stopped=5
Bracket: Iter 2.000: 0.6787027716636658 | Iter 1.000: 0.5652478933334351
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.48 GiB heap, 0.0/3.74 GiB objects (0.0/2.0 CPU_group_0_7647334c6aa32c88cd055440d9a148be, 0.0/1.0 GPU_group_7647334c6aa32c88cd055440d9a148be, 0.0/1.0 GPU_group_0_7647334c6aa32c88cd055440d9a148be, 0.0/2.0 CPU_group_7647334c6aa32c88cd055440d9a148be, 0.0/1.0 accel

[2m[36m(pid=1549)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_LensResnet_tune_checkpoint_83150_00005:
  auroc: 0.6825814247131348
  date: 2021-07-08_06-45-30
  done: false
  experiment_id: 6b7b8f2f646b4f9eac9c418edeba319d
  hostname: 0b93bd12a956
  iterations_since_restore: 2
  loss: 1.0902965068817139
  node_ip: 172.28.0.2
  pid: 1549
  should_checkpoint: true
  time_since_restore: 1067.6672236919403
  time_this_iter_s: 529.2727797031403
  time_total_s: 1067.6672236919403
  timestamp: 1625726730
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: '83150_00005'
  
== Status ==
Memory usage on this node: 4.2/12.7 GiB
Using AsyncHyperBand: num_stopped=5
Bracket: Iter 2.000: 0.6825814247131348 | Iter 1.000: 0.5652478933334351
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.48 GiB heap, 0.0/3.74 GiB objects (0.0/1.0 GPU_group_7647334c6aa32c88cd055440d9a148be, 0.0/1.0 accelerator_type:T4, 0.0/2.0 CPU_group_7647334c6aa32c88cd055440d9a148be, 0.0/2.0 CPU_group_0_7647334c6aa32c88cd055440d9a148be, 0.0/1.0 GPU_group_0_764733

[2m[36m(pid=1549)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_LensResnet_tune_checkpoint_83150_00005:
  auroc: 0.8451821208000183
  date: 2021-07-08_06-54-18
  done: true
  experiment_id: 6b7b8f2f646b4f9eac9c418edeba319d
  hostname: 0b93bd12a956
  iterations_since_restore: 3
  loss: 0.8032200336456299
  node_ip: 172.28.0.2
  pid: 1549
  should_checkpoint: true
  time_since_restore: 1596.2367482185364
  time_this_iter_s: 528.5695245265961
  time_total_s: 1596.2367482185364
  timestamp: 1625727258
  timesteps_since_restore: 0
  training_iteration: 3
  trial_id: '83150_00005'
  
== Status ==
Memory usage on this node: 4.3/12.7 GiB
Using AsyncHyperBand: num_stopped=6
Bracket: Iter 2.000: 0.6825814247131348 | Iter 1.000: 0.5652478933334351
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.48 GiB heap, 0.0/3.74 GiB objects (0.0/1.0 GPU_group_7647334c6aa32c88cd055440d9a148be, 0.0/2.0 CPU_group_7647334c6aa32c88cd055440d9a148be, 0.0/1.0 accelerator_type:T4, 0.0/1.0 GPU_group_0_7647334c6aa32c88cd055440d9a148be, 0.0/2.0 CPU_group_0_7647334

[2m[36m(pid=1811)[0m GPU available: True, used: True
[2m[36m(pid=1811)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=1811)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2m[36m(pid=1811)[0m 2021-07-08 06:54:39.209082: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
[2m[36m(pid=1811)[0m 
[2m[36m(pid=1811)[0m   | Name     | Type   | Params
[2m[36m(pid=1811)[0m ------------------------------------
[2m[36m(pid=1811)[0m 0 | backbone | ResNet | 11.2 M
[2m[36m(pid=1811)[0m ------------------------------------
[2m[36m(pid=1811)[0m 11.2 M    Trainable params
[2m[36m(pid=1811)[0m 0         Non-trainable params
[2m[36m(pid=1811)[0m 11.2 M    Total params
[2m[36m(pid=1811)[0m 44.687    Total estimated model params size (MB)
[2m[36m(pid=1811)[0m   f'Your {mode}_dataloader has `shuffle=True`, it is best practice to turn'


[2m[36m(pid=1811)[0m Training: 0it [00:00, ?it/s]Training:   0%|          | 0/2344 [00:00<?, ?it/s]Epoch 0:   0%|          | 0/2344 [00:00<?, ?it/s] 


[2m[36m(pid=1811)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


Epoch 0:   1%|          | 20/2344 [00:04<08:04,  4.79it/s, loss=1.21, v_num=.]
Epoch 0:   2%|▏         | 40/2344 [00:08<07:51,  4.89it/s, loss=1.2, v_num=.] 
Epoch 0:   3%|▎         | 60/2344 [00:11<07:36,  5.01it/s, loss=1.18, v_num=.]
Epoch 0:   3%|▎         | 80/2344 [00:15<07:26,  5.07it/s, loss=1.19, v_num=.]
Epoch 0:   4%|▍         | 100/2344 [00:19<07:23,  5.06it/s, loss=1.21, v_num=.]
Epoch 0:   5%|▌         | 120/2344 [00:23<07:20,  5.05it/s, loss=1.19, v_num=.]
Epoch 0:   6%|▌         | 140/2344 [00:27<07:17,  5.04it/s, loss=1.17, v_num=.]
Epoch 0:   7%|▋         | 160/2344 [00:31<07:13,  5.04it/s, loss=1.17, v_num=.]
Epoch 0:   8%|▊         | 180/2344 [00:35<07:12,  5.01it/s, loss=1.15, v_num=.]
Epoch 0:   9%|▊         | 200/2344 [00:39<07:08,  5.01it/s, loss=1.16, v_num=.]
Epoch 0:   9%|▉         | 220/2344 [00:43<07:04,  5.01it/s, loss=1.15, v_num=.]
Epoch 0:  10%|█         | 240/2344 [00:47<07:00,  5.01it/s, loss=1.16, v_num=.]
Epoch 0:  11%|█         | 260/2344 [00:51<06

[2m[36m(pid=1811)[0m   "The signature of `Callback.on_train_epoch_end` has changed in v1.3."


[2m[36m(pid=1811)[0m Epoch 0:  80%|████████  | 1880/2344 [07:01<01:44,  4.46it/s, loss=1.01, v_num=.]
[2m[36m(pid=1811)[0m Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/469 [00:00<?, ?it/s][A
[2m[36m(pid=1811)[0m 
Epoch 0:  81%|████████  | 1900/2344 [07:05<01:39,  4.47it/s, loss=1.01, v_num=.]
[2m[36m(pid=1811)[0m 
Epoch 0:  82%|████████▏ | 1920/2344 [07:10<01:34,  4.46it/s, loss=1.01, v_num=.]
[2m[36m(pid=1811)[0m 
Epoch 0:  83%|████████▎ | 1940/2344 [07:14<01:30,  4.47it/s, loss=1.01, v_num=.]
[2m[36m(pid=1811)[0m 
Epoch 0:  84%|████████▎ | 1960/2344 [07:18<01:26,  4.47it/s, loss=1.01, v_num=.]
[2m[36m(pid=1811)[0m 
Epoch 0:  84%|████████▍ | 1980/2344 [07:23<01:21,  4.47it/s, loss=1.01, v_num=.]
[2m[36m(pid=1811)[0m 
Epoch 0:  85%|████████▌ | 2000/2344 [07:27<01:17,  4.47it/s, loss=1.01, v_num=.]
[2m[36m(pid=1811)[0m 
Epoch 0:  86%|████████▌ | 2020/2344 [07:32<01:12,  4.47it/s, loss=1.01, v_num=.]
[2m[36m(pid=1811)[0m 
Epoch 0:  87%

[2m[36m(pid=1811)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_LensResnet_tune_checkpoint_83150_00006:
  auroc: 0.6666943430900574
  date: 2021-07-08_07-03-31
  done: false
  experiment_id: a314a9233c754abab55f490af4e6b86d
  hostname: 0b93bd12a956
  iterations_since_restore: 1
  loss: 1.012158989906311
  node_ip: 172.28.0.2
  pid: 1811
  should_checkpoint: true
  time_since_restore: 542.5436894893646
  time_this_iter_s: 542.5436894893646
  time_total_s: 542.5436894893646
  timestamp: 1625727811
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: '83150_00006'
  
== Status ==
Memory usage on this node: 4.3/12.7 GiB
Using AsyncHyperBand: num_stopped=6
Bracket: Iter 2.000: 0.6825814247131348 | Iter 1.000: 0.5865452885627747
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.48 GiB heap, 0.0/3.74 GiB objects (0.0/2.0 CPU_group_7647334c6aa32c88cd055440d9a148be, 0.0/1.0 GPU_group_0_7647334c6aa32c88cd055440d9a148be, 0.0/1.0 accelerator_type:T4, 0.0/1.0 GPU_group_7647334c6aa32c88cd055440d9a148be, 0.0/2.0 CPU_group_0_7647334c6

[2m[36m(pid=1811)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_LensResnet_tune_checkpoint_83150_00006:
  auroc: 0.8242172598838806
  date: 2021-07-08_07-12-21
  done: false
  experiment_id: a314a9233c754abab55f490af4e6b86d
  hostname: 0b93bd12a956
  iterations_since_restore: 2
  loss: 0.8677229285240173
  node_ip: 172.28.0.2
  pid: 1811
  should_checkpoint: true
  time_since_restore: 1073.1987454891205
  time_this_iter_s: 530.6550559997559
  time_total_s: 1073.1987454891205
  timestamp: 1625728341
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: '83150_00006'
  
== Status ==
Memory usage on this node: 4.3/12.7 GiB
Using AsyncHyperBand: num_stopped=6
Bracket: Iter 2.000: 0.698858767747879 | Iter 1.000: 0.5865452885627747
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.48 GiB heap, 0.0/3.74 GiB objects (0.0/1.0 GPU_group_0_7647334c6aa32c88cd055440d9a148be, 0.0/2.0 CPU_group_7647334c6aa32c88cd055440d9a148be, 0.0/2.0 CPU_group_0_7647334c6aa32c88cd055440d9a148be, 0.0/1.0 GPU_group_7647334c6aa32c88cd055440d9a148be, 0.

In [None]:
# Pack the LensResNet_J log directory into /content/folder.zip.
# Same purpose as a manual ZipFile + os.walk loop, except that make_archive keeps
# the sub-directory layout instead of storing every file under its base name.
shutil.make_archive(
    base_name='/content/folder',
    format='zip',
    root_dir='drive/MyDrive/Logs/LensResNet_J',
)

'/content/folder.zip'

In [None]:
!tensorboard dev delete --experiment_id vbghQrP0RWWfyBY3KRip0g
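# Presumably the arguments of the `tensorboard dev upload` variant of the command
# above, kept for reference (the description flag was already disabled):
#   upload --logdir ./drive/MyDrive/Logs/ \
#   --name "My experiments" \
#   # --description "Lightning_StackGAN"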

2021-07-07 22:48:05.538117: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
Data for the "text" plugin is now uploaded to TensorBoard.dev! Note that uploaded data is public. If you do not want to upload data for this plugin, use the "--plugins" command line argument.
Cannot delete experiment vbghQrP0RWWfyBY3KRip0g because it is owned by a different user.


# StackGAN:
Here we define the GAN module that we shall use to generate representative images.

In [None]:
class Generator2(nn.Module):
    """Image-to-image generator: downsample, residual trunk, then upsample.

    The network (used as ``G2`` by ``StackGAN``) takes an image tensor with
    ``image_channels`` channels and:

    1. halves its spatial size with a strided convolution (``preprocessing``),
    2. passes it through a stack of ``BasicBlock`` residual blocks followed by
       a conv/batch-norm/ReLU (``residual`` + ``ending_residual``),
    3. adds the pre-processed features back in (long skip connection),
    4. upsamples in two nearest-neighbour steps to ``image_width // 2`` and
       then ``image_width`` pixels, finishing with ``Tanh`` (``main``).

    Args:
        image_channels: Number of channels of both the input and output image.
        ngf: Number of feature maps used in the residual trunk.
        ker: Kernel size of the pre-processing (downsampling) convolution.
        strd: Stride of the pre-processing convolution.
        res_ker: Kernel size of every convolution after the pre-processing.
        res_strd: Stride of those convolutions.
        res_pad: Padding of those convolutions.
        image_width: Target side length (in pixels) of the generated square
            image. Defaults to 150, the value that used to be hard-coded.
        num_res_blocks: Number of ``BasicBlock`` modules in the residual
            trunk. Defaults to 6, the original depth.

    Raises:
        ValueError: If ``image_width`` is smaller than 2 or ``num_res_blocks``
            is negative.
    """

    def __init__(self, image_channels: int = 1, ngf: int = 128,
                 ker: int = 4, strd: int = 2,
                 res_ker: int = 3, res_strd: int = 1, res_pad: int = 1,
                 image_width: int = 150, num_res_blocks: int = 6):
        super().__init__()

        if image_width < 2:
            raise ValueError(f"image_width must be >= 2, got {image_width}")
        if num_res_blocks < 0:
            raise ValueError(f"num_res_blocks must be >= 0, got {num_res_blocks}")

        # Padding chosen so that, with the default ker=4 / strd=2, the
        # convolution exactly halves the spatial size (e.g. 64 -> 32).
        pad = int((ker - 2)/2)
        self.preprocessing = nn.Sequential(
            nn.Conv2d(image_channels, ngf, ker, strd, pad, bias=False),
            nn.ReLU(True)
        )

        # Residual trunk: `num_res_blocks` identical blocks, created in order
        # so parameter names (residual.0, residual.1, ...) stay unchanged.
        self.residual = nn.Sequential(
            *[BasicBlock(ngf, ngf) for _ in range(num_res_blocks)]
        )
        self.ending_residual = nn.Sequential(
            nn.Conv2d(ngf, ngf, res_ker, res_strd, res_pad, bias=False),
            nn.BatchNorm2d(ngf),
            nn.ReLU(True)
        )

        # The output of `ending_residual` is summed with the output of
        # `preprocessing` in forward() before entering `main`.

        mode = 'nearest'  # upscaling method is nearest-neighbour
        self.main = nn.Sequential(
            # resize to half of the target width (75 for the default 150)
            nn.Upsample(image_width//2, mode=mode),
            nn.Conv2d(ngf, ngf*4, res_ker, res_strd, res_pad, bias=False),
            nn.BatchNorm2d(ngf*4),
            nn.ReLU(True),
            # resize to the full target width (150 by default)
            nn.Upsample(image_width, mode=mode),
            nn.Conv2d(ngf*4, image_channels, res_ker, res_strd, res_pad, bias=False),
            nn.Tanh()
        )

    def forward(self, in_x):
        """Generate an image from ``in_x``.

        Args:
            in_x: Input tensor whose channel dimension equals
                ``image_channels``.

        Returns:
            The output of ``main``: a tensor with ``image_channels`` channels
            and values in [-1, 1] (``Tanh``); with the default ``res_*``
            settings its spatial size is ``image_width`` x ``image_width``.
        """
        x_p = self.preprocessing(in_x)
        x_r = self.residual(x_p)
        x_r = self.ending_residual(x_r)
        # large residual connection around the whole residual trunk
        x_f = x_r + x_p
        return self.main(x_f)

In [None]:
class StackGAN(pl.LightningModule):
    """Two-stage, label-conditioned GAN trained together with an auxiliary classifier.

    Sub-modules built in ``__init__``:
      * ``G1`` - ``DCGANGenerator`` whose first block has its element at index 1
        removed (presumably the normalisation layer - TODO confirm against
        pl_bolts) and which gets an extra ``label_emb`` embedding.
      * ``D1`` - ``DCGANDiscriminator`` judging ``G1`` images.
      * ``G2`` - ``Generator2`` refining (detached) ``G1`` images.
      * ``D2`` - ``DCGANDiscriminator`` with one extra block inserted at index 2.
      * ``R``  - ``LensResnet`` with ``num_classes + 1`` outputs; the extra
        class (index ``num_classes``) is the target for generated images.
      * ``pretrained`` - ``LensResnet`` restored from a checkpoint; only used in
        validation to score generated images.

    Args:
        config: dict providing ``'feature_maps'`` and ``'learning_rate'``; it is
            also forwarded to ``LensResnet``.
        noise_size: size of the noise vector / label embedding.
        image_width: size real images are resized to before being shown to ``D1``.
        num_classes: number of real classes.
        image_channels: number of image channels.
        b1: first Adam beta for all optimizers.
    """

    def __init__(self, config, noise_size: int = 100, image_width: int = 64,
                    num_classes: int = 3, image_channels: int = 1, b1: float = 0.5, **kwargs):
        super().__init__()
        # `ignore` takes argument *names*. Passing the dict itself filtered on
        # its keys ('learning_rate', ...) and therefore ignored nothing.
        self.save_hyperparameters(ignore="config")
        self.feature_maps = config['feature_maps']
        self.lr = config['learning_rate']
        # -------------------------------------
        # Need to create a subclass because we couldn't simply add/remove a layer;
        # there are two inputs of the superclass' forward method.
        self.G1 = DCGANGenerator(self.hparams.noise_size, self.feature_maps, self.hparams.image_channels).apply(self._weights_init)
        first_block = list(self.G1.gen[0])
        del first_block[1]
        self.G1.gen[0] = nn.Sequential(*first_block)
        self.G1.add_module('label_emb', nn.Embedding(self.hparams.num_classes, self.hparams.noise_size))
        # ------------------------------------
        self.D1 = DCGANDiscriminator(self.feature_maps, self.hparams.image_channels).apply(self._weights_init)
        # -------------------------------------
        self.G2 = Generator2(self.hparams.image_channels, self.feature_maps).apply(self._weights_init)
        # -------------------------------------
        self.D2 = DCGANDiscriminator(self.feature_maps, self.hparams.image_channels)
        #  steps to mutate the instance, not the class definition
        extra = self.D2._make_disc_block(self.feature_maps * 2, self.feature_maps * 2)
        disc_blocks = list(self.D2.disc)
        disc_blocks.insert(2, extra)
        self.D2.disc = nn.Sequential(*disc_blocks)
        self.D2.apply(self._weights_init)
        # No need for subclassing as the forward method need not be modified.
        # -------------------------------------
        # One output per real class plus one "generated" class; training_step
        # uses index `num_classes` as the target for fakes.
        self.R = LensResnet(config, num_classes=self.hparams.num_classes + 1).apply(self._weights_init)
        # -------------------------------------
        self.pretrained = LensResnet(config)
        # NOTE(review): hard-coded Google Drive path; consider making it configurable.
        ckpt = pl_load(os.path.join(
            '/content/drive/MyDrive/Logs/tune_LensResnet_asha_model_j/train_LensResnet_tune_checkpoint_e38cb_00000_0_batch_size=128,learning_rate=0.001_2021-07-06_17-52-11/checkpoint_epoch=17-step=1406',
            # '/content/drive/MyDrive/Logs/tune_LensResnet_asha_model_f/train_LensResnet_tune_checkpoint_e32ba_00000_0_batch_size=64,learning_rate=0.0001_2021-07-06_03-33-10/checkpoint_epoch=14-step=4689',
            "checkpoint"),
            map_location=lambda storage, loc: storage)
        # `_load_model_state` is a classmethod that builds and *returns* a new
        # model; calling it on the instance and dropping the result left
        # `self.pretrained` randomly initialised. Load the weights in place.
        self.pretrained.load_state_dict(ckpt["state_dict"])
        # -------------------------------------
        self.criterion1 = nn.BCELoss()
        self.criterion2 = nn.CrossEntropyLoss()

    @staticmethod
    def _weights_init(m):
        """Initialise conv weights ~ N(0, 0.02) and batch-norm weights ~ N(1, 0.02), bias 0.

        Modules are matched by substring of their class name, so any class whose
        name contains "Conv" or "BatchNorm" is affected; others are left as-is.
        """
        classname = m.__class__.__name__
        if "Conv" in classname:
            torch.nn.init.normal_(m.weight, 0.0, 0.02)
        elif "BatchNorm" in classname:
            torch.nn.init.normal_(m.weight, 1.0, 0.02)
            torch.nn.init.zeros_(m.bias)

    def forward(self, noise, labels = None):
        """Generate images for ``noise`` conditioned on ``labels``.

        Args:
            noise: tensor whose last dimension is the noise/hidden dimension.
            labels: integer class ids with shape ``noise.shape[:-1]``; drawn
                uniformly from ``[0, num_classes)`` when omitted.

        Returns:
            ``(out2, out1)`` - the ``G2`` output and the ``G1`` output. ``G2``
            receives ``out1.detach()``, so no gradient reaches ``G1`` through
            ``out2``.
        """
        if labels is None:
            # One random class id per noise vector (last dimension is the hidden dimension).
            labels = torch.randint(self.hparams.num_classes, noise.shape[:-1], device=noise.device)
        inp = torch.mul(noise, self.G1.label_emb(labels))
        out1 = self.G1(inp.view(-1, inp.shape[-1], 1, 1))
        out2 = self.G2(out1.detach())
        return out2, out1

    def training_step(self, batch, batch_idx, optimizer_idx):
        """Compute the loss for the optimizer selected by ``optimizer_idx``.

        Index order matches ``configure_optimizers``:
        0 = G1, 1 = D1, 2 = G2, 3 = D2, 4 = R.
        """
        imgs, labels = batch
        temp2, temp1 = self(torch.randn(labels.shape[0], self.hparams.noise_size).type_as(imgs), labels)

        if optimizer_idx == 0:
            loss = self.criterion1(self.D1(temp1), torch.ones_like(labels, dtype=torch.float32))
            self.log('G1/train/loss/disc', loss)
            # Out-of-place add: an in-place `add_` would also modify the tensor
            # just handed to `self.log` above.
            # G2 is re-applied to the non-detached temp1 so the class loss reaches G1.
            loss = loss + self.criterion2(self.R.backbone(self.G2(temp1)), labels)
            self.log('G1/train/loss/full', loss)

        elif optimizer_idx == 1:
            real, fake = self.D1(F.interpolate(imgs, self.hparams.image_width, mode='nearest')), self.D1(temp1.detach())
            prediction, target = torch.cat((real, fake)), torch.cat((torch.ones_like(real),torch.zeros_like(fake)))
            loss = self.criterion1(prediction, target)
            self.log('D1/train/loss', loss)

        elif optimizer_idx == 2:
            loss = self.criterion1(self.D2(temp2), torch.ones_like(labels, dtype=torch.float32))
            self.log('G2/train/loss/disc', loss)
            # Out-of-place for the same reason as in the G1 branch.
            loss = loss + self.criterion2(self.R.backbone(temp2), labels)
            self.log('G2/train/loss/full', loss)

        elif optimizer_idx == 3:
            real, fake = self.D2(imgs), self.D2(temp2.detach())
            prediction, target = torch.cat((real, fake)), torch.cat((torch.ones_like(real),torch.zeros_like(fake)))
            loss = self.criterion1(prediction, target)
            self.log('D2/train/loss', loss)

        elif optimizer_idx == 4:
            real, fake = self.R.backbone(imgs), self.R.backbone(temp2.detach())
            # Real images keep their label; generated images get the extra class id.
            prediction, target = torch.cat((real, fake)), torch.cat((labels, self.hparams.num_classes * torch.ones_like(labels)))
            loss = self.criterion2(prediction, target)
            self.log('R/train/loss', loss)

        return loss

    def configure_optimizers(self):
        """Return one Adam optimizer per trainable sub-network, in the order G1, D1, G2, D2, R."""
        opt_g1 = torch.optim.Adam(self.G1.parameters(), self.lr, (self.hparams.b1, 0.999))
        opt_d1 = torch.optim.Adam(self.D1.parameters(), self.lr, (self.hparams.b1, 0.999))
        opt_g2 = torch.optim.Adam(self.G2.parameters(), self.lr, (self.hparams.b1, 0.999))
        opt_d2 = torch.optim.Adam(self.D2.parameters(), self.lr, (self.hparams.b1, 0.999))
        opt_r = torch.optim.Adam(self.R.parameters(), self.lr, (self.hparams.b1, 0.999))
        return opt_g1, opt_d1, opt_g2, opt_d2, opt_r

    def validation_step(self, batch, batch_idx):
        """Generate images for the batch labels and score them with the pretrained classifier."""
        imgs, labels = batch
        temp2, _ = self(torch.randn(labels.shape[0], self.hparams.noise_size).type_as(imgs), labels)
        return {'pred': self.pretrained(temp2.detach()), 'target': labels}

    def validation_epoch_end(self, listofDicts):
        """Log the worst per-class AUROC and a multi-class ROC figure for the epoch.

        The figure is written as a PDF into the Ray Tune trial directory and
        added to the TensorBoard logger.
        """
        prediction, target = torch.cat([x["pred"] for x in listofDicts]), torch.cat([x["target"] for x in listofDicts])
        aurocTensor = tm.functional.auroc(prediction, target, num_classes=self.hparams.num_classes, average=None)
        self.log('Pre/val/auroc', aurocTensor.min())
        fprList, tprList, _ = tm.functional.roc(prediction, target, num_classes=self.hparams.num_classes)

        f = plt.figure()
        colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
        for i, color in zip(range(self.hparams.num_classes), colors):
            plt.plot(fprList[i].cpu(), tprList[i].cpu(), color=color,
                    label='ROC curve of class {0} (area = {1:0.2f})'
                    ''.format(i, aurocTensor[i].cpu()))
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Multi-class ROC')
        plt.legend(loc="lower right")

        # Join path components instead of concatenating strings so the file
        # lands inside the trial directory whether or not it ends with a separator.
        roc_path = os.path.join(str(tune.get_trial_dir()), 'ROC_epoch_' + str(self.current_epoch) + '.pdf')
        # Save first: add_figure closes the figure by default (close=True).
        f.savefig(roc_path)
        # Tag each figure with its epoch so successive epochs do not overwrite each other.
        self.logger.experiment.add_figure('StackGAN/val/ROC', f, global_step=self.current_epoch)

# Tune StackGAN:
Here we tune the hyperparameters so that the generated images resemble the input images.

In [None]:
# __tune_train_checkpoint_begin
def train_StackGAN_tune_checkpoint(config,
                                   checkpoint_dir=None,
                                   num_epochs=10,
                                   num_gpus=1):
    """Ray Tune trainable: fit a ``StackGAN`` for one trial.

    Args:
        config: hyperparameter dict for this trial; passed to both
            ``NpyDataModule`` and ``StackGAN``.
        checkpoint_dir: directory holding a file named ``"checkpoint"`` to
            resume from; a fresh model is created when it is falsy.
        num_epochs: value for the trainer's ``max_epochs``.
        num_gpus: GPUs for the trial; rounded up because Tune may hand out
            fractional GPUs.
    """
    data_dir = os.path.expanduser("/content/images/")

    # Write TensorBoard logs straight into the trial directory.
    tb_logger = TensorBoardLogger(save_dir=tune.get_trial_dir(), name="", version=".")

    # Maps the names reported to Tune (keys) to the names logged by the model (values)
    # and stores a checkpoint file called "checkpoint" with each report.
    report_and_checkpoint = TuneReportCheckpointCallback(
        metrics={
            "lossG1": "G1/train/loss/full",
            "lossG2": "G2/train/loss/full",
            "lossD1": "D1/train/loss",
            "lossD2": "D2/train/loss",
            "lossR": "R/train/loss",
            "auroc": "Pre/val/auroc",
        },
        filename="checkpoint",
    )

    # Options tried and left disabled: accumulate_grad_batches, limit_*_batches,
    # tpu_cores, progress_bar_refresh_rate, stochastic_weight_avg (the latter
    # works with only one optimizer).
    trainer = pl.Trainer(
        num_sanity_val_steps=-1,
        max_epochs=num_epochs,
        prepare_data_per_node=False,
        gpus=math.ceil(num_gpus),
        logger=tb_logger,
        callbacks=[report_and_checkpoint],
    )
    dm = NpyDataModule(config, data_dir)

    if not checkpoint_dir:
        model = StackGAN(config)
    else:
        # StackGAN.load_from_checkpoint currently leads to errors here, so the
        # checkpoint is read manually and the model rebuilt from it.
        checkpoint_file = os.path.join(checkpoint_dir, "checkpoint")
        ckpt = pl_load(checkpoint_file, map_location=lambda storage, loc: storage)
        model = StackGAN._load_model_state(ckpt, config=config)
        trainer.current_epoch = ckpt["epoch"]

    trainer.fit(model, dm)
# __tune_train_checkpoint_end__


# __tune_asha_begin__
def tune_StackGAN_asha(num_samples=10, num_epochs=10, gpus_per_trial=1):
    """Run a Ray Tune search over StackGAN hyperparameters with the ASHA scheduler.

    Args:
        num_samples: number of trials sampled from the search space.
        num_epochs: epochs per trial; also the scheduler's ``max_t``.
        gpus_per_trial: GPUs reserved for each trial.

    Prints the best configuration (highest ``auroc``) when the run finishes.
    """
    search_space = {
        "learning_rate": tune.choice([1e-4]),
        "feature_maps": tune.choice([64]),
        "batch_size": tune.choice([128, 64]),
    }

    asha = ASHAScheduler(max_t=num_epochs, grace_period=1, reduction_factor=2)

    cli_reporter = CLIReporter(
        parameter_columns=["learning_rate", "feature_maps", "batch_size"],
        metric_columns=["lossG1", "lossG2", "lossD1", "lossD2", "lossR", "auroc", "training_iteration"],
    )

    # Bind the arguments that are fixed for every trial.
    trainable = tune.with_parameters(
        train_StackGAN_tune_checkpoint,
        num_epochs=num_epochs,
        num_gpus=gpus_per_trial,
    )
    trial_resources = {"cpu": 2, "gpu": gpus_per_trial}

    analysis = tune.run(
        trainable,
        name="tune_StackGAN_asha_model_j",
        metric="auroc",
        mode="max",
        config=search_space,
        resources_per_trial=trial_resources,
        num_samples=num_samples,
        local_dir='./drive/MyDrive/Logs',
        scheduler=asha,
        progress_reporter=cli_reporter,
        fail_fast=True,
        # Asks interactively whether to resume an existing experiment of this name.
        resume='PROMPT',
    )

    print("Best hyperparameters found were: ", analysis.best_config)

# __tune_asha_end__


# __tune_pbt_begin__
def tune_StackGAN_pbt(num_samples=10, num_epochs=10, gpus_per_trial=1):
    """Run a Ray Tune search over StackGAN hyperparameters with Population Based Training.

    Args:
        num_samples: size of the population (number of trials).
        num_epochs: epochs per trial.
        gpus_per_trial: GPUs reserved for each trial.

    Prints the best configuration (highest ``auroc``) when the run finishes.
    """
    config = {
        "learning_rate": 1e-4,
        "feature_maps": 64,
        "batch_size": 64,
    }

    # Only hyperparameters that leave the network architecture unchanged may be
    # mutated: PBT restores a trial from another trial's checkpoint and then
    # perturbs its config. Mutating "feature_maps" would make the trainable
    # load weights into a model with different layer widths, which fails with
    # a shape mismatch (and aborts the whole run because of fail_fast).
    scheduler = PopulationBasedTraining(
        perturbation_interval=4,
        hyperparam_mutations={
            "learning_rate": [1e-4, 1e-3],
            "batch_size": [32, 64, 128]
        })

    reporter = CLIReporter(
        parameter_columns=["learning_rate", "feature_maps", "batch_size"],
        metric_columns=["lossG1", "lossG2", "lossD1", "lossD2", "lossR", "auroc", "training_iteration"],
        )

    analysis = tune.run(
        # Bind the arguments that are fixed for every trial.
        tune.with_parameters(
            train_StackGAN_tune_checkpoint,
            num_epochs=num_epochs,
            num_gpus=gpus_per_trial),
        name="tune_StackGAN_pbt_model_j",
        metric="auroc",
        mode="max",
        resources_per_trial={
            "cpu": 2,
            "gpu": gpus_per_trial,
        },
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        local_dir='./drive/MyDrive/Logs',
        fail_fast = True,
        )

    print("Best hyperparameters found were: ", analysis.best_config)

# __tune_pbt_end__


if __name__ == "__main__":
    import argparse

    # Driver for the tuning runs. `--smoke-test` runs both schedulers with a
    # single trial each; without it only the ASHA search is launched.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--smoke-test",
        action="store_true",
        help="Finish quickly for testing",
    )
    # parse_known_args() returns (namespace, unrecognised argv); the latter is discarded.
    args, _ = parser.parse_known_args()

    if not args.smoke_test:
        # ASHA scheduler
        tune_StackGAN_asha(num_samples=2, num_epochs=1, gpus_per_trial=1)
        # Population based training (disabled):
        # tune_StackGAN_pbt(num_samples=8, num_epochs=5, gpus_per_trial=1)
    else:
        tune_StackGAN_asha(num_samples=1, num_epochs=6, gpus_per_trial=1)
        tune_StackGAN_pbt(num_samples=1, num_epochs=6, gpus_per_trial=1)

Resume from local directory? [y/N]: y


2021-07-07 15:06:21,237	INFO tune.py:467 -- TrialRunner resumed, ignoring new add_experiment.
[2m[36m(pid=2877)[0m 2021-07-07 15:06:21,249	ERROR worker.py:418 -- SystemExit was raised from the worker
[2m[36m(pid=2877)[0m Traceback (most recent call last):
[2m[36m(pid=2877)[0m   File "python/ray/_raylet.pyx", line 595, in ray._raylet.task_execution_handler
[2m[36m(pid=2877)[0m   File "python/ray/_raylet.pyx", line 453, in ray._raylet.execute_task
[2m[36m(pid=2877)[0m   File "python/ray/_raylet.pyx", line 490, in ray._raylet.execute_task
[2m[36m(pid=2877)[0m   File "python/ray/_raylet.pyx", line 497, in ray._raylet.execute_task
[2m[36m(pid=2877)[0m   File "python/ray/_raylet.pyx", line 501, in ray._raylet.execute_task
[2m[36m(pid=2877)[0m   File "python/ray/_raylet.pyx", line 451, in ray._raylet.execute_task.function_executor
[2m[36m(pid=2877)[0m   File "/usr/local/lib/python3.7/dist-packages/ray/_private/function_manager.py", line 563, in actor_method_executor

== Status ==
Memory usage on this node: 3.2/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 1.000: None
Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/7.54 GiB heap, 0.0/3.77 GiB objects (0.0/1.0 GPU_group_989fc9f1de0f0c7a947b8c51120efc30, 0.0/2.0 CPU_group_989fc9f1de0f0c7a947b8c51120efc30, 0.0/2.0 CPU_group_0_989fc9f1de0f0c7a947b8c51120efc30, 0.0/1.0 accelerator_type:T4, 0.0/1.0 GPU_group_0_989fc9f1de0f0c7a947b8c51120efc30)
Result logdir: /content/drive/MyDrive/Logs/tune_StackGAN_asha_model_j
Number of trials: 2/2 (2 PENDING)
+--------------------------------------------+----------+-------+-----------------+----------------+--------------+
| Trial name                                 | status   | loc   |   learning_rate |   feature_maps |   batch_size |
|--------------------------------------------+----------+-------+-----------------+----------------+--------------|
| train_StackGAN_tune_checkpoint_1f244_00000 | PENDING  |       |          0.0001 |             64 |          

[2m[36m(pid=2878)[0m GPU available: True, used: True
[2m[36m(pid=2878)[0m TPU available: False, using: 0 TPU cores


== Status ==
Memory usage on this node: 1.9/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 1.000: None
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.54 GiB heap, 0.0/3.77 GiB objects (0.0/1.0 GPU_group_989fc9f1de0f0c7a947b8c51120efc30, 0.0/2.0 CPU_group_989fc9f1de0f0c7a947b8c51120efc30, 0.0/2.0 CPU_group_0_989fc9f1de0f0c7a947b8c51120efc30, 0.0/1.0 accelerator_type:T4, 0.0/1.0 GPU_group_0_989fc9f1de0f0c7a947b8c51120efc30)
Result logdir: /content/drive/MyDrive/Logs/tune_StackGAN_asha_model_j
Number of trials: 2/2 (1 PENDING, 1 RUNNING)
+--------------------------------------------+----------+-------+-----------------+----------------+--------------+
| Trial name                                 | status   | loc   |   learning_rate |   feature_maps |   batch_size |
|--------------------------------------------+----------+-------+-----------------+----------------+--------------|
| train_StackGAN_tune_checkpoint_1f244_00000 | RUNNING  |       |          0.0001 |            

[2m[36m(pid=2878)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


== Status ==
Memory usage on this node: 3.1/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 1.000: None
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.54 GiB heap, 0.0/3.77 GiB objects (0.0/1.0 GPU_group_11925c938e061af52a49e839fa9f7d2e, 0.0/1.0 accelerator_type:T4, 0.0/2.0 CPU_group_0_11925c938e061af52a49e839fa9f7d2e, 0.0/2.0 CPU_group_11925c938e061af52a49e839fa9f7d2e, 0.0/1.0 GPU_group_0_11925c938e061af52a49e839fa9f7d2e)
Result logdir: /content/drive/MyDrive/Logs/tune_StackGAN_asha_model_j
Number of trials: 2/2 (1 PENDING, 1 RUNNING)
+--------------------------------------------+----------+-------+-----------------+----------------+--------------+
| Trial name                                 | status   | loc   |   learning_rate |   feature_maps |   batch_size |
|--------------------------------------------+----------+-------+-----------------+----------------+--------------|
| train_StackGAN_tune_checkpoint_1f244_00000 | RUNNING  |       |          0.0001 |            

[2m[36m(pid=2878)[0m 2021-07-07 15:06:32.107077: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
[2m[36m(pid=2878)[0m 
[2m[36m(pid=2878)[0m   | Name       | Type               | Params
[2m[36m(pid=2878)[0m --------------------------------------------------
[2m[36m(pid=2878)[0m 0 | G1         | DCGANGenerator     | 3.6 M 
[2m[36m(pid=2878)[0m 1 | D1         | DCGANDiscriminator | 2.8 M 
[2m[36m(pid=2878)[0m 2 | G2         | Generator2         | 632 K 
[2m[36m(pid=2878)[0m 3 | D2         | DCGANDiscriminator | 3.0 M 
[2m[36m(pid=2878)[0m 4 | R          | LensResnet         | 11.2 M
[2m[36m(pid=2878)[0m 5 | pretrained | LensResnet         | 11.2 M
[2m[36m(pid=2878)[0m 6 | criterion1 | BCELoss            | 0     
[2m[36m(pid=2878)[0m 7 | criterion2 | CrossEntropyLoss   | 0     
[2m[36m(pid=2878)[0m --------------------------------------------------
[2m[36m(pid=2878)[0m 32.3 M    

[2m[36m(pid=2878)[0m Validation sanity check: 0it [00:00, ?it/s]Validation sanity check:   0%|          | 0/118 [00:00<?, ?it/s]


[2m[36m(pid=2878)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


Validation sanity check:  17%|█▋        | 20/118 [00:11<00:54,  1.80it/s]
Validation sanity check:  34%|███▍      | 40/118 [00:22<00:43,  1.80it/s]
Validation sanity check:  51%|█████     | 60/118 [00:33<00:32,  1.78it/s]
Validation sanity check:  68%|██████▊   | 80/118 [00:45<00:21,  1.75it/s]
Validation sanity check:  85%|████████▍ | 100/118 [01:02<00:11,  1.54it/s]
Validation sanity check: 100%|██████████| 118/118 [01:17<00:00,  1.42it/s]


[2m[36m(pid=2878)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


[2m[36m(pid=2878)[0m                                                                           Training: 0it [00:00, ?it/s]Training:   0%|          | 0/587 [00:00<?, ?it/s]Epoch 0:   0%|          | 0/587 [00:00<?, ?it/s] 




KeyboardInterrupt: ignored