<a href="https://colab.research.google.com/github/souravraha/galaxy/blob/experimental/Lightning_Tune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install packages and import

In [2]:
# Colab setup: the install commands below are active (not commented out), so running this
# cell modifies the current environment. Comment them out when the packages are already present.
# NOTE(review): none of the packages are version-pinned, so a fresh run may pull releases
# that differ from the ones this notebook was developed against.

print('Setting up colab environment')
# NOTE(review): pyarrow is removed before installing ray — presumably to avoid a version clash; confirm.
!pip uninstall -y -q pyarrow
!pip install -q ray[debug] lightning-bolts
!pip install -U -q ray[tune]
# !pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.8-cp37-cp37m-linux_x86_64.whl

# # A hack to force the runtime to restart, needed to include the above dependencies.
# print('Done installing! Restarting via forced crash (this is not an issue).')
# import os
# os._exit(0)

Setting up colab environment
[K     |████████████████████████████████| 51.6MB 61kB/s 
[K     |████████████████████████████████| 256kB 42.6MB/s 
[K     |████████████████████████████████| 81kB 10.7MB/s 
[K     |████████████████████████████████| 3.1MB 41.3MB/s 
[K     |████████████████████████████████| 133kB 64.7MB/s 
[K     |████████████████████████████████| 10.1MB 27.6MB/s 
[K     |████████████████████████████████| 71kB 10.6MB/s 
[K     |████████████████████████████████| 1.3MB 35.0MB/s 
[K     |████████████████████████████████| 81kB 12.2MB/s 
[K     |████████████████████████████████| 819kB 26.4MB/s 
[K     |████████████████████████████████| 235kB 52.7MB/s 
[K     |████████████████████████████████| 92kB 12.6MB/s 
[K     |████████████████████████████████| 296kB 52.9MB/s 
[K     |████████████████████████████████| 143kB 54.5MB/s 
[K     |████████████████████████████████| 10.6MB 31.4MB/s 
[K     |████████████████████████████████| 122kB 58.7MB/s 
[K     |█████████████████████

In [1]:
# Selects TensorFlow 2.x when running on Google Colab. The magic below is active (not
# commented out); the try/except is there so the cell does not abort outside Colab.

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass

In [2]:
# __import_lightning_begin__
import math
# import shutil
import numpy as np              #
from matplotlib import pyplot as plt
from itertools import cycle
import torch
from torch import nn
import pytorch_lightning as pl
# from filelock import FileLock
from torch.utils.data import DataLoader, random_split
from torch.nn import functional as F
import torchvision.datasets as datasets
from torchvision.models import resnet18
from pl_bolts.models.self_supervised.resnets import BasicBlock                  # problem with resnet18
from pl_bolts.models.gans import DCGAN
from pl_bolts.models.gans.dcgan.components import DCGANDiscriminator, DCGANGenerator
import torchmetrics as tm
from torchvision import transforms
import os
from os.path import basename
# __import_lightning_end__

# __import_tune_begin__
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.utilities.cloud_io import load as pl_load
from ray import tune
from ray.tune import CLIReporter, JupyterNotebookReporter
from ray.tune.schedulers import ASHAScheduler, PopulationBasedTraining
from ray.tune.integration.pytorch_lightning import TuneReportCallback, \
    TuneReportCheckpointCallback
# __import_tune_end__



# Download and extract data

In [1]:
# Mount Google Drive at /content/drive; later cells read the normalisation statistics from it
# and write logs / checkpoints under its MyDrive/Logs folder.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Google Drive file ids of the available archives, keyed by letter ('d' and 'k' have no id listed):
# 'a': 1Cjcw2EWorhdhJSGoWOdxsEUDxvl943dt, 'b': 15yXXC4h5VsytP3Ak1jfUSjQhdgP2s23K, 'c': 1vuQ-pLzoKT4Hd_V7949r9eND9E2fB_u_,
# 'd': , 'e': 1wFuasvb7PthxXtMUlsD13uzYHWlWt06H, 'f': 17l6H61tLAu26zGuei38r_T5ssjbYUeaJ, 
# 'g': 1SxQVosWeEjY3Pyn8LRXA11rLnZ9HK_7B, 'h': 1Atau0RH4oyLAiYReW-G9a8l9pUNltglF, 'i': 15lEgsR1p00KSHieaT9a1nkbJ86pDxwgp, 
# 'j': 1m0EQUbqZZeyl76XsQIKWU5Qd7jGmmWhB, 'k': , 'l': 1meTDi4aeWfdChOiXeLtUOGhjVDVu000e

# !rm -rf images
# Download the archive whose id is listed under 'f' above and unpack it into the working directory.
!gdown --id 17l6H61tLAu26zGuei38r_T5ssjbYUeaJ
!tar zxf ./model_f.tgz

# Pure-Python alternative to the two shell commands above (currently disabled):
# def prepare_data(data_dir: str = '/content'):
#     gdown.download('https://drive.google.com/uc?id=17l6H61tLAu26zGuei38r_T5ssjbYUeaJ', data_dir+'/model_f.tgz', quiet=True)
    
#     temp = tarfile.open(data_dir+'/model_f.tgz', 'r|gz')
#     temp.extractall()
#     temp.close()

Downloading...
From: https://drive.google.com/uc?id=17l6H61tLAu26zGuei38r_T5ssjbYUeaJ
To: /content/model_f.tgz
2.34GB [00:56, 41.3MB/s]


# DataModule
This creates the dataloaders that are supplied to the trainer to train, validate, or test the models defined below.

In [3]:
class NpyDataModule(pl.LightningDataModule):
    """Data module serving ``.npy`` images stored in class sub-folders.

    Images are read from ``<data_dir>/train`` (split 80/20 into train/validation)
    and ``<data_dir>/val`` (used as the test set). Every sample is randomly flipped,
    normalised with the statistics stored in ``GLOBAL_VALS_F.npz`` and resized to
    ``img_width`` with nearest-neighbour interpolation.

    Args:
        config: mapping that must contain ``'batch_size'``.
        img_width: target size handed to ``transforms.Resize``.
        data_dir: root folder holding the ``train`` and ``val`` sub-folders.
    """

    def __init__(self, config, img_width: int = 150, data_dir: str = '/content/images/'):
        super().__init__()
        # This method is not implemented
        # self.save_hyperparameters()
        self.batch_size = config['batch_size']
        self.data_dir = os.path.expanduser(data_dir)

        # Read the normalisation statistics and release the archive handle right away.
        with np.load('/content/drive/MyDrive/git_repos/forging_new_worlds/GLOBAL_VALS_F.npz') as stats:
            mean, std = stats['VALS'][0], stats['VALS'][1]

        # NOTE(review): the same transform (including the random flips) is applied to the
        # validation and test sets as well — confirm that this is intended.
        self.transform = transforms.Compose([
            # transforms.ConvertImageDtype(torch.float32),
            # Can't use this, divides values by dtype.max, use float() in npyloader instead
            transforms.RandomHorizontalFlip(),
            transforms.RandomVerticalFlip(),
            transforms.Normalize(mean=(mean,), std=(std,)),
            # this shift-scales the pixel values, N(mu, sigma) -> N(0, 1)
            transforms.Resize(img_width, transforms.InterpolationMode.NEAREST),
        ])

    @staticmethod
    def npy_loader(path):
        """Load one ``.npy`` file as a float32 tensor with a leading channel axis."""
        # Convert to tensor first, and then to float; converting the array with
        # astype('float') would yield float64, which raises errors in conv layers.
        return torch.from_numpy(np.load(path)).unsqueeze(0).float()

    def _folder(self, split: str):
        """Build the ``DatasetFolder`` for the sub-folder ``split`` of ``data_dir``."""
        return datasets.DatasetFolder(os.path.join(self.data_dir, split),
                                      self.npy_loader,
                                      ('.npy',),
                                      self.transform,
                                      )

    def setup(self, stage: str = None):
        """Create the datasets needed for ``stage`` (``'fit'``, ``'test'`` or ``None`` for both)."""
        if stage in ('fit', None):
            self.full_set = self._folder('train')
            # 80/20 split derived from the actual dataset size; for 75000 samples this
            # reproduces the former fixed [60000, 15000] split.
            n_total = len(self.full_set)
            n_val = n_total // 5
            self.train_set, self.val_set = random_split(self.full_set, [n_total - n_val, n_val])
            self.dims = tuple(self.train_set[0][0].shape)

        if stage in ('test', None):
            self.test_set = self._folder('val')
            # Only fall back to the test set when no sample shape has been recorded yet,
            # and only then pay for loading a sample.
            if not getattr(self, 'dims', None):
                self.dims = tuple(self.test_set[0][0].shape)

    def train_dataloader(self):
        """Shuffled loader over the training split."""
        return DataLoader(self.train_set, self.batch_size, shuffle=True, num_workers=2)

    def val_dataloader(self):
        """Loader over the validation split; evaluation data is not shuffled."""
        return DataLoader(self.val_set, self.batch_size, shuffle=False, num_workers=2)

    def test_dataloader(self):
        """Loader over the test set; evaluation data is not shuffled."""
        return DataLoader(self.test_set, self.batch_size, shuffle=False, num_workers=2)


# ResNet:
We modify a ResNet slightly for our purpose.

In [4]:
class LensResnet(pl.LightningModule):
    """ResNet-18 classifier with a replaced input convolution and dropout before the head.

    Args:
        config: mapping that must contain ``'learning_rate'``.
        image_channels: number of channels of the input images.
        num_classes: number of output classes.
        **kwargs: accepted and ignored, so extra checkpoint hyperparameters do not break loading.
    """

    def __init__(self, config, image_channels: int = 1, num_classes: int = 3, **kwargs):
        super().__init__()
        # `ignore` expects argument *names*; passing the dict itself would not exclude `config`.
        self.save_hyperparameters(ignore='config')
        self.learning_rate = config['learning_rate']

        # Randomly initialised ResNet-18 (no pretrained weights are requested here)
        self.backbone = resnet18(num_classes = self.hparams.num_classes)
        self.backbone.conv1 = nn.Conv2d(self.hparams.image_channels, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
        #  can't merely change the in_channels since weights have to changed as well
        self.backbone.fc = nn.Sequential(
            nn.Dropout(0.5),
            self.backbone.fc
        )

    def configure_optimizers(self):
        """Adam over the backbone parameters with the configured learning rate."""
        return torch.optim.Adam(self.backbone.parameters(), self.learning_rate)

    def forward(self, x):
        """Return class probabilities (softmax over dimension 1 of the backbone output)."""
        return F.softmax(self.backbone(x), 1)

    def training_step(self, batch, batch_idx):
        """Log the weighted AUROC and cross-entropy loss of one batch and return the loss."""
        imgs, labels = batch
        # A single forward pass feeds both the loss and the metric, so dropout and the
        # batch-norm statistics are applied/updated once per step.
        logits = self.backbone(imgs)
        loss = F.cross_entropy(logits, labels)
        probs = F.softmax(logits.detach(), 1)
        self.log('ResNet/train/auroc', tm.functional.auroc(probs, labels, average='weighted', num_classes=self.hparams.num_classes))
        self.log('ResNet/train/loss', loss)
        #  keep only scalars here, for no errors
        return loss

    def validation_step(self, batch, batch_idx):
        """Log the validation loss and hand probabilities/targets to ``validation_epoch_end``."""
        imgs, labels = batch
        logits = self.backbone(imgs)
        self.log('ResNet/val/loss', F.cross_entropy(logits, labels))
        #  keep only scalars here, for no errors
        return {'pred': F.softmax(logits, 1), 'target': labels}

    def validation_epoch_end(self, Listofdicts):
        """Log the minimum per-class AUROC and save/log the multi-class ROC figure."""
        prediction = torch.cat([x['pred'] for x in Listofdicts])
        target = torch.cat([x['target'] for x in Listofdicts])
        num_classes = self.hparams.num_classes
        aurocTensor = tm.functional.auroc(prediction, target, num_classes=num_classes, average=None)
        self.log('ResNet/val/auroc', aurocTensor.min())
        fprList, tprList, _ = tm.functional.roc(prediction, target, num_classes=num_classes)

        fig, ax = plt.subplots()
        colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
        for i, color in zip(range(num_classes), colors):
            ax.plot(fprList[i].cpu(), tprList[i].cpu(), color=color,
                    label='ROC curve of class {0} (area = {1:0.2f})'
                    ''.format(i, aurocTensor[i].cpu()))
        ax.plot([0, 1], [0, 1], 'k--')
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title('Multi-class ROC')
        ax.legend(loc='lower right')

        # Join the path properly instead of concatenating strings; fall back to the working
        # directory when no trial directory is available (e.g. when run outside of Tune).
        roc_path = os.path.join(tune.get_trial_dir() or '.',
                                'ROC_epoch_' + str(self.current_epoch) + '.pdf')
        # Save before logging: add_figure closes the figure by default.
        fig.savefig(roc_path)
        # Tag each figure with its epoch so earlier epochs stay browsable in TensorBoard.
        self.logger.experiment.add_figure('ResNet/val/ROC', fig, global_step=self.current_epoch)
        plt.close(fig)

Trying out Auto Tuning of learning rate

In [None]:
# Learning-rate finder experiment: builds a Trainer with auto_lr_find enabled, runs the
# LR range test on `generator` with the data module and plots the suggested value.
# NOTE(review): `StackGAN` is not defined in any cell visible in this notebook, so unless it
# is defined elsewhere this cell fails with a NameError on a fresh kernel — confirm which
# model was intended.
# Can't work with multiple optimizers
config = {
    'learning_rate': 1e-4, 'batch_size': 128, 'feature_maps': 64,
}
dm = NpyDataModule(config)
generator = StackGAN(config)

trainer = pl.Trainer(
    # logger=,
    # checkpoint_callback=,
    default_root_dir='./drive/MyDrive/Logs/', 
    gpus=1,
    auto_select_gpus=True, 
    # tpu_cores=
    progress_bar_refresh_rate=1,
    # fast_dev_run=,
    max_epochs=5,
    # max_time=,
    # limit_train_batches=,
    # flush_logs_every_n_steps=,
    # log_every_n_steps=,
    # resume_from_checkpoint='./drive/MyDrive/Logs/lr_find_temp_model.ckpt',
    auto_lr_find = True,
    # auto_scale_batch_size=True,
    # prepare_data_per_node=,
    )

# Run learning rate finder
lr_finder = trainer.tuner.lr_find(generator, dm)

# # Results can be found in
# # lr_finder.results

# Plot with
fig = lr_finder.plot(suggest=True)
fig.show()

# Pick point based on plot, or get suggestion
# (the suggestion is only stored in `new_lr`; it is not applied to any model in this cell)
new_lr = lr_finder.suggestion()

# # update hparams of the model
# model.hparams.lr = new_lr

# # Fit model
# trainer.fit(model)

# Tune ResNet hyperparameters:
Here we tune hyperparameters as we train our modified ResNet.

In [None]:
# __tune_train_checkpoint_begin
def train_LensResnet_tune_checkpoint(config,
                                    checkpoint_dir=None,
                                    num_epochs=10,
                                    num_gpus=1):
    """Ray Tune trainable: fit a ``LensResnet`` and report validation loss/AUROC with checkpoints.

    Args:
        config: trial hyperparameters; read by ``NpyDataModule`` and ``LensResnet``.
        checkpoint_dir: when given, model weights and the epoch counter are restored from
            the file ``checkpoint`` inside this directory.
        num_epochs: value passed to the trainer as ``max_epochs``.
        num_gpus: GPUs for the trial; fractional values are rounded up.
    """
    data_dir = os.path.expanduser('/content/images/')

    trainer = pl.Trainer(
        max_epochs=num_epochs,
        prepare_data_per_node = False,
        num_sanity_val_steps=0,
        # If fractional GPUs passed in, convert to int.
        gpus=math.ceil(num_gpus),
        # tpu_cores = 8,
        logger=TensorBoardLogger(
            save_dir=tune.get_trial_dir(), name='', version='.'),
        # progress_bar_refresh_rate=1,
        callbacks=[
            TuneReportCheckpointCallback(
                metrics={
                    'loss': 'ResNet/val/loss',
                    'auroc': 'ResNet/val/auroc',
                },
                filename='checkpoint',
                # on='validation_end'
            )
        ],
        stochastic_weight_avg=True,
    )

    # Pass the directory by keyword: the second positional parameter of NpyDataModule is
    # `img_width`, so a positional `data_dir` would end up being used as the resize target.
    dm = NpyDataModule(config, data_dir=data_dir)

    if checkpoint_dir:
        # Currently, this leads to errors:
        # model = LensResnet.load_from_checkpoint(
        #     os.path.join(checkpoint, 'checkpoint'))
        # Workaround:
        ckpt = pl_load(
            os.path.join(checkpoint_dir, 'checkpoint'),
            map_location=lambda storage, loc: storage)
        model = LensResnet._load_model_state(
            ckpt, config=config,
            # data_dir=data_dir
            )
        trainer.current_epoch = ckpt['epoch']
    else:
        model = LensResnet(config,
                        #  data_dir
                         )

    trainer.fit(model, dm)

# __tune_train_checkpoint_end__


# __tune_asha_begin__
def tune_LensResnet_asha(num_samples=10, num_epochs=10, gpus_per_trial=1):
    """Run the LensResnet trainable under an ASHA scheduler and print the best config.

    The run uses one fixed configuration and restores from a hard-coded checkpoint path.
    ``num_samples`` is accepted for signature parity but is not forwarded to ``tune.run``.

    Args:
        num_samples: currently unused.
        num_epochs: ``max_t`` of the scheduler and epoch budget of every trial.
        gpus_per_trial: GPUs requested per trial.
    """
    # Search space that was used before settling on the fixed values below:
    #     'learning_rate': tune.choice([1e-5, 1e-4, 1e-3, 1e-2]),
    #     'batch_size': tune.choice([128, 64, 32]),
    fixed_config = {'batch_size': 128, 'learning_rate': 0.0001}

    asha = ASHAScheduler(max_t=num_epochs, grace_period=1, reduction_factor=2)
    cli_reporter = CLIReporter(
        parameter_columns=['learning_rate', 'batch_size'],
        metric_columns=['loss', 'auroc', 'training_iteration'],
    )
    trainable = tune.with_parameters(
        train_LensResnet_tune_checkpoint,
        num_epochs=num_epochs,
        num_gpus=gpus_per_trial,
    )

    run_options = dict(
        name='LensResNet_J',  # Change with dataset change
        metric='auroc',
        mode='max',
        config=fixed_config,
        resources_per_trial={'cpu': 2, 'gpu': gpus_per_trial},
        local_dir='./drive/MyDrive/Logs',
        scheduler=asha,
        progress_reporter=cli_reporter,
        fail_fast=True,
        restore='/content/drive/MyDrive/Logs/LensResNet_J/train_LensResnet_tune_checkpoint_c74d9_00003_3_batch_size=128,learning_rate=0.0001_2021-07-08_01-03-43/checkpoint_epoch=2-step=1406',
    )
    analysis = tune.run(trainable, **run_options)

    print('Best hyperparameters found were: ', analysis.best_config)


# __tune_pbt_begin__
def tune_LensResnet_pbt(num_samples=10, num_epochs=10, gpus_per_trial=1):
    """Run the LensResnet trainable under Population Based Training and print the best config.

    Args:
        num_samples: number of trials in the population.
        num_epochs: epoch budget of every trial.
        gpus_per_trial: GPUs requested per trial.
    """
    start_config = {'learning_rate': 1e-3, 'batch_size': 64}
    mutations = {
        'learning_rate': [1e-5, 1e-4, 1e-3, 1e-2],
        'batch_size': [32, 64, 128],
    }

    pbt = PopulationBasedTraining(perturbation_interval=4, hyperparam_mutations=mutations)
    cli_reporter = CLIReporter(
        parameter_columns=['learning_rate', 'batch_size'],
        metric_columns=['loss', 'auroc', 'training_iteration'],
    )
    trainable = tune.with_parameters(
        train_LensResnet_tune_checkpoint,
        num_epochs=num_epochs,
        num_gpus=gpus_per_trial,
    )

    run_options = dict(
        name='tune_LensResnet_pbt',
        local_dir='./drive/MyDrive/Logs',
        metric='auroc',
        mode='max',
        config=start_config,
        num_samples=num_samples,
        resources_per_trial={'cpu': 2, 'gpu': gpus_per_trial},
        scheduler=pbt,
        progress_reporter=cli_reporter,
        fail_fast=True,
    )
    analysis = tune.run(trainable, **run_options)

    print('Best hyperparameters found were: ', analysis.best_config)

# __tune_pbt_end__


if __name__ == '__main__':
    import argparse

    # Unknown arguments (e.g. those injected by the notebook kernel) are tolerated.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--smoke-test', help='Finish quickly for testing', action='store_true')
    args, _ = parser.parse_known_args()

    if not args.smoke_test:
        # Full run: ASHA scheduler only
        tune_LensResnet_asha(num_samples=12, num_epochs=20, gpus_per_trial=1)
        # Population based training
        # tune_LensResnet_pbt(num_samples=10, num_epochs=10, gpus_per_trial=1)
    else:
        # Smoke test: one short trial of each scheduler
        tune_LensResnet_asha(num_samples=1, num_epochs=6, gpus_per_trial=1)
        tune_LensResnet_pbt(num_samples=1, num_epochs=6, gpus_per_trial=1)

# Experimental

Check if we can load checkpoints

In [5]:
def pretrained_LensResNets():
    """Rebuild the two trained ``LensResnet`` classifiers (F first, then J) from saved checkpoints.

    Returns:
        Tuple ``(f, j)`` of models restored on CPU from the hard-coded checkpoint directories.
    """
    def _restore(trial_checkpoint_dir, config):
        # Load the raw checkpoint onto CPU storage, then rebuild the module from it.
        state = pl_load(os.path.join(trial_checkpoint_dir, 'checkpoint'),
                        map_location=lambda storage, loc: storage)
        return LensResnet._load_model_state(state, config=config)

    model_f = _restore(
        '/content/drive/MyDrive/Logs/LensResNet_F/train_LensResnet_tune_checkpoint_efb38_00000_0_2021-07-09_04-20-04/checkpoint_epoch=9-step=16407',
        {'batch_size': 32, 'learning_rate': 0.0001},
    )
    model_j = _restore(
        '/content/drive/MyDrive/Logs/LensResNet_J/train_LensResnet_tune_checkpoint_21355_00000_0_2021-07-09_16-17-17/checkpoint_epoch=27-step=4687',
        {'batch_size': 128, 'learning_rate': 0.0001},
    )
    return model_f, model_j

def post_plotting(ax):
    """Finish a ROC panel: draw the dashed chance diagonal and place the legend bottom-right."""
    unit_interval = [0, 1]
    ax.plot(unit_interval, unit_interval, 'k--')
    ax.legend(loc='lower right')

In [25]:
class Stage1(DCGAN):
    """Label-conditioned DCGAN whose samples are scored by two trained ``LensResnet`` classifiers.

    The generator gets an extra embedding (``emb``); the noise vector is multiplied by the
    embedding of the class label before being passed to the generator.

    Args:
        config: mapping with ``'ngf'``, ``'ndf'`` and ``'learning_rate'``.
        num_classes: number of class labels.
        **kwargs: accepted and ignored, so extra checkpoint hyperparameters do not break loading.
    """

    def __init__(self, config, num_classes: int = 3, **kwargs):
        super().__init__(feature_maps_gen=config['ngf'], feature_maps_disc=config['ndf'], learning_rate=config['learning_rate'])
        # `ignore` expects argument *names*; passing the dict itself would not exclude `config`.
        self.save_hyperparameters(ignore='config')

        self.generator.add_module('emb', nn.Embedding(self.hparams.num_classes, self.hparams.latent_dim))

        self.modelF, self.modelJ = pretrained_LensResNets()

    def forward(self, noise, labels = None):
        """Generate images from ``noise`` conditioned on ``labels``.

        When ``labels`` is omitted, the labels cached by the latest ``training_step`` are
        used if they match the leading shape of ``noise``; otherwise random labels are drawn.
        """
        if labels is None:
            cached = getattr(self, 'labels', None)
            # last dimension of `noise` is the hidden dimension
            if cached is not None and tuple(cached.shape) == tuple(noise.shape[:-1]):
                labels = cached
            else:
                # Drawn only when needed, so no random numbers are consumed otherwise.
                labels = torch.randint(self.hparams.num_classes, noise.shape[:-1], device=self.device)
        inp = noise.mul(self.generator.emb(labels))
        return self.generator(inp.view(-1, inp.shape[-1], 1, 1))

    def training_step(self, batch, batch_idx, optimizer_idx):
        """Run the discriminator (optimizer 0) or generator (optimizer 1) step for one batch."""
        # Cache the labels so that `forward` can condition on them when called without labels.
        real, self.labels = batch

        # Train discriminator
        result = None
        if optimizer_idx == 0:
            result = self._disc_step(real)

        # Train generator
        if optimizer_idx == 1:
            result = self._gen_step(real)

        return result

    def _disc_step(self, real: torch.Tensor) -> torch.Tensor:
        """Compute and log the discriminator loss."""
        disc_loss = self._get_disc_loss(real)
        self.log('Stage1/D/train/loss', disc_loss, on_epoch=True)
        return disc_loss

    def _gen_step(self, real: torch.Tensor) -> torch.Tensor:
        """Compute and log the generator loss."""
        gen_loss = self._get_gen_loss(real)
        self.log('Stage1/G/train/loss', gen_loss, on_epoch=True)
        return gen_loss

    def validation_step(self, batch, batch_idx):
        """Generate one image per validation label and classify it with both ResNets."""
        imgs, labels = batch
        out_64 = self(torch.randn(labels.shape[0], self.hparams.latent_dim).type_as(imgs), labels)
        # Upsample to size 150 before feeding the classifiers.
        out = F.interpolate(out_64, 150)
        return {'predF': self.modelF(out), 'predJ': self.modelJ(out), 'target': labels}

    def validation_epoch_end(self, listofDicts):
        """Log the minimum per-class AUROC of each classifier and save/log the ROC figure."""
        target = torch.cat([x['target'] for x in listofDicts])
        num_classes = self.hparams.num_classes
        fig, axes = plt.subplots(1,2, subplot_kw={'xlim': [0,1], 'xlabel': 'False Positive Rate',
                                                  'ylim': [0,1.05], 'ylabel': 'True Positive Rate'},
                                 figsize=[11, 5])
        for panel, letter in zip(axes, ['F', 'J']):
            prediction = torch.cat([x['pred' + letter] for x in listofDicts])
            aurocTensor = tm.functional.auroc(prediction, target, num_classes=num_classes, average=None)
            self.log('Stage1/ResNet(' + letter + ')/val/auroc', aurocTensor.min())
            fprList, tprList, _ = tm.functional.roc(prediction, target, num_classes=num_classes)

            colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
            for i, color in zip(range(num_classes), colors):
                panel.plot(fprList[i].cpu(), tprList[i].cpu(), color=color,
                           label='ROC curve of class {0} (area = {1:0.2f})'
                           ''.format(i, aurocTensor[i].cpu()))
            post_plotting(panel)
            panel.set_title('Multi-class ROC (' + letter + ')')

        fig.tight_layout()
        # Join the path properly instead of concatenating strings; fall back to the working
        # directory when no trial directory is available (e.g. when run outside of Tune).
        roc_path = os.path.join(tune.get_trial_dir() or '.',
                                'ROC_epoch_' + str(self.current_epoch) + '.pdf')
        # Save before logging: add_figure closes the figure by default.
        fig.savefig(roc_path)
        # Tag each figure with its epoch so earlier epochs stay browsable in TensorBoard.
        self.logger.experiment.add_figure('Stage1/ResNet/val/ROC', fig, global_step=self.current_epoch)
        plt.close(fig)

In [24]:
# Scratch check: try to rebuild a Stage1 model directly from a Tune checkpoint.
# NOTE(review): the recorded output of this cell is a TypeError, i.e. load_from_checkpoint
# fails here; the training function uses pl_load + _load_model_state as a workaround.
dummy = Stage1.load_from_checkpoint(os.path.join('/content/drive/MyDrive/Logs/Stage1_pbt_F/train_Stage1_tune_checkpoint_228f8_00001_1_ndf=128,ngf=128_2021-07-11_05-30-43/checkpoint_epoch=0-step=937', 'checkpoint'))

TypeError: ignored

In [23]:
# Drop the scratch model again.
# NOTE(review): raises NameError if the cell that assigns `dummy` did not succeed.
del dummy

Let's train Stage1 GAN

In [17]:
# __tune_train_checkpoint_begin
def train_Stage1_tune_checkpoint(config, checkpoint_dir=None, num_epochs=10, num_gpus=1):
    """Ray Tune trainable: fit a ``Stage1`` GAN and report its losses and AUROCs with checkpoints.

    Args:
        config: trial hyperparameters; read by ``NpyDataModule`` and ``Stage1``.
        checkpoint_dir: when given, model weights and the epoch counter are restored from
            the file ``checkpoint`` inside this directory.
        num_epochs: value passed to the trainer as ``max_epochs``.
        num_gpus: GPUs for the trial; fractional values are rounded up.
    """
    # Tune metric name -> name under which the module logs it
    reported = {
        'lossG': 'Stage1/G/train/loss',
        'lossD': 'Stage1/D/train/loss',
        'auroc': 'Stage1/ResNet(F)/val/auroc',
        'auroc_cross': 'Stage1/ResNet(J)/val/auroc',
    }
    tb_logger = TensorBoardLogger(save_dir=tune.get_trial_dir(), name='', version='.')
    # stochastic_weight_avg is left off: it works with only one optimizer
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        prepare_data_per_node=False,
        gpus=math.ceil(num_gpus),  # fractional GPU requests are rounded up to an int
        logger=tb_logger,
        callbacks=[TuneReportCheckpointCallback(reported)],
    )

    dm = NpyDataModule(config, 64)  # Specify image width here

    if not checkpoint_dir:
        model = Stage1(config)
    else:
        # Stage1.load_from_checkpoint currently leads to errors, so the checkpoint is
        # loaded manually and the module is rebuilt from its state.
        ckpt = pl_load(os.path.join(checkpoint_dir, 'checkpoint'),
                       map_location=lambda storage, loc: storage)
        model = Stage1._load_model_state(ckpt, config=config)
        trainer.current_epoch = ckpt['epoch']

    trainer.fit(model, dm)
# __tune_train_checkpoint_end__


# __tune_asha_begin__
def tune_Stage1_asha(num_samples=10, num_epochs=10, gpus_per_trial=1):
    """Search Stage1 hyperparameters with an ASHA scheduler and print the best config.

    ``num_samples`` is accepted for signature parity but is not forwarded to ``tune.run``.

    Args:
        num_samples: currently unused.
        num_epochs: ``max_t`` of the scheduler and epoch budget of every trial.
        gpus_per_trial: GPUs requested per trial.
    """
    search_space = {
        'learning_rate': tune.choice([1e-5, 1e-4, 1e-3]),
        'ngf': tune.choice([128, 64, 32]),
        'ndf': tune.choice([128, 64, 32]),
        'batch_size': tune.choice([128, 64, 32]),
    }
    trainable = tune.with_parameters(
        train_Stage1_tune_checkpoint,
        num_epochs=num_epochs,
        num_gpus=gpus_per_trial,
    )
    asha = ASHAScheduler(max_t=num_epochs, grace_period=1, reduction_factor=2)
    notebook_reporter = JupyterNotebookReporter(
        overwrite=True,
        parameter_columns=['learning_rate', 'ngf', 'ndf', 'batch_size'],
        metric_columns=['lossG', 'lossD', 'auroc', 'auroc_cross', 'training_iteration'],
    )

    run_options = dict(
        name='Stage1_F',
        metric='auroc',
        mode='max',
        config=search_space,
        resources_per_trial={'cpu': 2, 'gpu': gpus_per_trial},
        local_dir='./drive/MyDrive/Logs/',
        scheduler=asha,
        progress_reporter=notebook_reporter,
        fail_fast=True,
        resume='PROMPT',
    )
    analysis = tune.run(trainable, **run_options)

    print('Best hyperparameters found were: ', analysis.best_config)

# __tune_asha_end__


# __tune_pbt_begin__
def tune_Stage1_pbt(num_samples=10, num_epochs=10, gpus_per_trial=1):
    """Run a Ray Tune search for Stage1 using Population Based Training.

    Every trial starts with ``learning_rate=1e-4`` and ``batch_size=64``,
    while ``ngf`` and ``ndf`` are drawn from fixed choice lists. The PBT
    scheduler is configured to mutate ``learning_rate`` and ``batch_size``.
    Trials are ranked by the maximum reported ``auroc``.

    Args:
        num_samples: Number of trials in the population.
        num_epochs: Forwarded to the trainable as ``num_epochs``.
        gpus_per_trial: GPUs reserved for each trial; also forwarded to the
            trainable as ``num_gpus``.

    Side effects:
        Writes experiment results under ``./drive/MyDrive/Logs`` and prints
        the best configuration found.
    """
    # Bind the fixed (non-searched) arguments onto the training function.
    trainable = tune.with_parameters(
        train_Stage1_tune_checkpoint,
        num_epochs=num_epochs,
        num_gpus=gpus_per_trial,
    )

    # Starting configuration for each member of the population.
    search_space = {
        'learning_rate': 1e-4,
        'ngf': tune.choice([128, 64, 32]),
        'ndf': tune.choice([128, 64, 32]),
        'batch_size': 64,
    }

    trial_resources = {'cpu': 2, 'gpu': gpus_per_trial}

    # Hyperparameters the scheduler is allowed to perturb.
    mutations = {
        'learning_rate': tune.loguniform(1e-6, 1e-2),
        'batch_size': [128, 64, 32],
    }
    pbt_scheduler = PopulationBasedTraining(
        perturbation_interval=1,
        hyperparam_mutations=mutations,
    )

    shown_params = ['learning_rate', 'ngf', 'ndf', 'batch_size']
    shown_metrics = ['lossG', 'lossD', 'auroc', 'auroc_cross', 'training_iteration']
    reporter = JupyterNotebookReporter(
        overwrite=True,
        parameter_columns=shown_params,
        metric_columns=shown_metrics,
    )

    analysis = tune.run(
        trainable,
        name='Stage1_pbt_F',
        metric='auroc',
        mode='max',
        config=search_space,
        resources_per_trial=trial_resources,
        local_dir='./drive/MyDrive/Logs',
        scheduler=pbt_scheduler,
        progress_reporter=reporter,
        fail_fast=True,
        num_samples=num_samples,
        # resume='PROMPT',
    )

    print('Best hyperparameters found were: ', analysis.best_config)

# __tune_pbt_end__


if __name__ == '__main__':
    import argparse

    # Entry point: choose between a short smoke test and the full search.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '--smoke-test', action='store_true', help='Finish quickly for testing')
    # parse_known_args() ignores unrecognised argv entries instead of erroring
    # (presumably needed because the notebook kernel passes its own arguments).
    options, _unknown = cli.parse_known_args()

    if not options.smoke_test:
        # Full run: Population Based Training only.
        # The ASHA variant is kept here for reference but is currently disabled:
        # tune_Stage1_asha(num_samples=12, num_epochs=5, gpus_per_trial=1)
        tune_Stage1_pbt(num_samples=10, num_epochs=10, gpus_per_trial=1)
    else:
        # Smoke test: one trial per scheduler with a short epoch budget.
        tune_Stage1_asha(num_samples=1, num_epochs=6, gpus_per_trial=1)
        tune_Stage1_pbt(num_samples=1, num_epochs=6, gpus_per_trial=1)

== Status ==
Memory usage on this node: 1.7/12.7 GiB
PopulationBasedTraining: 0 checkpoints, 0 perturbs
Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/7.51 GiB heap, 0.0/3.75 GiB objects (0.0/1.0 accelerator_type:T4)
Result logdir: /content/drive/MyDrive/Logs/Stage1_pbt_F
Number of trials: 10/10 (10 PENDING)
+------------------------------------------+----------+-------+-----------------+-------+-------+--------------+
| Trial name                               | status   | loc   |   learning_rate |   ngf |   ndf |   batch_size |
|------------------------------------------+----------+-------+-----------------+-------+-------+--------------|
| train_Stage1_tune_checkpoint_228f8_00000 | PENDING  |       |          0.0001 |   128 |    32 |           64 |
| train_Stage1_tune_checkpoint_228f8_00001 | PENDING  |       |          0.0001 |   128 |   128 |           64 |
| train_Stage1_tune_checkpoint_228f8_00002 | PENDING  |       |          0.0001 |    32 |    64 |           64 |
| train_Stage1

[2m[36m(pid=1266)[0m GPU available: True, used: True
[2m[36m(pid=1266)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=1266)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


== Status ==
Memory usage on this node: 2.3/12.7 GiB
PopulationBasedTraining: 0 checkpoints, 0 perturbs
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.51 GiB heap, 0.0/3.75 GiB objects (0.0/1.0 accelerator_type:T4)
Result logdir: /content/drive/MyDrive/Logs/Stage1_pbt_F
Number of trials: 10/10 (9 PENDING, 1 RUNNING)
+------------------------------------------+----------+-------+-----------------+-------+-------+--------------+
| Trial name                               | status   | loc   |   learning_rate |   ngf |   ndf |   batch_size |
|------------------------------------------+----------+-------+-----------------+-------+-------+--------------|
| train_Stage1_tune_checkpoint_228f8_00000 | RUNNING  |       |          0.0001 |   128 |    32 |           64 |
| train_Stage1_tune_checkpoint_228f8_00001 | PENDING  |       |          0.0001 |   128 |   128 |           64 |
| train_Stage1_tune_checkpoint_228f8_00002 | PENDING  |       |          0.0001 |    32 |    64 |           64 |


[2m[36m(pid=1266)[0m 2021-07-11 05:30:51.229461: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
[2m[36m(pid=1266)[0m 
[2m[36m(pid=1266)[0m   | Name          | Type               | Params
[2m[36m(pid=1266)[0m -----------------------------------------------------
[2m[36m(pid=1266)[0m 0 | generator     | DCGANGenerator     | 12.7 M
[2m[36m(pid=1266)[0m 1 | discriminator | DCGANDiscriminator | 693 K 
[2m[36m(pid=1266)[0m 2 | criterion     | BCELoss            | 0     
[2m[36m(pid=1266)[0m 3 | modelF        | LensResnet         | 11.2 M
[2m[36m(pid=1266)[0m 4 | modelJ        | LensResnet         | 11.2 M
[2m[36m(pid=1266)[0m -----------------------------------------------------
[2m[36m(pid=1266)[0m 35.7 M    Trainable params
[2m[36m(pid=1266)[0m 0         Non-trainable params
[2m[36m(pid=1266)[0m 35.7 M    Total params
[2m[36m(pid=1266)[0m 142.767   Total estimated model params s

[2m[36m(pid=1266)[0m Validation sanity check: 0it [00:00, ?it/s]Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]
== Status ==
Memory usage on this node: 3.4/12.7 GiB
PopulationBasedTraining: 0 checkpoints, 0 perturbs
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.51 GiB heap, 0.0/3.75 GiB objects (0.0/1.0 GPU_group_0_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/2.0 CPU_group_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/1.0 accelerator_type:T4, 0.0/1.0 GPU_group_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/2.0 CPU_group_0_ecd3f63ef35a3c653f2d69781ac9fe69)
Result logdir: /content/drive/MyDrive/Logs/Stage1_pbt_F
Number of trials: 10/10 (9 PENDING, 1 RUNNING)
+------------------------------------------+----------+-------+-----------------+-------+-------+--------------+
| Trial name                               | status   | loc   |   learning_rate |   ngf |   ndf |   batch_size |
|------------------------------------------+----------+-------+-----------------+-------+-------+-----------

[2m[36m(pid=1266)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
[2m[36m(pid=1266)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Epoch 0:   0%|          | 0/1173 [00:00<?, ?it/s] 
Epoch 0:   2%|▏         | 20/1173 [00:07<06:54,  2.78it/s, loss=1.69, v_num=.]
Epoch 0:   3%|▎         | 40/1173 [00:13<06:24,  2.94it/s, loss=1.95, v_num=.]
Epoch 0:   5%|▌         | 60/1173 [00:19<06:08,  3.02it/s, loss=2.09, v_num=.]
Epoch 0:   7%|▋         | 80/1173 [00:26<05:57,  3.06it/s, loss=2.13, v_num=.]
Epoch 0:   9%|▊         | 100/1173 [00:34<06:05,  2.94it/s, loss=2.22, v_num=.]
Epoch 0:  10%|█         | 120/1173 [00:44<06:33,  2.67it/s, loss=2.3, v_num=.] 
Epoch 0:  12%|█▏        | 140/1173 [00:55<06:48,  2.53it/s, loss=2.24, v_num=.]
Epoch 0:  14%|█▎        | 160/1173 [01:05<06:55,  2.44it/s, loss=2.31, v_num=.]
Epoch 0:  15%|█▌        | 180/1173 [01:15<06:57,  2.38it/s, loss=2.27, v_num=.]
Epoch 0:  17%|█▋        | 200/1173 [01:25<06:56,  2.34it/s, loss=2.38, v_num=.]
Epoch 0:  19%|█▉        | 220/1173 [01:35<06:53,  2.30it/s, loss=2.32, v_num=.]
Epoch 0:  20%|██        | 240/1173 [01:45<06:50,  2.27it/s, loss=2.38, v_

[2m[36m(pid=1266)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_Stage1_tune_checkpoint_228f8_00000:
  auroc: 0.4618435502052307
  auroc_cross: 0.47367948293685913
  date: 2021-07-11_05-40-06
  done: false
  experiment_id: 14d26a48183a4a18b4f18beb9fafaf34
  hostname: bd0789e721af
  iterations_since_restore: 1
  lossD: 0.05285283178091049
  lossG: 5.041823387145996
  node_ip: 172.28.0.2
  pid: 1266
  should_checkpoint: true
  time_since_restore: 560.2282562255859
  time_this_iter_s: 560.2282562255859
  time_total_s: 560.2282562255859
  timestamp: 1625982006
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 228f8_00000
  
== Status ==
Memory usage on this node: 4.5/12.7 GiB
PopulationBasedTraining: 0 checkpoints, 0 perturbs
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.51 GiB heap, 0.0/3.75 GiB objects (0.0/1.0 GPU_group_0_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/2.0 CPU_group_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/1.0 accelerator_type:T4, 0.0/1.0 GPU_group_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/2.0 CPU_group_0_ecd3f63e

[2m[36m(pid=1266)[0m 2021-07-11 05:40:06,786	INFO trainable.py:76 -- Checkpoint size is 249768194 bytes
[2m[36m(pid=1265)[0m GPU available: True, used: True
[2m[36m(pid=1265)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=1265)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2m[36m(pid=1265)[0m 2021-07-11 05:40:24.082666: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
[2m[36m(pid=1265)[0m 
[2m[36m(pid=1265)[0m   | Name          | Type               | Params
[2m[36m(pid=1265)[0m -----------------------------------------------------
[2m[36m(pid=1265)[0m 0 | generator     | DCGANGenerator     | 12.7 M
[2m[36m(pid=1265)[0m 1 | discriminator | DCGANDiscriminator | 11.0 M
[2m[36m(pid=1265)[0m 2 | criterion     | BCELoss            | 0     
[2m[36m(pid=1265)[0m 3 | modelF        | LensResnet         | 11.2 M
[2m[36m(pid=1265)[0m 4 | modelJ        | LensResnet         | 11.2 M


[2m[36m(pid=1265)[0m Validation sanity check: 0it [00:00, ?it/s]Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(pid=1265)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
[2m[36m(pid=1265)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Epoch 0:   0%|          | 0/1173 [00:00<?, ?it/s] 
Epoch 0:   2%|▏         | 20/1173 [00:08<08:08,  2.36it/s, loss=6.41, v_num=.]
Epoch 0:   3%|▎         | 40/1173 [00:16<07:47,  2.42it/s, loss=5.16, v_num=.]
Epoch 0:   5%|▌         | 60/1173 [00:24<07:39,  2.42it/s, loss=6.23, v_num=.]
Epoch 0:   7%|▋         | 80/1173 [00:33<07:34,  2.41it/s, loss=5.27, v_num=.]
Epoch 0:   9%|▊         | 100/1173 [00:41<07:28,  2.39it/s, loss=5.31, v_num=.]
Epoch 0:  10%|█         | 120/1173 [00:50<07:24,  2.37it/s, loss=5.6, v_num=.] 
Epoch 0:  12%|█▏        | 140/1173 [00:59<07:21,  2.34it/s, loss=5.17, v_num=.]
Epoch 0:  14%|█▎        | 160/1173 [01:09<07:18,  2.31it/s, loss=5.39, v_num=.]
Epoch 0:  15%|█▌        | 180/1173 [01:18<07:11,  2.30it/s, loss=75.1, v_num=.]
Epoch 0:  17%|█▋        | 200/1173 [01:26<07:02,  2.30it/s, loss=88.8, v_num=.]
Epoch 0:  19%|█▉        | 220/1173 [01:35<06:52,  2.31it/s, loss=89.1, v_num=.]
Epoch 0:  20%|██        | 240/1173 [01:43<06:42,  2.32it/s, loss=89.2, v_

[2m[36m(pid=1265)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."
2021-07-11 05:49:29,734	INFO pbt.py:490 -- [pbt]: no checkpoint for trial. Skip exploit for Trial train_Stage1_tune_checkpoint_228f8_00001


Result for train_Stage1_tune_checkpoint_228f8_00001:
  auroc: 0.3095346987247467
  auroc_cross: 0.381914347410202
  date: 2021-07-11_05-49-29
  done: false
  experiment_id: f111bebca8014281875ff9bfc982c872
  hostname: bd0789e721af
  iterations_since_restore: 1
  lossD: 79.15614318847656
  lossG: 100.0
  node_ip: 172.28.0.2
  pid: 1265
  should_checkpoint: true
  time_since_restore: 558.8177676200867
  time_this_iter_s: 558.8177676200867
  time_total_s: 558.8177676200867
  timestamp: 1625982569
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 228f8_00001
  
== Status ==
Memory usage on this node: 4.6/12.7 GiB
PopulationBasedTraining: 0 checkpoints, 0 perturbs
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.51 GiB heap, 0.0/3.75 GiB objects (0.0/1.0 GPU_group_0_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/1.0 accelerator_type:T4, 0.0/1.0 GPU_group_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/2.0 CPU_group_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/2.0 CPU_group_0_ecd3f63ef35a3c653f2d6978

[2m[36m(pid=1265)[0m 2021-07-11 05:49:30,867	INFO trainable.py:76 -- Checkpoint size is 373840194 bytes
[2m[36m(pid=1643)[0m GPU available: True, used: True
[2m[36m(pid=1643)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=1643)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2m[36m(pid=1643)[0m 2021-07-11 05:49:52.996036: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
[2m[36m(pid=1643)[0m 
[2m[36m(pid=1643)[0m   | Name          | Type               | Params
[2m[36m(pid=1643)[0m -----------------------------------------------------
[2m[36m(pid=1643)[0m 0 | generator     | DCGANGenerator     | 1.1 M 
[2m[36m(pid=1643)[0m 1 | discriminator | DCGANDiscriminator | 2.8 M 
[2m[36m(pid=1643)[0m 2 | criterion     | BCELoss            | 0     
[2m[36m(pid=1643)[0m 3 | modelF        | LensResnet         | 11.2 M
[2m[36m(pid=1643)[0m 4 | modelJ        | LensResnet         | 11.2 M


[2m[36m(pid=1643)[0m Validation sanity check: 0it [00:00, ?it/s]Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(pid=1643)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
[2m[36m(pid=1643)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Epoch 0:   0%|          | 0/1173 [00:00<?, ?it/s] 
Epoch 0:   2%|▏         | 20/1173 [00:06<06:03,  3.17it/s, loss=2.51, v_num=.]
Epoch 0:   3%|▎         | 40/1173 [00:12<05:50,  3.23it/s, loss=3.05, v_num=.]
Epoch 0:   5%|▌         | 60/1173 [00:18<05:40,  3.27it/s, loss=3.38, v_num=.]
Epoch 0:   7%|▋         | 80/1173 [00:27<06:15,  2.91it/s, loss=3.62, v_num=.]
Epoch 0:   9%|▊         | 100/1173 [00:37<06:37,  2.70it/s, loss=3.72, v_num=.]
Epoch 0:  10%|█         | 120/1173 [00:47<06:53,  2.55it/s, loss=3.86, v_num=.]
Epoch 0:  12%|█▏        | 140/1173 [00:57<07:02,  2.44it/s, loss=3.88, v_num=.]
Epoch 0:  14%|█▎        | 160/1173 [01:07<07:07,  2.37it/s, loss=4.11, v_num=.]
Epoch 0:  15%|█▌        | 180/1173 [01:17<07:08,  2.32it/s, loss=3.3, v_num=.] 
Epoch 0:  17%|█▋        | 200/1173 [01:27<07:07,  2.28it/s, loss=2.94, v_num=.]
Epoch 0:  19%|█▉        | 220/1173 [01:37<07:02,  2.25it/s, loss=3.15, v_num=.]
Epoch 0:  20%|██        | 240/1173 [01:47<06:57,  2.23it/s, loss=3.23, v_

[2m[36m(pid=1643)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_Stage1_tune_checkpoint_228f8_00002:
  auroc: 0.47464263439178467
  auroc_cross: 0.4897451400756836
  date: 2021-07-11_05-59-09
  done: false
  experiment_id: 21ed1fd3506342328606b025c798018c
  hostname: bd0789e721af
  iterations_since_restore: 1
  lossD: 0.000612126721534878
  lossG: 8.432461738586426
  node_ip: 172.28.0.2
  pid: 1643
  should_checkpoint: true
  time_since_restore: 569.0690321922302
  time_this_iter_s: 569.0690321922302
  time_total_s: 569.0690321922302
  timestamp: 1625983149
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 228f8_00002
  
== Status ==
Memory usage on this node: 4.9/12.7 GiB
PopulationBasedTraining: 1 checkpoints, 0 perturbs
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.51 GiB heap, 0.0/3.75 GiB objects (0.0/1.0 accelerator_type:T4, 0.0/1.0 GPU_group_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/2.0 CPU_group_0_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/1.0 GPU_group_0_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/2.0 CPU_group_ecd3f63

[2m[36m(pid=1643)[0m 2021-07-11 05:59:09,700	INFO trainable.py:76 -- Checkpoint size is 135937282 bytes
[2m[36m(pid=1643)[0m 2021-07-11 05:59:10,239	INFO trainable.py:76 -- Checkpoint size is 135937282 bytes
[2m[36m(pid=1852)[0m GPU available: True, used: True
[2m[36m(pid=1852)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=1852)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2m[36m(pid=1852)[0m 2021-07-11 05:59:30.239435: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
[2m[36m(pid=1852)[0m 
[2m[36m(pid=1852)[0m   | Name          | Type               | Params
[2m[36m(pid=1852)[0m -----------------------------------------------------
[2m[36m(pid=1852)[0m 0 | generator     | DCGANGenerator     | 1.1 M 
[2m[36m(pid=1852)[0m 1 | discriminator | DCGANDiscriminator | 2.8 M 
[2m[36m(pid=1852)[0m 2 | criterion     | BCELoss            | 0     
[2m[36m(pid=1852)[0m 3 | modelF    

[2m[36m(pid=1852)[0m Validation sanity check: 0it [00:00, ?it/s]
[2m[36m(pid=1852)[0m Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(pid=1852)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
[2m[36m(pid=1852)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


[2m[36m(pid=1852)[0m                                                               
Epoch 0:   0%|          | 0/1173 [00:00<?, ?it/s] 
Epoch 0:   2%|▏         | 20/1173 [00:06<05:59,  3.20it/s, loss=2.61, v_num=.]
Epoch 0:   3%|▎         | 40/1173 [00:11<05:35,  3.38it/s, loss=3.03, v_num=.]
Epoch 0:   5%|▌         | 60/1173 [00:17<05:28,  3.38it/s, loss=3.34, v_num=.]
Epoch 0:   7%|▋         | 80/1173 [00:26<06:06,  2.98it/s, loss=3.55, v_num=.]
Epoch 0:   9%|▊         | 100/1173 [00:36<06:27,  2.77it/s, loss=3.8, v_num=.] 
Epoch 0:  10%|█         | 120/1173 [00:45<06:40,  2.63it/s, loss=3.87, v_num=.]
Epoch 0:  12%|█▏        | 140/1173 [00:55<06:48,  2.53it/s, loss=4, v_num=.]   
Epoch 0:  14%|█▎        | 160/1173 [01:05<06:53,  2.45it/s, loss=3.99, v_num=.]
Epoch 0:  15%|█▌        | 180/1173 [01:15<06:55,  2.39it/s, loss=4.05, v_num=.]
Epoch 0:  17%|█▋        | 200/1173 [01:25<06:54,  2.35it/s, loss=4.11, v_num=.]
Epoch 0:  19%|█▉        | 220/1173 [01:34<06:51,  2.32it/s, loss

[2m[36m(pid=1852)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_Stage1_tune_checkpoint_228f8_00003:
  auroc: 0.500049889087677
  auroc_cross: 0.49964848160743713
  date: 2021-07-11_06-08-49
  done: false
  experiment_id: 85b52e42878e431c9780c84a9f24f97c
  hostname: bd0789e721af
  iterations_since_restore: 1
  lossD: 0.00028767611365765333
  lossG: 9.288405418395996
  node_ip: 172.28.0.2
  pid: 1852
  should_checkpoint: true
  time_since_restore: 571.7321963310242
  time_this_iter_s: 571.7321963310242
  time_total_s: 571.7321963310242
  timestamp: 1625983729
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 228f8_00003
  




== Status ==
Memory usage on this node: 5.2/12.7 GiB
PopulationBasedTraining: 2 checkpoints, 0 perturbs
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.51 GiB heap, 0.0/3.75 GiB objects (0.0/1.0 accelerator_type:T4, 0.0/2.0 CPU_group_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/2.0 CPU_group_0_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/1.0 GPU_group_0_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/1.0 GPU_group_ecd3f63ef35a3c653f2d69781ac9fe69)
Current best trial: 228f8_00003 with auroc=0.500049889087677 and parameters={'learning_rate': 0.0001, 'ngf': 32, 'ndf': 64, 'batch_size': 64}
Result logdir: /content/drive/MyDrive/Logs/Stage1_pbt_F
Number of trials: 10/10 (3 PAUSED, 6 PENDING, 1 RUNNING)
+------------------------------------------+----------+-----------------+-----------------+-------+-------+--------------+-----------+--------------+----------+---------------+----------------------+
| Trial name                               | status   | loc             |   learning_rate |   ngf |   ndf |   batc

[2m[36m(pid=1852)[0m 2021-07-11 06:08:50,774	INFO trainable.py:76 -- Checkpoint size is 135937282 bytes
[2m[36m(pid=1852)[0m 2021-07-11 06:08:51,315	INFO trainable.py:76 -- Checkpoint size is 135937282 bytes
[2m[36m(pid=2051)[0m GPU available: True, used: True
[2m[36m(pid=2051)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=2051)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2m[36m(pid=2051)[0m 2021-07-11 06:09:12.329882: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
[2m[36m(pid=2051)[0m 
[2m[36m(pid=2051)[0m   | Name          | Type               | Params
[2m[36m(pid=2051)[0m -----------------------------------------------------
[2m[36m(pid=2051)[0m 0 | generator     | DCGANGenerator     | 1.1 M 
[2m[36m(pid=2051)[0m 1 | discriminator | DCGANDiscriminator | 2.8 M 
[2m[36m(pid=2051)[0m 2 | criterion     | BCELoss            | 0     
[2m[36m(pid=2051)[0m 3 | modelF    

[2m[36m(pid=2051)[0m Validation sanity check: 0it [00:00, ?it/s]Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(pid=2051)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
[2m[36m(pid=2051)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


[2m[36m(pid=2051)[0m                                                               
Epoch 0:   0%|          | 0/1173 [00:00<?, ?it/s] 
Epoch 0:   2%|▏         | 20/1173 [00:06<06:05,  3.15it/s, loss=2.71, v_num=.]
Epoch 0:   3%|▎         | 40/1173 [00:12<05:44,  3.29it/s, loss=3.19, v_num=.]
Epoch 0:   5%|▌         | 60/1173 [00:19<05:59,  3.09it/s, loss=3.47, v_num=.]
Epoch 0:   7%|▋         | 80/1173 [00:28<06:33,  2.78it/s, loss=3.73, v_num=.]
Epoch 0:   9%|▊         | 100/1173 [00:38<06:52,  2.60it/s, loss=3.96, v_num=.]
Epoch 0:  10%|█         | 120/1173 [00:48<07:03,  2.49it/s, loss=3.96, v_num=.]
Epoch 0:  12%|█▏        | 140/1173 [00:58<07:09,  2.40it/s, loss=4.07, v_num=.]
Epoch 0:  14%|█▎        | 160/1173 [01:08<07:12,  2.34it/s, loss=4.13, v_num=.]
Epoch 0:  15%|█▌        | 180/1173 [01:18<07:12,  2.30it/s, loss=4.21, v_num=.]
Epoch 0:  17%|█▋        | 200/1173 [01:28<07:10,  2.26it/s, loss=4.23, v_num=.]
Epoch 0:  19%|█▉        | 220/1173 [01:38<07:06,  2.24it/s, loss

[2m[36m(pid=2051)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_Stage1_tune_checkpoint_228f8_00004:
  auroc: 0.4998002350330353
  auroc_cross: 0.4958365857601166
  date: 2021-07-11_06-18-41
  done: false
  experiment_id: b1f9e7363ff94f8686e051c875426844
  hostname: bd0789e721af
  iterations_since_restore: 1
  lossD: 0.00012563096242956817
  lossG: 9.67261028289795
  node_ip: 172.28.0.2
  pid: 2051
  should_checkpoint: true
  time_since_restore: 583.1923429965973
  time_this_iter_s: 583.1923429965973
  time_total_s: 583.1923429965973
  timestamp: 1625984321
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 228f8_00004
  
== Status ==
Memory usage on this node: 5.4/12.7 GiB
PopulationBasedTraining: 3 checkpoints, 0 perturbs
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.51 GiB heap, 0.0/3.75 GiB objects (0.0/1.0 GPU_group_0_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/1.0 accelerator_type:T4, 0.0/2.0 CPU_group_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/1.0 GPU_group_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/2.0 CPU_group_0_ecd3f63

[2m[36m(pid=2051)[0m 2021-07-11 06:18:42,361	INFO trainable.py:76 -- Checkpoint size is 135937282 bytes
[2m[36m(pid=2051)[0m 2021-07-11 06:18:42,860	INFO trainable.py:76 -- Checkpoint size is 135937282 bytes
[2m[36m(pid=2188)[0m GPU available: True, used: True
[2m[36m(pid=2188)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=2188)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2m[36m(pid=2188)[0m 2021-07-11 06:19:04.404932: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
[2m[36m(pid=2188)[0m 
[2m[36m(pid=2188)[0m   | Name          | Type               | Params
[2m[36m(pid=2188)[0m -----------------------------------------------------
[2m[36m(pid=2188)[0m 0 | generator     | DCGANGenerator     | 12.7 M
[2m[36m(pid=2188)[0m 1 | discriminator | DCGANDiscriminator | 2.8 M 
[2m[36m(pid=2188)[0m 2 | criterion     | BCELoss            | 0     
[2m[36m(pid=2188)[0m 3 | modelF    

[2m[36m(pid=2188)[0m Validation sanity check: 0it [00:00, ?it/s]Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(pid=2188)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
[2m[36m(pid=2188)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Epoch 0:   0%|          | 0/1173 [00:00<?, ?it/s] 
Epoch 0:   2%|▏         | 20/1173 [00:06<06:30,  2.95it/s, loss=2.85, v_num=.]
Epoch 0:   3%|▎         | 40/1173 [00:12<06:00,  3.14it/s, loss=3.39, v_num=.]
Epoch 0:   5%|▌         | 60/1173 [00:19<05:59,  3.09it/s, loss=3.62, v_num=.]
Epoch 0:   7%|▋         | 80/1173 [00:28<06:31,  2.79it/s, loss=3.62, v_num=.]
Epoch 0:   9%|▊         | 100/1173 [00:38<06:50,  2.61it/s, loss=3.65, v_num=.]
Epoch 0:  10%|█         | 120/1173 [00:47<07:00,  2.50it/s, loss=3.56, v_num=.]
Epoch 0:  12%|█▏        | 140/1173 [00:57<07:06,  2.42it/s, loss=3.65, v_num=.]
Epoch 0:  14%|█▎        | 160/1173 [01:07<07:08,  2.37it/s, loss=5.15, v_num=.]
Epoch 0:  15%|█▌        | 180/1173 [01:17<07:07,  2.32it/s, loss=3.55, v_num=.]
Epoch 0:  17%|█▋        | 200/1173 [01:27<07:05,  2.29it/s, loss=3.43, v_num=.]
Epoch 0:  19%|█▉        | 220/1173 [01:37<07:02,  2.26it/s, loss=2.87, v_num=.]
Epoch 0:  20%|██        | 240/1173 [01:47<06:58,  2.23it/s, loss=2.89, v_

[2m[36m(pid=2188)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_Stage1_tune_checkpoint_228f8_00005:
  auroc: 0.502286970615387
  auroc_cross: 0.4919411540031433
  date: 2021-07-11_06-28-39
  done: false
  experiment_id: 97f940ca42e54c6d8664a3c16e8ffffa
  hostname: bd0789e721af
  iterations_since_restore: 1
  lossD: 100.0
  lossG: 0.0
  node_ip: 172.28.0.2
  pid: 2188
  should_checkpoint: true
  time_since_restore: 589.414802312851
  time_this_iter_s: 589.414802312851
  time_total_s: 589.414802312851
  timestamp: 1625984919
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 228f8_00005
  
== Status ==
Memory usage on this node: 5.7/12.7 GiB
PopulationBasedTraining: 4 checkpoints, 0 perturbs
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.51 GiB heap, 0.0/3.75 GiB objects (0.0/1.0 GPU_group_0_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/1.0 accelerator_type:T4, 0.0/1.0 GPU_group_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/2.0 CPU_group_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/2.0 CPU_group_0_ecd3f63ef35a3c653f2d69781ac9fe69)
Current

[2m[36m(pid=2188)[0m 2021-07-11 06:28:40,749	INFO trainable.py:76 -- Checkpoint size is 274610498 bytes
[2m[36m(pid=2188)[0m 2021-07-11 06:28:41,805	INFO trainable.py:76 -- Checkpoint size is 274610498 bytes
[2m[36m(pid=2191)[0m GPU available: True, used: True
[2m[36m(pid=2191)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=2191)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2m[36m(pid=2191)[0m 2021-07-11 06:29:04.167224: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
[2m[36m(pid=2191)[0m 
[2m[36m(pid=2191)[0m   | Name          | Type               | Params
[2m[36m(pid=2191)[0m -----------------------------------------------------
[2m[36m(pid=2191)[0m 0 | generator     | DCGANGenerator     | 1.1 M 
[2m[36m(pid=2191)[0m 1 | discriminator | DCGANDiscriminator | 2.8 M 
[2m[36m(pid=2191)[0m 2 | criterion     | BCELoss            | 0     
[2m[36m(pid=2191)[0m 3 | modelF    

[2m[36m(pid=2191)[0m Validation sanity check: 0it [00:00, ?it/s]Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(pid=2191)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
[2m[36m(pid=2191)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


[2m[36m(pid=2191)[0m                                                               
Epoch 0:   0%|          | 0/1173 [00:00<?, ?it/s] 
Epoch 0:   2%|▏         | 20/1173 [00:06<06:27,  2.98it/s, loss=2.46, v_num=.]
Epoch 0:   3%|▎         | 40/1173 [00:12<06:03,  3.12it/s, loss=2.78, v_num=.]
Epoch 0:   5%|▌         | 60/1173 [00:20<06:27,  2.87it/s, loss=3.12, v_num=.]
Epoch 0:   7%|▋         | 80/1173 [00:31<07:06,  2.56it/s, loss=3.35, v_num=.]
Epoch 0:   9%|▊         | 100/1173 [00:41<07:28,  2.39it/s, loss=3.58, v_num=.]
Epoch 0:  10%|█         | 120/1173 [00:52<07:36,  2.31it/s, loss=3.78, v_num=.]
Epoch 0:  12%|█▏        | 140/1173 [01:02<07:41,  2.24it/s, loss=3.76, v_num=.]
Epoch 0:  14%|█▎        | 160/1173 [01:12<07:41,  2.20it/s, loss=3.84, v_num=.]
Epoch 0:  15%|█▌        | 180/1173 [01:23<07:38,  2.17it/s, loss=3.81, v_num=.]
Epoch 0:  17%|█▋        | 200/1173 [01:32<07:32,  2.15it/s, loss=3.87, v_num=.]
Epoch 0:  19%|█▉        | 220/1173 [01:42<07:25,  2.14it/s, loss

[2m[36m(pid=2191)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_Stage1_tune_checkpoint_228f8_00006:
  auroc: 0.5008350610733032
  auroc_cross: 0.49981212615966797
  date: 2021-07-11_06-38-49
  done: false
  experiment_id: 248d66ec7739450ba44e779c8ce972ca
  hostname: bd0789e721af
  iterations_since_restore: 1
  lossD: 0.0002792597806546837
  lossG: 8.948115348815918
  node_ip: 172.28.0.2
  pid: 2191
  should_checkpoint: true
  time_since_restore: 598.3700771331787
  time_this_iter_s: 598.3700771331787
  time_total_s: 598.3700771331787
  timestamp: 1625985529
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 228f8_00006
  
== Status ==
Memory usage on this node: 6.2/12.7 GiB
PopulationBasedTraining: 5 checkpoints, 0 perturbs
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.51 GiB heap, 0.0/3.75 GiB objects (0.0/1.0 GPU_group_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/1.0 accelerator_type:T4, 0.0/1.0 GPU_group_0_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/2.0 CPU_group_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/2.0 CPU_group_0_ecd3f6

[2m[36m(pid=2191)[0m 2021-07-11 06:38:50,267	INFO trainable.py:76 -- Checkpoint size is 135937282 bytes
[2m[36m(pid=2191)[0m 2021-07-11 06:38:50,810	INFO trainable.py:76 -- Checkpoint size is 135937282 bytes
[2m[36m(pid=2521)[0m GPU available: True, used: True
[2m[36m(pid=2521)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=2521)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2m[36m(pid=2521)[0m 2021-07-11 06:39:13.352508: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
[2m[36m(pid=2521)[0m 
[2m[36m(pid=2521)[0m   | Name          | Type               | Params
[2m[36m(pid=2521)[0m -----------------------------------------------------
[2m[36m(pid=2521)[0m 0 | generator     | DCGANGenerator     | 12.7 M
[2m[36m(pid=2521)[0m 1 | discriminator | DCGANDiscriminator | 2.8 M 
[2m[36m(pid=2521)[0m 2 | criterion     | BCELoss            | 0     
[2m[36m(pid=2521)[0m 3 | modelF    

[2m[36m(pid=2521)[0m Validation sanity check: 0it [00:00, ?it/s]Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(pid=2521)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
[2m[36m(pid=2521)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Epoch 0:   0%|          | 0/1173 [00:00<?, ?it/s] 
Epoch 0:   2%|▏         | 20/1173 [00:06<06:27,  2.98it/s, loss=2.91, v_num=.]
Epoch 0:   3%|▎         | 40/1173 [00:15<07:22,  2.56it/s, loss=3.32, v_num=.]
Epoch 0:   5%|▌         | 60/1173 [00:25<07:56,  2.33it/s, loss=3.54, v_num=.]
Epoch 0:   7%|▋         | 80/1173 [00:36<08:12,  2.22it/s, loss=3.65, v_num=.]
Epoch 0:   9%|▊         | 100/1173 [00:46<08:18,  2.15it/s, loss=3.69, v_num=.]
Epoch 0:  10%|█         | 120/1173 [00:57<08:20,  2.10it/s, loss=3.61, v_num=.]
Epoch 0:  12%|█▏        | 140/1173 [01:07<08:20,  2.06it/s, loss=3.59, v_num=.]
Epoch 0:  14%|█▎        | 160/1173 [01:18<08:14,  2.05it/s, loss=3.62, v_num=.]
Epoch 0:  15%|█▌        | 180/1173 [01:28<08:08,  2.03it/s, loss=3.57, v_num=.]
Epoch 0:  17%|█▋        | 200/1173 [01:38<08:00,  2.03it/s, loss=3.58, v_num=.]
Epoch 0:  19%|█▉        | 220/1173 [01:48<07:51,  2.02it/s, loss=3.58, v_num=.]
Epoch 0:  20%|██        | 240/1173 [01:58<07:42,  2.02it/s, loss=3.54, v_

[2m[36m(pid=2521)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_Stage1_tune_checkpoint_228f8_00007:
  auroc: 0.5541192889213562
  auroc_cross: 0.5520529747009277
  date: 2021-07-11_06-49-14
  done: false
  experiment_id: 20de904516d8412688144064562e4339
  hostname: bd0789e721af
  iterations_since_restore: 1
  lossD: 0.012883458286523819
  lossG: 5.992850303649902
  node_ip: 172.28.0.2
  pid: 2521
  should_checkpoint: true
  time_since_restore: 615.6806035041809
  time_this_iter_s: 615.6806035041809
  time_total_s: 615.6806035041809
  timestamp: 1625986154
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 228f8_00007
  
== Status ==
Memory usage on this node: 6.4/12.7 GiB
PopulationBasedTraining: 6 checkpoints, 0 perturbs
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.51 GiB heap, 0.0/3.75 GiB objects (0.0/1.0 GPU_group_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/2.0 CPU_group_0_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/1.0 GPU_group_0_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/1.0 accelerator_type:T4, 0.0/2.0 CPU_group_ecd3f63e

[2m[36m(pid=2521)[0m 2021-07-11 06:49:15,322	INFO trainable.py:76 -- Checkpoint size is 274610498 bytes
[2m[36m(pid=2521)[0m 2021-07-11 06:49:16,488	INFO trainable.py:76 -- Checkpoint size is 274610498 bytes
[2m[36m(pid=2650)[0m GPU available: True, used: True
[2m[36m(pid=2650)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=2650)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2m[36m(pid=2650)[0m 2021-07-11 06:49:36.122229: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
[2m[36m(pid=2650)[0m 
[2m[36m(pid=2650)[0m   | Name          | Type               | Params
[2m[36m(pid=2650)[0m -----------------------------------------------------
[2m[36m(pid=2650)[0m 0 | generator     | DCGANGenerator     | 1.1 M 
[2m[36m(pid=2650)[0m 1 | discriminator | DCGANDiscriminator | 2.8 M 
[2m[36m(pid=2650)[0m 2 | criterion     | BCELoss            | 0     
[2m[36m(pid=2650)[0m 3 | modelF    

[2m[36m(pid=2650)[0m Validation sanity check: 0it [00:00, ?it/s]Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(pid=2650)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
[2m[36m(pid=2650)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Epoch 0:   0%|          | 0/1173 [00:00<?, ?it/s] 
Epoch 0:   2%|▏         | 20/1173 [00:06<06:34,  2.92it/s, loss=2.62, v_num=.]
Epoch 0:   3%|▎         | 40/1173 [00:13<06:08,  3.07it/s, loss=3.13, v_num=.]
Epoch 0:   5%|▌         | 60/1173 [00:19<05:58,  3.10it/s, loss=3.53, v_num=.]
Epoch 0:   7%|▋         | 80/1173 [00:29<06:47,  2.68it/s, loss=3.79, v_num=.]
Epoch 0:   9%|▊         | 100/1173 [00:40<07:16,  2.46it/s, loss=3.91, v_num=.]
Epoch 0:  10%|█         | 120/1173 [00:51<07:30,  2.34it/s, loss=4.11, v_num=.]
Epoch 0:  12%|█▏        | 140/1173 [01:02<07:38,  2.25it/s, loss=4.08, v_num=.]
Epoch 0:  14%|█▎        | 160/1173 [01:12<07:41,  2.19it/s, loss=4.06, v_num=.]
Epoch 0:  15%|█▌        | 180/1173 [01:23<07:39,  2.16it/s, loss=4.13, v_num=.]
Epoch 0:  17%|█▋        | 200/1173 [01:33<07:35,  2.13it/s, loss=4.16, v_num=.]
Epoch 0:  19%|█▉        | 220/1173 [01:44<07:31,  2.11it/s, loss=11.7, v_num=.]
Epoch 0:  20%|██        | 240/1173 [01:54<07:25,  2.10it/s, loss=4.48, v_

[2m[36m(pid=2650)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."
2021-07-11 06:59:37,971	INFO pbt.py:543 -- [exploit] transferring weights from trial train_Stage1_tune_checkpoint_228f8_00006 (score 0.5008350610733032) -> train_Stage1_tune_checkpoint_228f8_00008 (score 0.4650954008102417)
2021-07-11 06:59:37,978	INFO pbt.py:558 -- [explore] perturbed config from {'learning_rate': 0.0001, 'batch_size': 64} -> {'learning_rate': 0.00041336110351827344, 'batch_size': 128}


Result for train_Stage1_tune_checkpoint_228f8_00008:
  auroc: 0.4650954008102417
  auroc_cross: 0.5
  date: 2021-07-11_06-59-37
  done: false
  experiment_id: afc6a7241f6144408dc823a1e931ac1c
  hostname: bd0789e721af
  iterations_since_restore: 1
  lossD: 0.0007319121505133808
  lossG: 7.817005634307861
  node_ip: 172.28.0.2
  pid: 2650
  should_checkpoint: true
  time_since_restore: 614.1262702941895
  time_this_iter_s: 614.1262702941895
  time_total_s: 614.1262702941895
  timestamp: 1625986777
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 228f8_00008
  
== Status ==
Memory usage on this node: 7.0/12.7 GiB
PopulationBasedTraining: 6 checkpoints, 1 perturbs
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.51 GiB heap, 0.0/3.75 GiB objects (0.0/2.0 CPU_group_0_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/1.0 accelerator_type:T4, 0.0/1.0 GPU_group_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/2.0 CPU_group_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/1.0 GPU_group_0_ecd3f63ef35a3c653f2d69

[2m[36m(pid=2650)[0m 2021-07-11 06:59:38,850	INFO trainable.py:378 -- Restored on 172.28.0.2 from checkpoint: /content/drive/MyDrive/Logs/Stage1_pbt_F/train_Stage1_tune_checkpoint_228f8_00008_8_ndf=64,ngf=32_2021-07-11_06-38-50/checkpoint_tmp83b1bf/./
[2m[36m(pid=2650)[0m 2021-07-11 06:59:38,850	INFO trainable.py:385 -- Current state after restoring: {'_iteration': 1, '_timesteps_total': None, '_time_total': 598.3700771331787, '_episodes_total': None}
[2m[36m(pid=2650)[0m 2021-07-11 06:59:40,204	INFO trainable.py:76 -- Checkpoint size is 135937282 bytes


== Status ==
Memory usage on this node: 4.4/12.7 GiB
PopulationBasedTraining: 6 checkpoints, 1 perturbs
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.51 GiB heap, 0.0/3.75 GiB objects (0.0/1.0 accelerator_type:T4, 0.0/1.0 GPU_group_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/2.0 CPU_group_0_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/1.0 GPU_group_0_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/2.0 CPU_group_ecd3f63ef35a3c653f2d69781ac9fe69)
Current best trial: 228f8_00007 with auroc=0.5541192889213562 and parameters={'learning_rate': 0.0001, 'ngf': 128, 'ndf': 64, 'batch_size': 64}
Result logdir: /content/drive/MyDrive/Logs/Stage1_pbt_F
Number of trials: 10/10 (8 PAUSED, 1 PENDING, 1 RUNNING)
+------------------------------------------+----------+-------+-----------------+-------+-------+--------------+-----------+---------------+----------+---------------+----------------------+
| Trial name                               | status   | loc   |   learning_rate |   ngf |   ndf |   batch_size |     loss

[2m[36m(pid=2845)[0m GPU available: True, used: True
[2m[36m(pid=2845)[0m TPU available: False, using: 0 TPU cores


== Status ==
Memory usage on this node: 4.6/12.7 GiB
PopulationBasedTraining: 6 checkpoints, 1 perturbs
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.51 GiB heap, 0.0/3.75 GiB objects (0.0/1.0 accelerator_type:T4, 0.0/1.0 GPU_group_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/2.0 CPU_group_0_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/1.0 GPU_group_0_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/2.0 CPU_group_ecd3f63ef35a3c653f2d69781ac9fe69)
Current best trial: 228f8_00007 with auroc=0.5541192889213562 and parameters={'learning_rate': 0.0001, 'ngf': 128, 'ndf': 64, 'batch_size': 64}
Result logdir: /content/drive/MyDrive/Logs/Stage1_pbt_F
Number of trials: 10/10 (8 PAUSED, 1 PENDING, 1 RUNNING)
+------------------------------------------+----------+-------+-----------------+-------+-------+--------------+-----------+---------------+----------+---------------+----------------------+
| Trial name                               | status   | loc   |   learning_rate |   ngf |   ndf |   batch_size |     loss

[2m[36m(pid=2845)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2m[36m(pid=2845)[0m 2021-07-11 07:00:01.571625: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
[2m[36m(pid=2845)[0m 
[2m[36m(pid=2845)[0m   | Name          | Type               | Params
[2m[36m(pid=2845)[0m -----------------------------------------------------
[2m[36m(pid=2845)[0m 0 | generator     | DCGANGenerator     | 12.7 M
[2m[36m(pid=2845)[0m 1 | discriminator | DCGANDiscriminator | 693 K 
[2m[36m(pid=2845)[0m 2 | criterion     | BCELoss            | 0     
[2m[36m(pid=2845)[0m 3 | modelF        | LensResnet         | 11.2 M
[2m[36m(pid=2845)[0m 4 | modelJ        | LensResnet         | 11.2 M
[2m[36m(pid=2845)[0m -----------------------------------------------------
[2m[36m(pid=2845)[0m 35.7 M    Trainable params
[2m[36m(pid=2845)[0m 0         Non-trainable params
[2m[36m(pid=2845)[0m 35.7 M    Total param

[2m[36m(pid=2845)[0m Validation sanity check: 0it [00:00, ?it/s]Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(pid=2845)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
[2m[36m(pid=2845)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


[2m[36m(pid=2845)[0m                                                               Training: 0it [00:00, ?it/s]
Epoch 0:   0%|          | 0/1173 [00:00<?, ?it/s] 
Epoch 0:   2%|▏         | 20/1173 [00:06<06:41,  2.87it/s, loss=1.69, v_num=.]
Epoch 0:   3%|▎         | 40/1173 [00:13<06:31,  2.90it/s, loss=1.98, v_num=.]
Epoch 0:   5%|▌         | 60/1173 [00:24<07:37,  2.43it/s, loss=2.01, v_num=.]
Epoch 0:   7%|▋         | 80/1173 [00:35<08:04,  2.26it/s, loss=2.03, v_num=.]
Epoch 0:   9%|▊         | 100/1173 [00:46<08:18,  2.15it/s, loss=2.08, v_num=.]
Epoch 0:  10%|█         | 120/1173 [00:57<08:21,  2.10it/s, loss=2.18, v_num=.]
Epoch 0:  12%|█▏        | 140/1173 [01:07<08:19,  2.07it/s, loss=2.17, v_num=.]
Epoch 0:  14%|█▎        | 160/1173 [01:18<08:15,  2.04it/s, loss=2.19, v_num=.]
Epoch 0:  15%|█▌        | 180/1173 [01:28<08:10,  2.03it/s, loss=2.24, v_num=.]
Epoch 0:  17%|█▋        | 200/1173 [01:39<08:02,  2.02it/s, loss=2.26, v_num=.]
Epoch 0:  19%|█▉        | 220/1173 

[2m[36m(pid=2845)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."
2021-07-11 07:10:16,745	INFO pbt.py:543 -- [exploit] transferring weights from trial train_Stage1_tune_checkpoint_228f8_00005 (score 0.502286970615387) -> train_Stage1_tune_checkpoint_228f8_00009 (score 0.43624940514564514)
2021-07-11 07:10:16,747	INFO pbt.py:558 -- [explore] perturbed config from {'learning_rate': 0.0001, 'batch_size': 64} -> {'learning_rate': 0.00012, 'batch_size': 32}


Result for train_Stage1_tune_checkpoint_228f8_00009:
  auroc: 0.43624940514564514
  auroc_cross: 0.4027217924594879
  date: 2021-07-11_07-10-16
  done: false
  experiment_id: 8df578031a8d4345b8903f16ea692868
  hostname: bd0789e721af
  iterations_since_restore: 1
  lossD: 0.004046842455863953
  lossG: 6.178808212280273
  node_ip: 172.28.0.2
  pid: 2845
  should_checkpoint: true
  time_since_restore: 628.9234342575073
  time_this_iter_s: 628.9234342575073
  time_total_s: 628.9234342575073
  timestamp: 1625987416
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 228f8_00009
  
== Status ==
Memory usage on this node: 7.1/12.7 GiB
PopulationBasedTraining: 6 checkpoints, 2 perturbs
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.51 GiB heap, 0.0/3.75 GiB objects (0.0/2.0 CPU_group_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/1.0 GPU_group_0_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/2.0 CPU_group_0_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/1.0 accelerator_type:T4, 0.0/1.0 GPU_group_ecd3f63

[2m[36m(pid=2845)[0m 2021-07-11 07:10:18,185	INFO trainable.py:378 -- Restored on 172.28.0.2 from checkpoint: /content/drive/MyDrive/Logs/Stage1_pbt_F/train_Stage1_tune_checkpoint_228f8_00009_9_ndf=32,ngf=128_2021-07-11_06-49-16/checkpoint_tmp441b2a/./
[2m[36m(pid=2845)[0m 2021-07-11 07:10:18,186	INFO trainable.py:385 -- Current state after restoring: {'_iteration': 1, '_timesteps_total': None, '_time_total': 589.414802312851, '_episodes_total': None}
[2m[36m(pid=2845)[0m 2021-07-11 07:10:23,553	INFO trainable.py:76 -- Checkpoint size is 274610498 bytes
2021-07-11 07:10:31,808	INFO trainable.py:76 -- Checkpoint size is 373840194 bytes


== Status ==
Memory usage on this node: 5.1/12.7 GiB
PopulationBasedTraining: 6 checkpoints, 2 perturbs
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.51 GiB heap, 0.0/3.75 GiB objects (0.0/2.0 CPU_group_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/1.0 GPU_group_0_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/1.0 GPU_group_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/2.0 CPU_group_0_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/1.0 accelerator_type:T4)
Current best trial: 228f8_00007 with auroc=0.5541192889213562 and parameters={'learning_rate': 0.0001, 'ngf': 128, 'ndf': 64, 'batch_size': 64}
Result logdir: /content/drive/MyDrive/Logs/Stage1_pbt_F
Number of trials: 10/10 (9 PAUSED, 1 RUNNING)
+------------------------------------------+----------+-------+-----------------+-------+-------+--------------+-----------+---------------+----------+---------------+----------------------+
| Trial name                               | status   | loc   |   learning_rate |   ngf |   ndf |   batch_size |     lossG |        

[2m[36m(pid=2978)[0m 2021-07-11 07:10:38,658	INFO trainable.py:378 -- Restored on 172.28.0.2 from checkpoint: /content/drive/MyDrive/Logs/Stage1_pbt_F/train_Stage1_tune_checkpoint_228f8_00001_1_ndf=128,ngf=128_2021-07-11_05-30-43/checkpoint_tmpc15e82/./
[2m[36m(pid=2978)[0m 2021-07-11 07:10:38,658	INFO trainable.py:385 -- Current state after restoring: {'_iteration': 1, '_timesteps_total': None, '_time_total': 558.8177676200867, '_episodes_total': None}
[2m[36m(pid=2978)[0m GPU available: True, used: True
[2m[36m(pid=2978)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=2978)[0m 2021-07-11 07:10:39,302	ERROR function_runner.py:254 -- Runner Thread raised error.
[2m[36m(pid=2978)[0m Traceback (most recent call last):
[2m[36m(pid=2978)[0m   File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 248, in run
[2m[36m(pid=2978)[0m     self._entrypoint()
[2m[36m(pid=2978)[0m   File "/usr/local/lib/python3.7/dist-packages/ray/tune/fun

Result for train_Stage1_tune_checkpoint_228f8_00001:
  auroc: 0.3095346987247467
  auroc_cross: 0.381914347410202
  date: 2021-07-11_05-49-29
  done: false
  experiment_id: f111bebca8014281875ff9bfc982c872
  experiment_tag: 1_ndf=128,ngf=128
  hostname: bd0789e721af
  iterations_since_restore: 1
  lossD: 79.15614318847656
  lossG: 100.0
  node_ip: 172.28.0.2
  pid: 1265
  should_checkpoint: true
  time_since_restore: 558.8177676200867
  time_this_iter_s: 558.8177676200867
  time_total_s: 558.8177676200867
  timestamp: 1625982569
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 228f8_00001
  
== Status ==
Memory usage on this node: 5.1/12.7 GiB
PopulationBasedTraining: 6 checkpoints, 2 perturbs
Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/7.51 GiB heap, 0.0/3.75 GiB objects (0.0/1.0 GPU_group_0_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/2.0 CPU_group_0_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/1.0 GPU_group_ecd3f63ef35a3c653f2d69781ac9fe69, 0.0/2.0 CPU_group_ecd3f63ef35a3c653f2d6

TuneError: ignored

In [18]:
# Print the error.txt that sits in the log directory of trial 228f8_00001
# (under the Stage1_pbt_F experiment folder).
!cat /content/drive/MyDrive/Logs/Stage1_pbt_F/train_Stage1_tune_checkpoint_228f8_00001_1_ndf=128,ngf=128_2021-07-11_05-30-43/error.txt

Failure # 1 (occurred at 2021-07-11_07-10-39)
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/ray/tune/trial_runner.py", line 718, in _process_trial
    results = self.trial_executor.fetch_result(trial)
  File "/usr/local/lib/python3.7/dist-packages/ray/tune/ray_trial_executor.py", line 688, in fetch_result
    result = ray.get(trial_future[0], timeout=DEFAULT_GET_TIMEOUT)
  File "/usr/local/lib/python3.7/dist-packages/ray/_private/client_mode_hook.py", line 62, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.7/dist-packages/ray/worker.py", line 1495, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(TuneError): [36mray::ImplicitFunc.train_buffered()[39m (pid=2978, ip=172.28.0.2)
  File "python/ray/_raylet.pyx", line 501, in ray._raylet.execute_task
  File "python/ray/_raylet.pyx", line 451, in ray._raylet.execute_task.function_executor
  File "/usr/local/lib/python3.7/dist-packages/ray/_private/funct

In [None]:
def pretrained_Stage1s():
    # Load two previously trained Stage1 models from their checkpoints and
    # return them as the tuple (f, j): `f` comes from the Logs/Stage1_F
    # directory and `j` from the Logs/Stage1_J directory.
    #
    # FIXME: this cell is an unfilled template and is not valid Python yet.
    # Every value in the two `config` dicts below is blank, and both directory
    # names stop at 'train_Stage1_tune_checkpoint_' (presumably the trial
    # suffix still has to be appended -- confirm against the real log folders).

    # map_location returns each storage unchanged, so tensors are not moved to
    # another device while the checkpoint is being read.
    ckptf = pl_load(os.path.join(
        '/content/drive/MyDrive/Logs/Stage1_F/train_Stage1_tune_checkpoint_',
        'checkpoint'), map_location=lambda storage, loc: storage)
    # Rebuild the model from the checkpoint dict; the hyper-parameters are
    # supplied through `config` (values still to be filled in).
    f = Stage1._load_model_state(ckptf, config={'batch_size': , 'ngf': , 'ndf': , 'learning_rate': })
    # Same procedure for the second model, read from the Stage1_J logs.
    ckptj = pl_load(os.path.join(
        '/content/drive/MyDrive/Logs/Stage1_J/train_Stage1_tune_checkpoint_',
        'checkpoint'), map_location=lambda storage, loc: storage)
    j = Stage1._load_model_state(ckptj, config={'batch_size': , 'ngf': , 'ndf': , 'learning_rate': })
    return f, j

In [None]:
class Generator2(nn.Module):
    """Image-to-image generator: refines an input image into a larger one.

    The network has three stages:

    1. ``preprocessing`` -- a stride-2 convolution that halves the spatial
       size of the input and lifts it to ``ngf`` channels.
    2. ``residual`` + ``ending_residual`` -- six ``BasicBlock`` modules
       (``BasicBlock`` is defined outside this cell) followed by a
       conv/batch-norm/ReLU; their output is added back onto the
       pre-processed features (one long skip connection).
    3. ``main`` -- two nearest-neighbour up-sampling steps, first to
       ``image_width // 2`` and then to ``image_width``, each followed by a
       size-preserving 3x3 convolution; the last one maps back to
       ``image_channels`` and is squashed by ``tanh``.

    Args:
        ngf: number of feature maps used inside the generator.
        image_channels: number of channels of both the input and the output.
        image_width: side length (in pixels) of the square output image.
            Defaults to 150, the value that used to be hard-coded.

    Raises:
        ValueError: if ``image_width`` is smaller than 2, because the
            intermediate up-sampling target ``image_width // 2`` would be 0.
    """

    # Number of residual blocks stacked between pre-processing and up-sampling.
    _NUM_RESIDUAL_BLOCKS = 6

    def __init__(self, ngf: int = 128, image_channels: int = 1, image_width: int = 150):
        super().__init__()

        if image_width < 2:
            raise ValueError(f'image_width must be at least 2, got {image_width}')

        ker, strd = 4, 2
        pad = (ker - 2) // 2
        res_ker, res_strd, res_pad = 3, 1, 1

        # Stride-2 convolution: halves the spatial size (e.g. 64 -> 32).
        self.preprocessing = nn.Sequential(
            nn.Conv2d(image_channels, ngf, ker, strd, pad, bias=False),
            nn.ReLU(True)
        )
        # Residual trunk; channel count and spatial size are left unchanged.
        self.residual = nn.Sequential(
            *[BasicBlock(ngf, ngf) for _ in range(self._NUM_RESIDUAL_BLOCKS)]
        )
        self.ending_residual = nn.Sequential(
            nn.Conv2d(ngf, ngf, res_ker, res_strd, res_pad, bias=False),
            nn.BatchNorm2d(ngf),
            nn.ReLU(True)
        )

        # forward() adds the pre-processed features onto the trunk output
        # before the tensor enters `main`.

        mode = 'nearest'  # up-sampling method is nearest-neighbour
        self.main = nn.Sequential(
            # up-sample to half of the target size (75 for the default 150)
            nn.Upsample(image_width // 2, mode=mode),
            nn.Conv2d(ngf, ngf*4, res_ker, res_strd, res_pad, bias=False),
            nn.BatchNorm2d(ngf*4),
            nn.ReLU(True),
            # up-sample to the full target size (roughly another factor of 2)
            nn.Upsample(image_width, mode=mode),
            nn.Conv2d(ngf*4, image_channels, res_ker, res_strd, res_pad, bias=False),
            nn.Tanh()
        )

    def forward(self, in_x):
        """Map a batch of images to refined images.

        Args:
            in_x: tensor of shape ``(N, image_channels, H, W)``.

        Returns:
            Tensor of shape ``(N, image_channels, image_width, image_width)``
            with values in ``[-1, 1]`` (output of ``tanh``).
        """
        x_p = self.preprocessing(in_x)
        x_r = self.ending_residual(self.residual(x_p))
        # Long skip connection around the whole residual trunk.
        x_f = x_r + x_p
        return self.main(x_f)

In [None]:
class Stage2(DCGAN):
    """Second-stage GAN built on top of ``DCGAN``.

    The inherited generator is replaced by ``Generator2`` and the inherited
    discriminator receives one additional block. Instead of raw latent noise,
    the generator is fed the output of a pre-trained first-stage model (see
    ``_get_noise``).

    NOTE(review): ``self.modelF``, ``self.modelJ`` and
    ``self.hparams.num_classes`` are used by the validation hooks but are not
    created in this class's ``__init__`` -- confirm that they are provided
    elsewhere before running validation.

    Args:
        checkpoint_dir: accepted for interface compatibility; not read inside
            ``__init__``.
        *args, **kwargs: forwarded unchanged to ``DCGAN.__init__``.
    """

    def __init__(self, checkpoint_dir: str, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.save_hyperparameters()

        self.generator = Generator2(self.hparams.feature_maps_gen, self.hparams.image_channels)

        # Mutate this instance's discriminator: splice one more block in at
        # position 2 of its layer stack, then re-initialise all its weights.
        extra = self.discriminator._make_disc_block(self.hparams.feature_maps_disc * 2, self.hparams.feature_maps_disc * 2)
        layers = list(self.discriminator.disc)
        layers.insert(2, extra)
        self.discriminator.disc = nn.Sequential(*layers)
        self.discriminator.apply(self._weights_init)

    def forward(self, noise):
        """Run the generator on ``noise`` (here: a batch of low-res images)."""
        return self.generator(noise)

    def _disc_step(self, real: torch.Tensor) -> torch.Tensor:
        """Compute and log the discriminator loss for one batch of real images."""
        disc_loss = self._get_disc_loss(real)
        self.log('Stage2/D/train/loss', disc_loss, on_epoch=True)
        return disc_loss

    def _gen_step(self, real: torch.Tensor) -> torch.Tensor:
        """Compute and log the generator loss for one batch of real images."""
        gen_loss = self._get_gen_loss(real)
        self.log('Stage2/G/train/loss', gen_loss, on_epoch=True)
        return gen_loss

    def _get_noise(self, n_samples: int, latent_dim: int):
        """Return the generator input: first-stage output for random latents.

        Despite its name this does not return raw noise. It draws
        ``(n_samples, latent_dim)`` standard-normal latents on ``self.device``
        and passes them through the pre-trained first-stage model.

        The first-stage model is loaded lazily on the first call and cached in
        ``self.lowres``. ``getattr(self, 'lowres', <default>)`` must not be
        used for this: its default expression is evaluated on every call, which
        would reload the checkpoints from disk each time.
        """
        # Loading via `load_from_checkpoint` currently leads to errors, hence
        # the workaround through `pretrained_Stage1s`.
        if not hasattr(self, 'lowres'):
            # Index 0 selects the first model of the returned pair; choose the
            # one matching the data this stage is trained on.
            self.lowres = pretrained_Stage1s()[0]
        return self.lowres(torch.randn(n_samples, latent_dim, device=self.device))

    def validation_step(self, batch, batch_idx):
        """Generate one image per label and score it with both classifiers.

        Returns:
            dict with the predictions of ``modelF`` (``'predF'``) and
            ``modelJ`` (``'predJ'``) on the generated images, plus the batch
            labels under ``'target'``.
        """
        _, labels = batch
        # `forward` takes a single tensor and the generator starts with an
        # image convolution, so feed it the same first-stage output that the
        # training path uses rather than flat latent noise plus labels.
        out = self(self._get_noise(labels.shape[0], self.hparams.latent_dim))
        # out = F.interpolate(out_64, 150)
        return {'predF': self.modelF(out), 'predJ': self.modelJ(out), 'target': labels}

    def validation_epoch_end(self, listofDicts):
        """Log per-classifier AUROC and plot/save the multi-class ROC curves.

        Args:
            listofDicts: the dicts returned by ``validation_step`` for every
                validation batch of the epoch.
        """
        target = torch.cat([x['target'] for x in listofDicts])
        fig, axes = plt.subplots(1, 2, subplot_kw={'xlim': [0, 1], 'xlabel': 'False Positive Rate',
                                                   'ylim': [0, 1.05], 'ylabel': 'True Positive Rate'},
                                 figsize=[11, 5])
        num_classes = self.hparams.num_classes
        for ax, letter in zip(axes, ('F', 'J')):
            prediction = torch.cat([x['pred' + letter] for x in listofDicts])
            aurocTensor = tm.functional.auroc(prediction, target, num_classes=num_classes, average=None)
            # The worst per-class AUROC is the value that gets logged.
            self.log('Stage2/ResNet(' + letter + ')/val/auroc', aurocTensor.min())
            fprList, tprList, _ = tm.functional.roc(prediction, target, num_classes=num_classes)

            colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
            for i, color in zip(range(num_classes), colors):
                ax.plot(fprList[i].cpu(), tprList[i].cpu(), color=color,
                        label='ROC curve of class {0} (area = {1:0.2f})'
                        ''.format(i, aurocTensor[i].cpu()))
            post_plotting(ax)
            ax.set_title('Multi-class ROC (' + letter + ')')

        fig.tight_layout()
        self.logger.experiment.add_figure('Stage2/ResNet/val/ROC', fig)
        # os.path.join keeps the file inside the trial directory whether or not
        # the returned path ends with a separator.
        fig.savefig(os.path.join(str(tune.get_trial_dir()),
                                 'ROC_epoch_' + str(self.current_epoch) + '.pdf'))

In [None]:
# __tune_train_checkpoint_begin
def train_Stage2_tune_checkpoint(config, checkpoint_dir=None, num_epochs=10, num_gpus=1):
    """Trainable for Ray Tune: fit one ``Stage2`` model with Lightning.

    Args:
        config: hyper-parameter dict of the trial; passed to the data module
            and to the model.
        checkpoint_dir: directory holding a file named ``checkpoint`` to
            resume from, or a falsy value to start from scratch.
        num_epochs: value given to the trainer as ``max_epochs``.
        num_gpus: GPUs for the trial; fractional values are rounded up.
    """
    # Tune metric name -> name under which the Lightning module logs it.
    reported_metrics = {
        'lossG': 'Stage2/G/train/loss',
        'lossD': 'Stage2/D/train/loss',
        'auroc': 'Stage2/ResNet(F)/val/auroc',
        'auroc_cross': 'Stage2/ResNet(J)/val/auroc',
    }
    # TensorBoard event files go straight into the trial directory.
    tb_logger = TensorBoardLogger(save_dir=tune.get_trial_dir(), name='', version='.')

    trainer = pl.Trainer(
        max_epochs=num_epochs,
        prepare_data_per_node=False,
        # If fractional GPUs passed in, convert to int.
        gpus=math.ceil(num_gpus),
        logger=tb_logger,
        callbacks=[TuneReportCheckpointCallback(reported_metrics)],
    )
    dm = NpyDataModule(config, 64)

    if not checkpoint_dir:
        model = Stage2(config)
    else:
        # `Stage2.load_from_checkpoint` currently leads to errors, so the
        # checkpoint dict is read by hand and the model state restored from it.
        ckpt_file = os.path.join(checkpoint_dir, 'checkpoint')
        ckpt = pl_load(ckpt_file, map_location=lambda storage, loc: storage)
        model = Stage2._load_model_state(ckpt, config=config)
        # Continue counting epochs from where the checkpoint left off.
        trainer.current_epoch = ckpt['epoch']

    trainer.fit(model, dm)
# __tune_train_checkpoint_end__


# __tune_asha_begin__
def tune_Stage2_asha(num_samples=10, num_epochs=10, gpus_per_trial=1):
    """Run a hyper-parameter search for Stage2 with the ASHA scheduler.

    Args:
        num_samples: number of trials sampled from the search space.
        num_epochs: epochs per trial; also the scheduler's ``max_t``.
        gpus_per_trial: GPUs reserved for each trial.

    Prints the best configuration (highest ``auroc``) when the run finishes.
    """
    trainable = tune.with_parameters(train_Stage2_tune_checkpoint,
                                     num_epochs=num_epochs,
                                     num_gpus=gpus_per_trial)

    # Learning rate plus three size-like parameters drawn from the same grid.
    search_space = {'learning_rate': tune.choice([1e-5, 1e-4, 1e-3])}
    search_space.update(
        {key: tune.choice([128, 64, 32]) for key in ('ngf', 'ndf', 'batch_size')}
    )

    scheduler = ASHAScheduler(max_t=num_epochs, grace_period=1, reduction_factor=2)
    reporter = CLIReporter(
        parameter_columns=['learning_rate', 'ngf', 'ndf', 'batch_size'],
        metric_columns=['lossG', 'lossD', 'auroc', 'auroc_cross', 'training_iteration'],
    )

    analysis = tune.run(
        trainable,
        name='Stage2_F',
        metric='auroc',
        mode='max',
        config=search_space,
        resources_per_trial={'cpu': 2, 'gpu': gpus_per_trial},
        num_samples=num_samples,
        local_dir='./drive/MyDrive/Logs',
        scheduler=scheduler,
        progress_reporter=reporter,
        fail_fast=True,
    )

    print('Best hyperparameters found were: ', analysis.best_config)

# __tune_asha_end__


# __tune_pbt_begin__
def tune_Stage2_pbt(num_samples=10, num_epochs=10, gpus_per_trial=1):
    """Run a hyper-parameter search for Stage2 with Population Based Training.

    Args:
        num_samples: number of trials (population size).
        num_epochs: epochs each trial is trained for.
        gpus_per_trial: GPUs reserved for each trial.

    Only ``learning_rate`` and ``batch_size`` are mutated by the scheduler;
    ``ngf`` and ``ndf`` keep the value a trial was started with. Prints the
    best configuration (highest ``auroc``) when the run finishes.
    """
    trainable = tune.with_parameters(train_Stage2_tune_checkpoint,
                                     num_epochs=num_epochs,
                                     num_gpus=gpus_per_trial)

    # Initial search space: learning rate plus three size-like parameters.
    search_space = {'learning_rate': tune.choice([1e-5, 1e-4, 1e-3])}
    search_space.update(
        {key: tune.choice([128, 64, 32]) for key in ('ngf', 'ndf', 'batch_size')}
    )

    # Hyper-parameters the scheduler is allowed to perturb during training.
    mutations = {
        'learning_rate': tune.choice([1e-5, 1e-4, 1e-3]),
        'batch_size': tune.choice([128, 64, 32]),
    }
    scheduler = PopulationBasedTraining(perturbation_interval=4,
                                        hyperparam_mutations=mutations)
    reporter = CLIReporter(
        parameter_columns=['learning_rate', 'ngf', 'ndf', 'batch_size'],
        metric_columns=['lossG', 'lossD', 'auroc', 'auroc_cross', 'training_iteration'],
    )

    analysis = tune.run(
        trainable,
        name='Stage2_F',
        metric='auroc',
        mode='max',
        config=search_space,
        resources_per_trial={'cpu': 2, 'gpu': gpus_per_trial},
        num_samples=num_samples,
        local_dir='./drive/MyDrive/Logs',
        scheduler=scheduler,
        progress_reporter=reporter,
        fail_fast=True,
    )

    print('Best hyperparameters found were: ', analysis.best_config)

# __tune_pbt_end__


if __name__ == '__main__':
    import argparse

    # Unknown arguments (e.g. those injected by the notebook kernel) are
    # tolerated because only the known ones are parsed.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--smoke-test', action='store_true', help='Finish quickly for testing')
    args = parser.parse_known_args()[0]

    if not args.smoke_test:
        # Full run: ASHA scheduler only.
        tune_Stage2_asha(num_samples=12, num_epochs=3, gpus_per_trial=1)
        # Population based training
        # tune_Stage2_pbt(num_samples=8, num_epochs=5, gpus_per_trial=1)
    else:
        # Quick check: one short trial through each scheduler.
        for search in (tune_Stage2_asha, tune_Stage2_pbt):
            search(num_samples=1, num_epochs=6, gpus_per_trial=1)

# StackGAN:
Here we define the GAN module that we shall use to generate representative images.

In [None]:
class StackGAN(pl.LightningModule):
    """Two-stage conditional GAN trained jointly with an auxiliary classifier.

    Sub-modules created in ``__init__``:

    * ``G1`` -- ``DCGANGenerator`` whose first block has the element at index 1
      removed and which carries an extra ``label_emb`` embedding. It maps
      label-modulated noise to the first-stage image.
    * ``D1`` -- ``DCGANDiscriminator`` applied to ``G1`` output and to real
      images resized to ``image_width``.
    * ``G2`` -- ``Generator2`` turning the (detached) first-stage image into
      the final image.
    * ``D2`` -- ``DCGANDiscriminator`` with one extra block inserted at
      index 2, applied to final images and to the unresized real images.
    * ``R`` -- ``LensResnet`` with ``num_classes + 1`` outputs. Real images are
      trained towards their own label, generated images towards the extra
      class index ``num_classes``.
    * ``pretrained`` -- ``LensResnet`` restored from a fixed checkpoint. It is
      only used in validation to classify generated images.

    Args:
        config: Mapping providing ``'feature_maps'`` and ``'learning_rate'``;
            it is also forwarded unchanged to ``LensResnet``.
        noise_size: Size of the noise vector and of the label embedding.
        image_width: Size real images are interpolated to before ``D1``.
        num_classes: Number of real classes (also the index of the "fake"
            class used for ``R``).
        image_channels: Number of image channels.
        b1: First Adam beta used for every optimizer.
        **kwargs: Accepted for compatibility; not used directly here.
    """

    def __init__(self, config, noise_size: int = 100, image_width = 64,
                    num_classes: int = 3, image_channels: int = 1, b1: float = 0.5, **kwargs):
        super().__init__()
        # NOTE(review): `ignore` is documented to take argument *names*; passing the
        # dict itself presumably does not exclude `config` from hparams. Left as-is
        # because changing it alters what is stored in checkpoints -- confirm intent.
        self.save_hyperparameters(ignore = config)
        self.feature_maps = config['feature_maps']
        self.lr = config['learning_rate']
        # -------------------------------------
        # G1 is patched in place rather than subclassed: the element at index 1 of
        # its first block is dropped and a label embedding is attached. The label
        # conditioning itself happens in self.forward, which takes two inputs.
        self.G1 = DCGANGenerator(self.hparams.noise_size, self.feature_maps, self.hparams.image_channels).apply(self._weights_init)
        first_block = list(self.G1.gen[0])
        del first_block[1]
        self.G1.gen[0] = nn.Sequential(*first_block)
        self.G1.add_module('label_emb', nn.Embedding(self.hparams.num_classes, self.hparams.noise_size))
        # ------------------------------------
        self.D1 = DCGANDiscriminator(self.feature_maps, self.hparams.image_channels).apply(self._weights_init)
        # -------------------------------------
        self.G2 = Generator2(self.hparams.image_channels, self.feature_maps).apply(self._weights_init)
        # -------------------------------------
        self.D2 = DCGANDiscriminator(self.feature_maps, self.hparams.image_channels)
        # Mutate this instance only (not the class definition): insert one more
        # block at index 2. No subclass is needed because forward is unchanged.
        extra = self.D2._make_disc_block(self.feature_maps * 2, self.feature_maps * 2)
        disc_layers = list(self.D2.disc)
        disc_layers.insert(2, extra)
        self.D2.disc = nn.Sequential(*disc_layers)
        self.D2.apply(self._weights_init)
        # -------------------------------------
        # One output per real class plus one for "generated" (see optimizer_idx == 4
        # in training_step, which uses index `num_classes` as the fake label).
        self.R = LensResnet(config, num_classes = self.hparams.num_classes + 1).apply(self._weights_init)
        # -------------------------------------
        self.pretrained = LensResnet(config)
        ckpt = pl_load(os.path.join(
            '/content/drive/MyDrive/Logs/tune_LensResnet_asha_model_j/train_LensResnet_tune_checkpoint_e38cb_00000_0_batch_size=128,learning_rate=0.001_2021-07-06_17-52-11/checkpoint_epoch=17-step=1406',
            # '/content/drive/MyDrive/Logs/tune_LensResnet_asha_model_f/train_LensResnet_tune_checkpoint_e32ba_00000_0_batch_size=64,learning_rate=0.0001_2021-07-06_03-33-10/checkpoint_epoch=14-step=4689',
            'checkpoint'),
            map_location=lambda storage, loc: storage)
        # `_load_model_state` builds and *returns* a new model (that is how it is
        # used when resuming a trial), so calling it here and dropping the result
        # left `self.pretrained` untrained. Load the weights into this instance.
        self.pretrained.load_state_dict(ckpt['state_dict'])
        # -------------------------------------
        self.criterion1 = nn.BCELoss()
        self.criterion2 = nn.CrossEntropyLoss()

    @staticmethod
    def _weights_init(m):
        """Initialise ``m`` in place; meant to be passed to ``Module.apply``.

        Modules whose class name contains ``'Conv'`` get weights drawn from
        N(0, 0.02); those containing ``'BatchNorm'`` get weights from
        N(1, 0.02) and zero bias. Everything else is left untouched.
        """
        classname = m.__class__.__name__
        if classname.find('Conv') != -1:
            torch.nn.init.normal_(m.weight, 0.0, 0.02)
        elif classname.find('BatchNorm') != -1:
            torch.nn.init.normal_(m.weight, 1.0, 0.02)
            torch.nn.init.zeros_(m.bias)

    def forward(self, noise, labels = None):
        """Generate images from ``noise``, optionally conditioned on ``labels``.

        Args:
            noise: Tensor whose last dimension is the hidden (noise) dimension.
            labels: Integer class labels shaped like ``noise.shape[:-1]``.
                When ``None``, labels are sampled uniformly at random.

        Returns:
            Tuple ``(out2, out1)``: the ``G2`` output and the ``G1`` output.
            ``G2`` receives ``out1`` detached, so ``out2`` carries no gradient
            back to ``G1``.
        """
        if labels is None:
            # Sample on the same device as the noise; a CPU tensor here would not
            # match the embedding's device when the module lives on an accelerator.
            labels = torch.randint(self.hparams.num_classes, noise.shape[:-1],
                                   device=noise.device)                           # last dimension is the hidden dimension
        inp = torch.mul(noise, self.G1.label_emb(labels))
        out1 = self.G1(inp.view(-1, inp.shape[-1], 1, 1))
        out2 = self.G2(out1.detach())
        return out2, out1

    def training_step(self, batch, batch_idx, optimizer_idx):
        """Compute the loss for the optimizer selected by ``optimizer_idx``.

        Index order matches ``configure_optimizers``:
        0 = G1, 1 = D1, 2 = G2, 3 = D2, 4 = R.

        Raises:
            ValueError: If ``optimizer_idx`` is not in ``0..4``.
        """
        imgs, labels = batch
        temp2, temp1 = self(torch.randn(labels.shape[0], self.hparams.noise_size).type_as(imgs), labels)

        if optimizer_idx == 0:
            # G1: fool D1, and make R classify G2(G1(z)) as the requested label.
            loss = self.criterion1(self.D1(temp1), torch.ones_like(labels, dtype=torch.float32))
            self.log('G1/train/loss/disc', loss)
            # Out-of-place add: self.log may keep a detached view sharing storage
            # with `loss`, so an in-place add_ would overwrite the value just logged.
            loss = loss + self.criterion2(self.R.backbone(self.G2(temp1)), labels)
            self.log('G1/train/loss/full', loss)

        elif optimizer_idx == 1:
            # D1: real images (resized to image_width) vs. detached G1 output.
            real, fake = self.D1(F.interpolate(imgs, self.hparams.image_width, mode='nearest')), self.D1(temp1.detach())
            prediction, target = torch.cat((real, fake)), torch.cat((torch.ones_like(real),torch.zeros_like(fake)))
            loss = self.criterion1(prediction, target)
            self.log('D1/train/loss', loss)

        elif optimizer_idx == 2:
            # G2: fool D2, and make R classify the final image as the requested label.
            loss = self.criterion1(self.D2(temp2), torch.ones_like(labels, dtype=torch.float32))
            self.log('G2/train/loss/disc', loss)
            # Out-of-place for the same reason as in the G1 branch.
            loss = loss + self.criterion2(self.R.backbone(temp2), labels)
            self.log('G2/train/loss/full', loss)

        elif optimizer_idx == 3:
            # D2: real images vs. detached G2 output.
            real, fake = self.D2(imgs), self.D2(temp2.detach())
            prediction, target = torch.cat((real, fake)), torch.cat((torch.ones_like(real),torch.zeros_like(fake)))
            loss = self.criterion1(prediction, target)
            self.log('D2/train/loss', loss)

        elif optimizer_idx == 4:
            # R: real images keep their label; generated images get the extra
            # class index `num_classes`.
            real, fake = self.R.backbone(imgs), self.R.backbone(temp2.detach())
            prediction, target = torch.cat((real, fake)), torch.cat((labels, self.hparams.num_classes * torch.ones_like(labels)))
            loss = self.criterion2(prediction, target)
            self.log('R/train/loss', loss)

        else:
            raise ValueError('Unexpected optimizer_idx: {}'.format(optimizer_idx))

        return loss

    def configure_optimizers(self):
        """Return one Adam optimizer per sub-network, in the order G1, D1, G2, D2, R.

        All share the learning rate ``self.lr`` and betas ``(b1, 0.999)``.
        """
        opt_g1 = torch.optim.Adam(self.G1.parameters(), self.lr, (self.hparams.b1, 0.999))
        opt_d1 = torch.optim.Adam(self.D1.parameters(), self.lr, (self.hparams.b1, 0.999))
        opt_g2 = torch.optim.Adam(self.G2.parameters(), self.lr, (self.hparams.b1, 0.999))
        opt_d2 = torch.optim.Adam(self.D2.parameters(), self.lr, (self.hparams.b1, 0.999))
        opt_r = torch.optim.Adam(self.R.parameters(), self.lr, (self.hparams.b1, 0.999))
        return opt_g1, opt_d1, opt_g2, opt_d2, opt_r

    def validation_step(self, batch, batch_idx):
        """Generate images for the batch labels and classify them with ``pretrained``.

        Returns:
            Dict with ``'pred'`` (output of ``self.pretrained`` on the generated
            images) and ``'target'`` (the labels used for generation).
        """
        imgs, labels = batch
        temp2, _ = self(torch.randn(labels.shape[0], self.hparams.noise_size).type_as(imgs), labels)
        return {'pred': self.pretrained(temp2.detach()), 'target': labels}

    def validation_epoch_end(self, listofDicts):
        """Log the worst per-class AUROC and save/log a multi-class ROC figure.

        Args:
            listofDicts: The dicts returned by ``validation_step`` for the epoch.
        """
        prediction, target = torch.cat([x['pred'] for x in listofDicts]), torch.cat([x['target'] for x in listofDicts])
        aurocTensor = tm.functional.auroc(prediction, target, num_classes=self.hparams.num_classes, average=None)
        # The reported metric is the minimum over classes.
        self.log('Pre/val/auroc', aurocTensor.min())
        fprList, tprList, _ = tm.functional.roc(prediction, target, num_classes=self.hparams.num_classes)

        f = plt.figure()
        colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
        for i, color in zip(range(self.hparams.num_classes), colors):
            plt.plot(fprList[i].cpu(), tprList[i].cpu(), color=color,
                    label='ROC curve of class {0} (area = {1:0.2f})'.format(i, aurocTensor[i].cpu()))
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Multi-class ROC')
        plt.legend(loc='lower right')

        # Join with a path separator so the PDF lands inside the trial directory
        # whether or not that path ends in a slash. Fall back to the current
        # directory when not running inside a Tune session.
        out_dir = tune.get_trial_dir() or os.curdir
        f.savefig(os.path.join(str(out_dir), 'ROC_epoch_' + str(self.current_epoch) + '.pdf'))
        # Saved first because add_figure closes the figure by default. Passing the
        # epoch as global_step keeps one figure per epoch in TensorBoard.
        self.logger.experiment.add_figure('StackGAN/val/ROC', f, global_step=self.current_epoch)

# Tune StackGAN:
Here we tune the hyperparameters so that the generated images resemble the input images.

In [None]:
# __tune_train_checkpoint_begin
def train_StackGAN_tune_checkpoint(config,
                                   checkpoint_dir=None,
                                   num_epochs=10,
                                   num_gpus=1):
    """Train one StackGAN trial under Ray Tune, optionally resuming.

    Args:
        config: Hyperparameter mapping handed to ``NpyDataModule`` and ``StackGAN``.
        checkpoint_dir: Directory containing a file named ``checkpoint`` to
            resume from; when falsy a fresh model is created.
        num_epochs: Value passed to the trainer as ``max_epochs``.
        num_gpus: GPUs for the trial; rounded up to an integer because
            fractional values may be passed in.
    """
    data_dir = os.path.expanduser('/content/images/')

    gpu_count = math.ceil(num_gpus)
    # TensorBoard events are written directly into the trial directory.
    tb_logger = TensorBoardLogger(save_dir=tune.get_trial_dir(), name='', version='.')
    # Tune-facing metric name -> name under which the model logs it.
    reported_metrics = {
        'lossG1': 'G1/train/loss/full',
        'lossG2': 'G2/train/loss/full',
        'lossD1': 'D1/train/loss',
        'lossD2': 'D2/train/loss',
        'lossR': 'R/train/loss',
        'auroc': 'Pre/val/auroc',
    }
    tune_callback = TuneReportCheckpointCallback(
        metrics=reported_metrics,
        filename='checkpoint',
    )

    trainer = pl.Trainer(
        num_sanity_val_steps=-1,
        max_epochs=num_epochs,
        prepare_data_per_node=False,
        gpus=gpu_count,
        logger=tb_logger,
        callbacks=[tune_callback],
    )
    dm = NpyDataModule(config, data_dir)

    if not checkpoint_dir:
        model = StackGAN(config)
    else:
        # StackGAN.load_from_checkpoint reportedly leads to errors here, so the
        # checkpoint is loaded by hand and the trainer's epoch counter restored.
        ckpt_path = os.path.join(checkpoint_dir, 'checkpoint')
        ckpt = pl_load(ckpt_path, map_location=lambda storage, loc: storage)
        model = StackGAN._load_model_state(ckpt, config=config)
        trainer.current_epoch = ckpt['epoch']

    trainer.fit(model, dm)
# __tune_train_checkpoint_end__


# __tune_asha_begin__
def tune_StackGAN_asha(num_samples=10, num_epochs=10, gpus_per_trial=1):
    """Run a Ray Tune search for StackGAN with the ASHA scheduler.

    Args:
        num_samples: Number of trials passed to ``tune.run``.
        num_epochs: Epochs per trial; also the scheduler's ``max_t``.
        gpus_per_trial: GPUs reserved per trial and passed to the trainable.

    Prints the best configuration (maximising ``auroc``) when the run ends.
    """
    hparam_names = ['learning_rate', 'feature_maps', 'batch_size']
    search_space = {
        'learning_rate': tune.choice([1e-4]),
        'feature_maps': tune.choice([64]),
        'batch_size': tune.choice([128, 64]),
    }

    asha = ASHAScheduler(max_t=num_epochs, grace_period=1, reduction_factor=2)

    cli_reporter = CLIReporter(
        parameter_columns=hparam_names,
        metric_columns=['lossG1', 'lossG2', 'lossD1', 'lossD2', 'lossR', 'auroc', 'training_iteration'],
    )

    # Bind the non-tuned arguments to the training function.
    trainable = tune.with_parameters(
        train_StackGAN_tune_checkpoint,
        num_epochs=num_epochs,
        num_gpus=gpus_per_trial,
    )
    trial_resources = {'cpu': 2, 'gpu': gpus_per_trial}

    analysis = tune.run(
        trainable,
        name='tune_StackGAN_asha_model_j',
        metric='auroc',
        mode='max',
        config=search_space,
        resources_per_trial=trial_resources,
        num_samples=num_samples,
        local_dir='./drive/MyDrive/Logs',
        scheduler=asha,
        progress_reporter=cli_reporter,
        # restore='/content/drive/MyDrive/Logs/tune_StackGAN_1_asha_model_j/train_StackGAN_tune_checkpoint_fa25b_00000_0_batch_size=64,feature_maps=64,learning_rate=0.0001_2021-07-06_20-23-13/checkpoint_epoch=0-step=937',
        fail_fast=True,
        resume='PROMPT',
    )

    print('Best hyperparameters found were: ', analysis.best_config)

# __tune_asha_end__


# __tune_pbt_begin__
def tune_StackGAN_pbt(num_samples=10, num_epochs=10, gpus_per_trial=1):
    """Run a Ray Tune search for StackGAN with Population Based Training.

    Args:
        num_samples: Number of trials (population size) passed to ``tune.run``.
        num_epochs: Epochs per trial, forwarded to the training function.
        gpus_per_trial: GPUs reserved per trial and passed to the trainable.

    Prints the best configuration (maximising ``auroc``) when the run ends.
    """
    config = {
        'learning_rate': 1e-4,
        'feature_maps': 64,
        'batch_size': 64,
    }

    scheduler = PopulationBasedTraining(
        perturbation_interval=4,
        hyperparam_mutations={
            'learning_rate': [1e-4, 1e-3],
            # 'feature_maps' is deliberately NOT mutated: it fixes the layer
            # sizes of the networks, so a trial that exploits another trial's
            # checkpoint with a perturbed value could not load those weights.
            # 'feature_maps': [64, 128],
            'batch_size': [32, 64, 128]
        })

    reporter = CLIReporter(
        # overwrite=True,
        parameter_columns=['learning_rate', 'feature_maps', 'batch_size'],
        metric_columns=['lossG1', 'lossG2', 'lossD1', 'lossD2', 'lossR', 'auroc', 'training_iteration'],
        )

    analysis = tune.run(
        # resume=True,
        # Bind the non-tuned arguments to the training function.
        tune.with_parameters(
            train_StackGAN_tune_checkpoint,
            num_epochs=num_epochs,
            num_gpus=gpus_per_trial),
        name='tune_StackGAN_pbt_model_j',
        metric='auroc',
        mode='max',
        resources_per_trial={
            'cpu': 2,
            'gpu': gpus_per_trial,
            # 'tpu': 8,
        },
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        local_dir='./drive/MyDrive/Logs',
        # restore='/content/drive/MyDrive/Logs/tune_StackGAN_1_asha_model_j/train_StackGAN_tune_checkpoint_fa25b_00000_0_batch_size=64,feature_maps=64,learning_rate=0.0001_2021-07-06_20-23-13/checkpoint_epoch=0-step=937',
        fail_fast = True,
        # resume='PROMPT',
        )

    print('Best hyperparameters found were: ', analysis.best_config)

# __tune_pbt_end__


if __name__ == '__main__':
    import argparse

    # parse_known_args() is used so that unrecognised command-line arguments
    # are returned separately (and discarded into `_`) instead of aborting.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--smoke-test',
        action='store_true',
        help='Finish quickly for testing',
    )
    args, _ = parser.parse_known_args()

    if not args.smoke_test:
        # Regular run: ASHA scheduler only.
        tune_StackGAN_asha(num_samples=2, num_epochs=1, gpus_per_trial=1)
        # Population based training (currently disabled):
        # tune_StackGAN_pbt(num_samples=8, num_epochs=5, gpus_per_trial=1)
    else:
        # Smoke test: a single sample through both schedulers.
        tune_StackGAN_asha(num_samples=1, num_epochs=6, gpus_per_trial=1)
        tune_StackGAN_pbt(num_samples=1, num_epochs=6, gpus_per_trial=1)