<a href="https://colab.research.google.com/github/souravraha/galaxy/blob/experimental/Lightning_Tune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prerequisites

## Install/uninstall packages

In [1]:
# Install the packages this notebook needs on Google Colab. The install lines below are
# active; comment them out when the environment already provides these dependencies.
# NOTE(review): none of the packages is version-pinned, so a re-run may pull newer releases.

print('Setting up colab environment')
# !pip uninstall -y -q pyarrow
!pip install -q lightning-bolts GPy
!pip install -q ray[debug] ray[default]
!pip install -U -q ray[tune]
# !pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.8-cp37-cp37m-linux_x86_64.whl

# # A hack to force the runtime to restart, needed to include the above dependencies.
# print('Done installing! Restarting via forced crash (this is not an issue).')
# import os
# os._exit(0)

Setting up colab environment
[K     |████████████████████████████████| 253 kB 4.1 MB/s 
[K     |████████████████████████████████| 959 kB 34.2 MB/s 
[K     |████████████████████████████████| 919 kB 18.3 MB/s 
[K     |████████████████████████████████| 282 kB 22.4 MB/s 
[K     |████████████████████████████████| 829 kB 32.0 MB/s 
[K     |████████████████████████████████| 119 kB 45.6 MB/s 
[K     |████████████████████████████████| 636 kB 52.8 MB/s 
[K     |████████████████████████████████| 1.3 MB 33.9 MB/s 
[K     |████████████████████████████████| 71 kB 8.9 MB/s 
[K     |████████████████████████████████| 142 kB 47.3 MB/s 
[K     |████████████████████████████████| 294 kB 50.6 MB/s 
[?25h  Building wheel for future (setup.py) ... [?25l[?25hdone
  Building wheel for GPy (setup.py) ... [?25l[?25hdone
  Building wheel for paramz (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 49.6 MB 6.0 kB/s 
[K     |████████████████████████████████| 72 kB 504 kB/s 


## Import libraries

In [2]:
import os
import math
import numpy as np
from itertools import cycle
from matplotlib import pyplot as plt
# ------------------------------------
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, random_split
# ------------------------------------
from torchvision.models import resnet18
from torchvision.utils import save_image, make_grid
from torchvision import transforms, datasets
# ------------------------------------
import torchmetrics as tm
# ------------------------------------
import pytorch_lightning as pl
from pl_bolts.models.gans import DCGAN
from pl_bolts.models.gans.dcgan.components import DCGANDiscriminator, DCGANGenerator
# from Ca
from pl_bolts.callbacks import ModuleDataMonitor
from pytorch_lightning.loggers import TensorBoardLogger
from pl_bolts.models.self_supervised.resnets import BasicBlock
from pytorch_lightning.utilities.cloud_io import load as pl_load
# ------------------------------------
from ray import tune
from ray.tune.stopper import TrialPlateauStopper
from ray.tune import CLIReporter, JupyterNotebookReporter
from ray.tune.schedulers import ASHAScheduler, PopulationBasedTraining
from ray.tune.schedulers.pb2 import PB2
from ray.tune.integration.pytorch_lightning import TuneReportCallback, TuneReportCheckpointCallback
# ------------------------------------

# Download and extract data

In [45]:
# Mount Google Drive at /content/drive; the checkpoint and log paths used throughout
# this notebook live under /content/drive/MyDrive.
from google.colab import drive
# drive.flush_and_unmount()
drive.mount('/content/drive')
# NOTE(review): `train_LensResnet` instantiates `ConfusedLogitCallback`, which is only
# brought into scope by the two commented lines below.
# %cp drive/MyDrive/ml/Callbacks/confused_logits.py ./
# from confused_logits import ConfusedLogitCallback

Mounted at /content/drive


Here choose the model you wish to use for training/testing. Don't forget to make modifications in the following sections:

1.   GLOBAL in class definition of npyImageData.
2.   correct assignment of metric keys while defining the training wrapper for Tune.
3.   name of the experiment initiated/resumed.

In [4]:
# Google Drive file ids of the per-model archives (no id is recorded for 'd' and 'k'):
# 'a': 1Cjcw2EWorhdhJSGoWOdxsEUDxvl943dt, 'b': 15yXXC4h5VsytP3Ak1jfUSjQhdgP2s23K, 'c': 1vuQ-pLzoKT4Hd_V7949r9eND9E2fB_u_,
# 'd': , 'e': 1wFuasvb7PthxXtMUlsD13uzYHWlWt06H, 'f': 17l6H61tLAu26zGuei38r_T5ssjbYUeaJ, 
# 'g': 1SxQVosWeEjY3Pyn8LRXA11rLnZ9HK_7B, 'h': 1Atau0RH4oyLAiYReW-G9a8l9pUNltglF, 'i': 15lEgsR1p00KSHieaT9a1nkbJ86pDxwgp, 
# 'j': 1m0EQUbqZZeyl76XsQIKWU5Qd7jGmmWhB, 'k': , 'l': 1meTDi4aeWfdChOiXeLtUOGhjVDVu000e

# !rm -rf images
# Download the archive whose id is listed under 'f', extract it into the current
# directory, then delete the archive.
!gdown --id 17l6H61tLAu26zGuei38r_T5ssjbYUeaJ
!tar zxf ./model_f.tgz
!rm ./model_f.tgz

# Disabled pure-Python alternative to the shell commands above.
# def prepare_data(data_dir: str = '/content'):
#     with FileLock(os.path.expanduser(data_dir+'.lock')):
#         gdown.download('https://drive.google.com/uc?id=17l6H61tLAu26zGuei38r_T5ssjbYUeaJ', data_dir+'/model_j.tgz', quiet=True)
        
#         temp = tarfile.open(data_dir+'/model_j.tgz', 'r|gz')
#         temp.extractall()
#         temp.close()

Downloading...
From: https://drive.google.com/uc?id=17l6H61tLAu26zGuei38r_T5ssjbYUeaJ
To: /content/model_f.tgz
2.34GB [00:27, 85.1MB/s]


# Class definitions

## DataModule
This creates dataloaders which need to be supplied to train, validate or test the module we have.

In [5]:
class npyImageData(pl.LightningDataModule):
    """DataModule serving single-channel ``.npy`` images stored in class sub-folders.

    The ``train`` and ``val`` sub-directories of ``data_dir`` are read with
    ``torchvision.datasets.DatasetFolder``; the ``val`` folder is also used as the
    test split.

    Args:
        config: dict of tunable settings; only ``config['bs']`` (the batch size) is read.
        img_width: size passed to ``transforms.Resize`` (nearest-neighbour interpolation).
        data_dir: dataset root; a leading ``~`` is expanded.
    """

    def __init__(self, config, img_width: int = 150, data_dir: str = '/content/images/'):
        super().__init__()
        # This method is not implemented
        # self.save_hyperparameters()
        self.bs = config['bs']
        self.data_dir = os.path.expanduser(data_dir)

        # Change the min / range constants below when changing dataset -------------------------------------------------------------
        self.transform = transforms.Compose([
            transforms.RandomHorizontalFlip(),
            transforms.RandomVerticalFlip(),
            # F : [mean=71.75926373866668, std=96.139484964214, min=5.0, range=961.0]
            # J : [mean=50.271541595458984, std=94.8838882446289, min=0, range=1007.0]
            # (x - min) / range -> [0, 1], using the values recorded for dataset F
            transforms.Normalize(mean=(5,), std=(961,)),
            # this shift-scales the pixel values -> [-1, 1]
            transforms.Normalize(mean=(0.5,), std=(0.5,)),
            transforms.Resize(img_width, transforms.InterpolationMode.NEAREST),
        ])

    @staticmethod
    def npy_loader(path):
        """Load one ``.npy`` file as a float32 tensor with a leading channel axis."""
        # Convert to tensor first and only then to float: the array may be float64,
        # which would raise errors in the conv layers.
        return torch.from_numpy(np.load(path)).unsqueeze(0).float()

    def _folder(self, split):
        """Build the ``DatasetFolder`` for the sub-directory ``split`` of ``data_dir``."""
        # The extensions argument must be a tuple: ('.npy') without the comma is a plain string.
        return datasets.DatasetFolder(os.path.join(self.data_dir, split),
                                      self.npy_loader, ('.npy',), self.transform)

    def _loader(self, dataset):
        """Wrap ``dataset`` in a shuffling ``DataLoader`` with the configured batch size."""
        # os.cpu_count() returns None when the count cannot be determined; fall back
        # to loading in the main process in that case.
        # NOTE(review): the evaluation loaders shuffle as well - presumably so that
        # partial validation runs see every class; confirm before turning it off.
        return DataLoader(dataset, self.bs, shuffle=True,
                          num_workers=os.cpu_count() or 0, pin_memory=False)

    def setup(self, stage: str = None):
        """Create the datasets needed for ``stage`` ('fit', 'test' or ``None`` for both)."""
        if stage in ('fit', None):
            self.train_set = self._folder('train')
            self.val_set = self._folder('val')
            self.dims = tuple(self.train_set[0][0].shape)

        if stage in ('test', None):
            # The validation folder doubles as the test set.
            self.test_set = self._folder('val')
            self.dims = getattr(self, 'dims', self.test_set[0][0].shape)

    def train_dataloader(self):
        return self._loader(self.train_set)

    def val_dataloader(self):
        return self._loader(self.val_set)

    def test_dataloader(self):
        return self._loader(self.test_set)

## ResNet
We modify a ResNet slightly for our purpose.

In [6]:
# Pickled pretrained ResNet modules on Drive. `LensResnet.__init__` loads PRE_F_RESNET
# with `torch.load` as its backbone; PRE_J_RESNET is not read by any active code in
# this notebook (presumably the equivalent file for dataset J - confirm).
PRE_F_RESNET = '/content/drive/MyDrive/Logs/F/LensResnet/PRETRAINED.pth'
PRE_J_RESNET = '/content/drive/MyDrive/Logs/J/LensResnet/PRETRAINED.pth'

class LensResnet(pl.LightningModule):
    """Image classifier built around a pretrained ResNet backbone.

    The backbone is the module pickled at ``PRE_F_RESNET``; it is optimised with Adam
    on a cross-entropy loss. Training logs a weighted AUROC, validation logs the
    smallest per-class AUROC and saves a per-class ROC plot every epoch.

    Args:
        config: dict of tunable settings; ``config['lr']`` is the Adam learning rate.
        image_channels: not used by this class beyond being an init argument.
        num_classes: number of classes the AUROC / ROC metrics are configured for.
        **kwargs: accepted and ignored.
    """

    def __init__(self, config, image_channels: int = 1, num_classes: int = 3, **kwargs):
        super().__init__()
        # NOTE(review): `ignore` is handed the config dict rather than the argument name
        # 'config'. Stage1 calls `LensResnet.load_from_checkpoint` without passing
        # `config`, which suggests `config` does end up in the saved hparams - confirm
        # before "fixing" this to ignore='config'.
        self.save_hyperparameters(ignore=config)
        self.lr = config['lr']

        # --------------------------------------------------------------------------
        # The backbone is a whole pickled module; torch.load unpickles arbitrary
        # objects, so PRE_F_RESNET must point to a trusted file.
        self.backbone = torch.load(PRE_F_RESNET, map_location=self.device)

        self.train_metrics = tm.MetricCollection([tm.AUROC(self.hparams.num_classes, average='weighted'),],
            prefix='LensResnet/train/'
        )
        self.val_metrics = tm.MetricCollection([tm.AUROC(self.hparams.num_classes, average=None),
                                                tm.ROC(self.hparams.num_classes),],
        )

    def configure_optimizers(self):
        return torch.optim.Adam(self.backbone.parameters(), self.lr)

    def forward(self, x, prob=False):
        """Return the backbone logits for ``x``, or class probabilities when ``prob`` is true."""
        logits = self.backbone(x)
        return F.softmax(logits, 1) if prob else logits

    def on_train_epoch_start(self):
        # training_step logs the *computed* metric values, which never resets the
        # metric objects, so their accumulated state is cleared here once per epoch.
        self.train_metrics.reset()

    def training_step(self, batch, batch_idx):
        imgs, labels = batch
        # Kept on the instance so that callbacks can inspect the latest logits.
        self.last_logits = self(imgs)
        loss = F.cross_entropy(self.last_logits, labels)
        #  keep only scalars here, for no errors
        self.log('LensResnet/train/loss', loss)

        preds = F.softmax(self.last_logits, 1)
        self.train_metrics.update(preds, labels)
        try:
            self.log_dict(self.train_metrics.compute(), prog_bar=True)
        except Exception as err:
            # Best effort: a metric that cannot be computed yet must not stop training.
            print(err)
        # Returned outside the try block: a `return` inside `finally` would silently
        # discard any in-flight exception, KeyboardInterrupt included.
        return loss

    def validation_step(self, batch, batch_idx):
        imgs, labels = batch
        logits = self(imgs)
        loss = F.cross_entropy(logits, labels)
        #  keep only scalars here, for no errors
        self.log('LensResnet/val/loss', loss)

        preds = F.softmax(logits, 1)
        self.val_metrics.update(preds, labels)

    def validation_epoch_end(self, Listofdicts):
        """Log the minimum per-class AUROC and save / log the per-class ROC figure."""
        try:
            scoresDict = self.val_metrics.compute()
        except Exception as err:
            # Report which target classes were accumulated to help diagnose the failure.
            # Batches may differ in length, so they are concatenated rather than stacked.
            seen = self.val_metrics.AUROC.target
            if isinstance(seen, (list, tuple)):
                seen = torch.cat(list(seen)) if len(seen) else torch.empty(0)
            print('Validation metrics skipped ({}); target classes seen: {}'.format(
                err, torch.unique(seen).tolist()))
        else:
            self.log('LensResnet/val/auroc', scoresDict['AUROC'].min(), prog_bar=True)
            fprList, tprList, _ = scoresDict['ROC']

            fig, ax = plt.subplots()
            colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
            for i, color in zip(range(self.hparams.num_classes), colors):
                ax.plot(fprList[i].cpu(), tprList[i].cpu(), color=color,
                        label='ROC curve of class {0} (area = {1:0.2f})'
                        ''.format(i, scoresDict['AUROC'][i]))
            ax.plot([0, 1], [0, 1], 'k--')
            ax.set_xlim([0.0, 1.0])
            ax.set_ylim([0.0, 1.05])
            ax.set_xlabel('False Positive Rate')
            ax.set_ylabel('True Positive Rate')
            ax.set_title('Multi-class ROC')
            ax.legend(loc='lower right')

            # Save before logging: add_figure closes the figure by default.
            fig.savefig(os.path.join(str(self.trainer.log_dir),
                                     'ROC_epoch_{:02d}.pdf'.format(self.current_epoch)))
            # One entry per epoch instead of every epoch sharing the same step.
            self.logger.experiment.add_figure('LensResnet/val/ROC', fig, global_step=self.current_epoch)
        finally:
            self.val_metrics.reset()

## Stage 1
Here we subclass a DCGAN to create our low resolution GAN.

In [None]:
# Scratch Stage1 instance; `m` is what the feature-map inspection cells further down use.
# NOTE(review): `Stage1` is defined in the next cell, so on a fresh kernel this cell
# only works when run after it.
m = Stage1({'lr':0.001, 'n_fmaps': 128, 'bs': 8})

In [46]:
# Tune trial checkpoint directories of trained LensResnet classifiers; Stage1 loads the
# file named 'checkpoint' inside each of them.
BEST_F_RESNET = '/content/drive/MyDrive/Logs/F/LensResnet/pbt_tanh/train_LensResnet_eb619_00000_0_2021-09-02_19-42-34/checkpoint_epoch=2-step=28124'
BEST_J_RESNET = '/content/drive/MyDrive/Logs/J/LensResnet/pbt_tanh_fine/train_LensResnet_93609_00000_0_2021-09-02_21-06-00/checkpoint_epoch=2-step=28124'

def post_plotting(ax):
    """Finish a ROC panel: draw the dashed chance diagonal and place the legend.

    Args:
        ax: object exposing matplotlib's ``Axes.plot`` and ``Axes.legend``.
    """
    low, high = 0, 1
    # Dashed black line from (0, 0) to (1, 1).
    ax.plot([low, high], [low, high], 'k--')
    legend_options = {'loc': 'lower right'}
    ax.legend(**legend_options)

class Stage1(DCGAN):
    """Class-conditional DCGAN producing the low-resolution images.

    The latent noise is multiplied element-wise by a learned per-class embedding
    before it enters the DCGAN generator. The last layer of the discriminator's final
    block is removed and ``BCEWithLogitsLoss`` is used as the criterion.

    Two trained ``LensResnet`` classifiers (checkpoints ``BEST_F_RESNET`` and
    ``BEST_J_RESNET``) are split into a feature extractor and a classification head.
    During validation the extractors feed the FID metrics and the heads provide class
    predictions for the generated images (AUROC / ROC).

    Args:
        config: dict with ``'n_fmaps'`` (generator and discriminator feature maps)
            and ``'lr'`` (learning rate).
        num_classes: number of image classes.
        **kwargs: accepted and ignored.
    """

    def __init__(self, config, num_classes: int = 3, **kwargs):
        super().__init__(feature_maps_gen=config['n_fmaps'], feature_maps_disc=config['n_fmaps'], learning_rate=config['lr'])
        self.save_hyperparameters(ignore=config)

        # One latent-sized vector per class; forward() multiplies the noise by it.
        self.generator.add_module('emb', nn.Embedding(self.hparams.num_classes, self.hparams.latent_dim))

        # Remove the last layer of the final discriminator block (presumably its
        # Sigmoid - confirm) so that raw scores reach BCEWithLogitsLoss.
        del self.discriminator.disc[-1][-1]
        self.criterion = nn.BCEWithLogitsLoss()

        # NOTE: `lastF` / `lastJ` now hold the real classification heads, so their
        # parameters are part of this module's state_dict; a checkpoint that lacks
        # those keys has to be loaded with strict=False.
        self.modelF, self.lastF = self._split_classifier(BEST_F_RESNET)
        self.modelJ, self.lastJ = self._split_classifier(BEST_J_RESNET)

        self.val_metrics = tm.MetricCollection(
            {
                'FID_F' : tm.FID(self.modelF),
                'FID_J' : tm.FID(self.modelJ),
            },
            prefix='Stage1/val/',
        )

    @staticmethod
    def _split_classifier(checkpoint_dir):
        """Load a LensResnet checkpoint and return ``(feature_extractor, head)``.

        The head is the backbone's ``fc`` layer. It has to be captured *before* ``fc``
        is replaced by ``nn.Identity``: the extractor and the loaded backbone are the
        same object, so reading ``backbone.fc`` afterwards would return the Identity.
        """
        lens = LensResnet.load_from_checkpoint(os.path.join(checkpoint_dir, 'checkpoint')).eval()
        backbone = lens.backbone
        head = backbone.fc
        backbone.fc = nn.Identity()
        return backbone, head

    def forward(self, noise, labels):
        """Generate images from ``noise`` conditioned on the integer class ``labels``."""
        inp = noise.mul(self.generator.emb(labels))
        return super().forward(inp)

    def training_step(self, batch, batch_idx, optimizer_idx):
        real, labels = batch
        fake = self._get_fake_pred(labels).type_as(real)

        # Train discriminator
        result = None
        if optimizer_idx == 0:
            # Detached so that the discriminator loss does not reach the generator.
            result = self._disc_step(real, fake.detach())

        # Train generator
        if optimizer_idx == 1:
            result = self._gen_step(fake)

        return result

    def _disc_step(self, real, fake):
        disc_loss = self._get_disc_loss(real, fake)
        self.log('Stage1/D/train/loss', disc_loss)
        return disc_loss

    def _gen_step(self, fake):
        gen_loss = self._get_gen_loss(fake)
        self.log('Stage1/G/train/loss', gen_loss)
        return gen_loss

    def _get_disc_loss(self, real, fake):
        """Discriminator loss: real images labelled 1, generated images labelled 0."""
        # Train with real
        real_pred = self.discriminator(real)
        real_gt = torch.ones_like(real_pred)
        real_loss = self.criterion(real_pred, real_gt)

        # Train with fake
        fake_pred = self.discriminator(fake)
        fake_gt = torch.zeros_like(fake_pred)
        fake_loss = self.criterion(fake_pred, fake_gt)

        disc_loss = real_loss + fake_loss

        return disc_loss

    def _get_gen_loss(self, fake):
        """Generator loss: generated images scored against the label 1."""
        # Train with fake
        fake_pred = self.discriminator(fake)
        fake_gt = torch.ones_like(fake_pred)
        gen_loss = self.criterion(fake_pred, fake_gt)

        return gen_loss

    def _get_fake_pred(self, labels):
        """Return one generated image per entry of ``labels``.

        Despite the name inherited from DCGAN, this returns the images themselves,
        not the discriminator's prediction on them.
        """
        batch_size = len(labels)
        noise = self._get_noise(batch_size, self.hparams.latent_dim)
        fake = self(noise, labels)

        return fake

    def validation_step(self, batch, batch_idx):
        imgs64, labels = batch
        # Both real and generated images are resized to 150 pixels before they are
        # handed to the LensResnet feature extractors.
        self.val_metrics.update(F.interpolate(imgs64, 150), real=True)

        fake = F.interpolate(self._get_fake_pred(labels), 150).type_as(imgs64)
        self.val_metrics.update(fake, real=False)
        return {'predF': F.softmax(self.lastF(self.modelF(fake)), 1), 
                'predJ': F.softmax(self.lastJ(self.modelJ(fake)), 1), 
                'target': labels}

    def validation_epoch_end(self, ListofDicts):
        """Log FID and per-classifier AUROC, save the ROC figure and a sample image grid."""
        self.log_dict(self.val_metrics)

        target = torch.cat([x['target'] for x in ListofDicts])
        fig, ax = plt.subplots(1,2, 
            subplot_kw={'xlim': [0,1], 'xlabel': 'False Positive Rate', 'ylim': [0,1.05], 
                        'ylabel': 'True Positive Rate',
            },
            figsize=[11, 5],
        )
        for j, letter in enumerate(['F', 'J']):
            prediction = torch.cat([x['pred' + letter] for x in ListofDicts])
            aurocTensor = tm.functional.auroc(prediction, target, num_classes=self.hparams.num_classes, average=None)
            self.log('Stage1/LensResnet(' + letter + ')/val/auroc', aurocTensor.min())
            fprList, tprList, _ = tm.functional.roc(prediction, target, num_classes=self.hparams.num_classes)

            colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
            for i, color in zip(range(self.hparams.num_classes), colors):
                ax[j].plot(fprList[i].cpu(), tprList[i].cpu(), color=color,
                        label='ROC curve of class {0} (area = {1:0.2f})'
                        ''.format(i, aurocTensor[i]))
            post_plotting(ax[j])
            ax[j].set_title('Multi-class ROC (' + letter + ')')

        fig.tight_layout()
        # Save before logging: add_figure closes the figure by default.
        fig.savefig(os.path.join(str(self.trainer.log_dir),
                                 'ROC_epoch_{:02d}.pdf'.format(self.current_epoch)))
        # One entry per epoch instead of every epoch sharing the same step.
        self.logger.experiment.add_figure('Stage1/LensResnet/val/ROC', fig, global_step=self.current_epoch)

        # One generated sample per class.
        labels = torch.arange(self.hparams.num_classes, device=self.device)
        fake = F.interpolate(self._get_fake_pred(labels), 150)
        save_image(fake, 
                   str(self.trainer.log_dir) + '/Fake_epoch_{:02d}.pdf'.format(self.current_epoch), 
                  #  kwargs for make_grid
                   normalize=True, value_range=(-1,1))

In [None]:
# Capture the output of every layer in the generator's `gen` container during one
# forward pass of the scratch model `m` (one sample per class label).
features=[]
def hook(self, _, output):
    # Forward hooks are called with (module, inputs, output); only the output is kept.
    features.append(output)
handles = [i.register_forward_hook(hook) for i in m.generator.gen]
labels = torch.arange(m.hparams.num_classes, device=m.device)
_ = m._get_fake_pred(labels)

In [None]:
# Detach every forward hook registered on the generator layers, then drop the handle list.
# A plain loop is used because the calls are made only for their side effect.
for handle in handles:
    handle.remove()
del handles

In [None]:
# Write every captured activation to `features_XX.png` as an image grid.
for i, tensor in enumerate(features):
    shape = tensor.shape
    # ceil(sqrt(shape[1])) * shape[2]: presumably the pixel width of a square grid that
    # tiles all channels of one sample - confirm.
    print(math.ceil(math.sqrt(shape[1]))*shape[2])
    # NOTE(review): the activation is passed to save_image unchanged; check that
    # save_image accepts the channel count of the intermediate layers, or reshape the
    # tensor so that each channel becomes its own grid tile.
    save_image(tensor, 
                'features_{:02d}.png'.format(i), 
                #  kwargs for make_grid
                normalize=True, value_range=(-1,1), nrow=3)
    # [i.remove() for i in handles]

## LensGAN128
Here we extend the Stage1 GAN to generate 128×128-pixel images.

In [35]:
class LensGAN128(Stage1):
    """Stage1 variant with one extra block in the generator and in the discriminator.

    Args:
        config: forwarded unchanged to ``Stage1``.
        num_classes: forwarded unchanged to ``Stage1``.
    """

    def __init__(self, config, num_classes: int = 3):
        super().__init__(config, num_classes=num_classes)

        hp = self.hparams
        generator, discriminator = self.generator, self.discriminator

        # Generator: the old first block becomes a regular (16x -> 8x feature maps)
        # block, and a new first block mapping the latent vector to 16x feature maps
        # is put in front of it.
        generator.gen[0] = generator._make_gen_block(hp.feature_maps_gen * 16, hp.feature_maps_gen * 8)
        stem = generator._make_gen_block(
            hp.latent_dim, hp.feature_maps_gen * 16, kernel_size=4, stride=1, padding=0
        )
        generator.gen = nn.Sequential(stem, *generator.gen)

        # Discriminator: the old last block becomes a regular (8x -> 16x feature maps)
        # block; only the first layer of a freshly built last block is appended.
        discriminator.disc[-1] = discriminator._make_disc_block(hp.feature_maps_disc * 8, hp.feature_maps_disc * 16)
        final_block = discriminator._make_disc_block(
            hp.feature_maps_disc * 16, 1, kernel_size=4, stride=1, padding=0, last_block=True
        )
        head = final_block[0]
        discriminator.disc = nn.Sequential(*discriminator.disc, head)

In [32]:
# Scratch Stage1 instance with 7 feature maps and num_classes=4; rebinds the notebook-level `m`.
m = Stage1({'lr': 1e-4, 'n_fmaps':7, 'bs': 8}, 4)



## Stage 2
Here we subclass a DCGAN to create our high resolution GAN.

In [None]:
class Generator2(nn.Module):
    """Image-to-image generator: strided conv, residual trunk, two upsampling stages.

    The input image is encoded by a stride-2 convolution, passed through ``res_depth``
    ``BasicBlock`` layers plus a closing conv block, added back to the encoded input
    (one long skip connection) and then resized in two nearest-neighbour steps to
    ``image_width // 2`` and ``image_width``. The output passes through ``tanh``.

    Args:
        ngf: number of feature maps of the trunk.
        image_channels: channels of both the input and the generated image.
        res_depth: number of ``BasicBlock`` layers in the trunk.
        image_width: side length of the generated image.
    """

    def __init__(self, ngf: int = 128, image_channels: int = 1, res_depth: int = 6,
                 image_width: int = 150):
        super().__init__()

        ker, strd = 4, 2
        pad = (ker - 2) // 2
        res_ker, res_strd, res_pad = 3, 1, 1

        # Stride-2 convolution halves the spatial size (64 -> 32 for a 64-pixel input).
        self.preprocessing = nn.Sequential(
            nn.Conv2d(image_channels, ngf, ker, strd, pad, bias=False),
            nn.ReLU(True)
        )
        # residuals
        self.residual = nn.Sequential(*[BasicBlock(ngf, ngf) for _ in range(res_depth)])

        self.ending_residual = nn.Sequential(
            nn.Conv2d(ngf, ngf, res_ker, res_strd, res_pad, bias=False),
            nn.BatchNorm2d(ngf),
            nn.ReLU(True)
        )

        # forward() adds the preprocessing output back in before this final stage.
        mode = 'nearest'  # upscaling method is nearest-neighbour
        self.main = nn.Sequential(
            # resize to half the target width (32 -> 75 with the defaults)
            nn.Upsample(image_width // 2, mode=mode),
            nn.Conv2d(ngf, ngf*4, res_ker, res_strd, res_pad, bias=False),
            nn.BatchNorm2d(ngf*4),
            nn.ReLU(True),
            # resize to the target width (75 -> 150 with the defaults)
            nn.Upsample(image_width, mode=mode),
            nn.Conv2d(ngf*4, image_channels, res_ker, res_strd, res_pad, bias=False),
            nn.Tanh()
        )

    def forward(self, in_x):
        """Map a batch of images to ``image_width``-sized images with values in [-1, 1]."""
        encoded = self.preprocessing(in_x)
        trunk = self.ending_residual(self.residual(encoded))
        # large residual connection around the whole trunk
        return self.main(trunk + encoded)

In [None]:
# Tune trial checkpoint directories of trained Stage1 models (one per Logs/F and Logs/J tree).
BEST_F_STAGE1 = '/content/drive/MyDrive/Logs/F/Stage1/pbt_tanh_1/train_Stage1_90727_00001_1_n_fmaps=16_2021-08-30_08-02-05/checkpoint_epoch=4-step=1988/'
BEST_J_STAGE1 ='/content/drive/MyDrive/Logs/J/Stage1/pbt_tanh/train_Stage1_28c03_00003_3_n_fmaps=64_2021-08-31_20-45-44/checkpoint_epoch=2-step=1403/'

In [None]:
class Stage2(DCGAN):
    """High-resolution GAN whose generator refines the images of a trained Stage1 model.

    The "noise" fed to the ``Generator2`` generator is the output of the Stage1 model
    loaded from ``BEST_F_STAGE1``, conditioned on the class labels. Two trained
    ``LensResnet`` classifiers (``BEST_F_RESNET`` / ``BEST_J_RESNET``) score the
    generated images during validation (AUROC / ROC).

    Args:
        config: dict with ``'n_fmaps'``, ``'res_depth'`` and the learning rate under
            ``'learning_rate'`` (``'lr'``, the key used by Stage1, is accepted as a fallback).
        num_classes: number of image classes.
        **kwargs: accepted and ignored.
    """

    def __init__(self, config, num_classes: int = 3, **kwargs):
        learning_rate = config['learning_rate'] if 'learning_rate' in config else config['lr']
        super().__init__(feature_maps_gen=config['n_fmaps'], feature_maps_disc=config['n_fmaps'], learning_rate=learning_rate)
        self.save_hyperparameters(ignore=config)

        self.generator = Generator2(self.hparams.feature_maps_gen, self.hparams.image_channels, config['res_depth'])

        # These are better as attributes, instead of being returned by a method.
        # The names match the checkpoint constants defined in this notebook.
        self.modelF = LensResnet.load_from_checkpoint(os.path.join(BEST_F_RESNET, 'checkpoint')).eval()
        self.modelJ = LensResnet.load_from_checkpoint(os.path.join(BEST_J_RESNET, 'checkpoint')).eval()
        self.lowres = Stage1.load_from_checkpoint(os.path.join(BEST_F_STAGE1, 'checkpoint')).eval()

        metrics = tm.MetricCollection(
            [
             tm.AUROC(num_classes=self.hparams.num_classes, compute_on_step=False, average=None), 
             tm.ROC(num_classes=self.hparams.num_classes, compute_on_step=False),
            ]
        )
        self.metricsF = metrics.clone()
        self.metricsJ = metrics.clone()

    def forward(self, noise):
        """Run the generator on ``noise`` (here: a batch of low-resolution images)."""
        return self.generator(noise)

    def training_step(self, batch, batch_idx, optimizer_idx):
        # The labels are stashed on the instance because `_get_noise` needs them but is
        # called from code that only passes the batch size.
        real, self.labels = batch

        # Train discriminator
        result = None
        if optimizer_idx == 0:
            result = self._disc_step(real)

        # Train generator
        if optimizer_idx == 1:
            result = self._gen_step(real)

        return result

    def _disc_step(self, real):
        disc_loss = self._get_disc_loss(real)
        self.log('Stage2/D/train/loss', disc_loss, on_epoch=True)
        return disc_loss

    def _gen_step(self, real):
        gen_loss = self._get_gen_loss(real)
        self.log('Stage2/G/train/loss', gen_loss, on_epoch=True)
        return gen_loss

    def _get_gen_loss(self, real: torch.Tensor) -> torch.Tensor:
        """Generator loss: discriminator output on generated images against the label 1."""
        # Train with fake
        fake_pred = self._get_fake_pred(real)
        fake_gt = torch.ones_like(fake_pred)
        gen_loss = self.criterion(fake_pred, fake_gt)

        # class_pred =  self._get_class_pred(len(real))
        # gen_loss += F.cross_entropy(class_pred, self.labels)

        return gen_loss

    def _get_class_pred(self, batch_size) -> torch.Tensor:
        """Return the F classifier's logits for ``batch_size`` freshly generated images."""
        return self.modelF.backbone(self(self._get_noise(batch_size, self.hparams.latent_dim)))

    def _get_noise(self, n_samples: int, latent_dim: int, labels = None):
        """Return low-resolution images from the Stage1 model to feed the generator.

        ``labels`` defaults to the labels of the most recent training batch.
        """
        # can't use self in function definition
        if labels is None:
            labels = self.labels
        return self.lowres(super()._get_noise(n_samples, latent_dim), labels)

    def validation_step(self, batch, batch_idx):
        imgs, labels = batch
        out = self(self._get_noise(labels.shape[0], self.hparams.latent_dim, labels))
        # The metrics are given class probabilities rather than raw logits, matching
        # what LensResnet and Stage1 feed to their own AUROC / ROC computations.
        self.metricsF.update(self.modelF(out, prob=True), labels)
        self.metricsJ.update(self.modelJ(out, prob=True), labels)

    def validation_epoch_end(self, listofDicts):
        """Log per-classifier AUROC, save the ROC figure and one generated image per class."""
        fig, ax = plt.subplots(1,2, 
            subplot_kw={'xlim': [0,1], 'xlabel': 'False Positive Rate', 'ylim': [0,1.05], 
                        'ylabel': 'True Positive Rate',
            },
            figsize=[11, 5],
        )
        for j, letter in enumerate(['F', 'J']):
            metrics = getattr(self, 'metrics' + letter)
            output = metrics.compute()
            # Start the next epoch from an empty state; the collection is not logged
            # itself, so nothing else resets it.
            metrics.reset()
            self.log('Stage2/ResNet(' + letter + ')/val/auroc', output['AUROC'].min())
            fprList, tprList, _ = output['ROC']

            colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
            for i, color in zip(range(self.hparams.num_classes), colors):
                ax[j].plot(fprList[i].cpu(), tprList[i].cpu(), color=color,
                        label='ROC curve of class {0} (area = {1:0.2f})'
                        ''.format(i, output['AUROC'][i]))
            post_plotting(ax[j])
            ax[j].set_title('Multi-class ROC (' + letter + ')')

        fig.tight_layout()
        # Save before logging: add_figure closes the figure by default.
        fig.savefig(os.path.join(str(self.trainer.log_dir),
                                 'ROC_epoch_{:02d}.pdf'.format(self.current_epoch)))
        self.logger.experiment.add_figure('Stage2/ResNet/val/ROC', fig, global_step=self.current_epoch)

        labels = torch.arange(self.hparams.num_classes, device=self.device)
        save_image(self(self._get_noise(labels.shape[0], self.hparams.latent_dim, labels)), 
                   str(self.trainer.log_dir) + '/Fake_epoch_{:02d}.pdf'.format(self.current_epoch), 
                  #  kwargs for make_grid
                   normalize=True, value_range=(-1,1))

    def on_fit_end(self):
        # Drop the helper models and the cached labels once fitting is over. `labels`
        # only exists after at least one training step, so every attribute is checked.
        for name in ('modelF', 'modelJ', 'labels', 'lowres'):
            if hasattr(self, name):
                delattr(self, name)

## StackGAN:
Here we define the full GAN module that we shall use to generate representative images.

In [None]:
class StackGAN(pl.LightningModule):
    """Two-stage GAN trained jointly with an auxiliary classifier.

    ``G1``/``D1`` form a class-conditional DCGAN; ``G2`` refines the (detached) output
    of ``G1`` and is judged by ``D2``. ``R`` is a ``LensResnet`` trained to classify
    real images and to assign generated ones to the extra class ``num_classes``.
    ``pretrained`` is a ``LensResnet`` restored from a checkpoint and used only to
    score generated images during validation.

    Args:
        config: dict with ``'feature_maps'`` and ``'learning_rate'``.
        noise_size: dimension of the latent vector and of the label embedding.
        image_width: size real images are resized to before entering ``D1``.
        num_classes: number of real image classes.
        image_channels: channels of the images.
        b1: first Adam beta used by all five optimizers.
        **kwargs: accepted and ignored.
    """

    def __init__(self, config, noise_size: int = 100, image_width = 64,
                    num_classes: int = 3, image_channels: int = 1, b1: float = 0.5, **kwargs):
        super().__init__()
        self.save_hyperparameters(ignore = config)
        self.feature_maps = config['feature_maps']
        self.lr = config['learning_rate']
        # LensResnet reads its learning rate from 'lr'; supply it from this module's
        # rate unless the caller's config already provides one.
        classifier_config = {'lr': self.lr, **config}
        # -------------------------------------
        # Need to create a subclass because we couldn't simply add/remove a layer;
        # there are two inputs of the superclass' forward method.
        self.G1 = DCGANGenerator(self.hparams.noise_size, self.feature_maps, self.hparams.image_channels).apply(self._weights_init)
        # Rebuild the first generator block without its second layer.
        first_block = list(self.G1.gen[0])
        del first_block[1]
        self.G1.gen[0] = nn.Sequential(*first_block)
        self.G1.add_module('label_emb', nn.Embedding(self.hparams.num_classes, self.hparams.noise_size))
        # ------------------------------------
        self.D1 = DCGANDiscriminator(self.feature_maps, self.hparams.image_channels).apply(self._weights_init)
        # -------------------------------------
        # Keyword arguments: Generator2's first positional parameter is `ngf`, not the
        # number of image channels.
        self.G2 = Generator2(ngf=self.feature_maps, image_channels=self.hparams.image_channels).apply(self._weights_init)
        # -------------------------------------
        self.D2 = DCGANDiscriminator(self.feature_maps, self.hparams.image_channels)
        #  steps to mutate the instance, not the class definition
        extra = self.D2._make_disc_block(self.feature_maps * 2, self.feature_maps * 2)
        blocks = list(self.D2.disc)
        blocks.insert(2, extra)
        self.D2.disc = nn.Sequential(*blocks)
        self.D2.apply(self._weights_init)
        # No need for subclassing as the forward method need not be modified.
        # -------------------------------------
        # NOTE(review): training_step uses class index `num_classes` as the "generated"
        # class, so the backbone loaded by LensResnet must output num_classes + 1
        # logits - confirm.
        self.R = LensResnet(classifier_config, num_classes = 4).apply(self._weights_init)
        # -------------------------------------
        self.pretrained = LensResnet(classifier_config)
        ckpt = pl_load(os.path.join(
            '/content/drive/MyDrive/Logs/tune_LensResnet_asha_model_j/train_LensResnet_tune_checkpoint_e38cb_00000_0_batch_size=128,learning_rate=0.001_2021-07-06_17-52-11/checkpoint_epoch=17-step=1406',
            # '/content/drive/MyDrive/Logs/tune_LensResnet_asha_model_f/train_LensResnet_tune_checkpoint_e32ba_00000_0_batch_size=64,learning_rate=0.0001_2021-07-06_03-33-10/checkpoint_epoch=14-step=4689',
            'checkpoint'),
            map_location=lambda storage, loc: storage)
        # Load the weights into this instance. `_load_model_state` is a classmethod
        # that builds and returns a *new* model, so calling it and dropping the result
        # leaves `self.pretrained` untouched.
        self.pretrained.load_state_dict(ckpt['state_dict'])
        # -------------------------------------
        self.criterion1 = nn.BCELoss()
        self.criterion2 = nn.CrossEntropyLoss()

    @staticmethod
    def _weights_init(m):
        """DCGAN-style init: N(0, 0.02) for conv weights, N(1, 0.02) / zero bias for batch norm."""
        classname = m.__class__.__name__
        if classname.find('Conv') != -1:
            torch.nn.init.normal_(m.weight, 0.0, 0.02)
        elif classname.find('BatchNorm') != -1:
            torch.nn.init.normal_(m.weight, 1.0, 0.02)
            torch.nn.init.zeros_(m.bias)

    def forward(self, noise, labels = None):
        """Return ``(high_res, low_res)`` images for ``noise``; labels are random if omitted."""
        if labels is None:
            # last dimension of `noise` is the hidden dimension; the labels are created
            # on the same device as the noise so the embedding lookup works off-CPU
            labels = torch.randint(self.hparams.num_classes, noise.shape[:-1], device=noise.device)
        inp = torch.mul(noise, self.G1.label_emb(labels))
        out1 = self.G1(inp.view(-1, inp.shape[-1], 1, 1))
        # G2 sees a detached copy, so its losses do not reach G1 through this path.
        out2 = self.G2(out1.detach())
        return out2, out1

    def training_step(self, batch, batch_idx, optimizer_idx):
        imgs, labels = batch
        temp2, temp1 = self(torch.randn(labels.shape[0], self.hparams.noise_size).type_as(imgs), labels)

        # The classification terms are added out of place (`loss = loss + ...`): the
        # adversarial term has already been handed to self.log and must not be mutated.
        if optimizer_idx == 0:
            # G1: fool D1, and make G2(G1(z)) classifiable by R.
            loss = self.criterion1(self.D1(temp1), torch.ones_like(labels, dtype=torch.float32))
            self.log('G1/train/loss/disc', loss)
            loss = loss + self.criterion2(self.R.backbone(self.G2(temp1)), labels)
            self.log('G1/train/loss/full', loss)

        elif optimizer_idx == 1:
            # D1: real (resized) images -> 1, generated low-res images -> 0.
            real, fake = self.D1(F.interpolate(imgs, self.hparams.image_width, mode='nearest')), self.D1(temp1.detach())
            prediction, target = torch.cat((real, fake)), torch.cat((torch.ones_like(real),torch.zeros_like(fake)))
            loss = self.criterion1(prediction, target)
            self.log('D1/train/loss', loss)

        elif optimizer_idx == 2:
            # G2: fool D2, and be classifiable by R.
            loss = self.criterion1(self.D2(temp2), torch.ones_like(labels, dtype=torch.float32))
            self.log('G2/train/loss/disc', loss)
            loss = loss + self.criterion2(self.R.backbone(temp2), labels)
            self.log('G2/train/loss/full', loss)

        elif optimizer_idx == 3:
            # D2: real images -> 1, generated high-res images -> 0.
            real, fake = self.D2(imgs), self.D2(temp2.detach())
            prediction, target = torch.cat((real, fake)), torch.cat((torch.ones_like(real),torch.zeros_like(fake)))
            loss = self.criterion1(prediction, target)
            self.log('D2/train/loss', loss)

        elif optimizer_idx == 4:
            # R: real images keep their label, generated ones get the extra class.
            real, fake = self.R.backbone(imgs), self.R.backbone(temp2.detach())
            prediction, target = torch.cat((real, fake)), torch.cat((labels, self.hparams.num_classes * torch.ones_like(labels)))
            loss = self.criterion2(prediction, target)
            self.log('R/train/loss', loss)

        return loss

    def configure_optimizers(self):
        # Order matters: it defines the optimizer_idx values used in training_step.
        betas = (self.hparams.b1, 0.999)
        return tuple(
            torch.optim.Adam(module.parameters(), self.lr, betas)
            for module in (self.G1, self.D1, self.G2, self.D2, self.R)
        )

    def validation_step(self, batch, batch_idx):
        imgs, labels = batch
        temp2, _ = self(torch.randn(labels.shape[0], self.hparams.noise_size).type_as(imgs), labels)
        # AUROC / ROC are computed from class probabilities, not raw logits.
        return {'pred': self.pretrained(temp2.detach(), prob=True), 'target': labels}

    def validation_epoch_end(self, listofDicts):
        """Log the minimum per-class AUROC of the pretrained classifier and save the ROC plot."""
        prediction, target = torch.cat([x['pred'] for x in listofDicts]), torch.cat([x['target'] for x in listofDicts])
        aurocTensor = tm.functional.auroc(prediction, target, num_classes=self.hparams.num_classes, average=None)
        self.log('Pre/val/auroc', aurocTensor.min())
        fprList, tprList, _ = tm.functional.roc(prediction, target, num_classes=self.hparams.num_classes)

        fig, ax = plt.subplots()
        colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
        for i, color in zip(range(self.hparams.num_classes), colors):
            ax.plot(fprList[i].cpu(), tprList[i].cpu(), color=color,
                    label='ROC curve of class {0} (area = {1:0.2f})'
                    ''.format(i, aurocTensor[i].cpu()))
        ax.plot([0, 1], [0, 1], 'k--')
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title('Multi-class ROC')
        ax.legend(loc='lower right')

        # os.path.join puts the file inside the trial directory whether or not that
        # path ends with a separator. Saved before logging because add_figure closes
        # the figure by default.
        fig.savefig(os.path.join(str(tune.get_trial_dir()), 'ROC_epoch_'+str(self.current_epoch)+'.pdf'))
        self.logger.experiment.add_figure('StackGAN/val/ROC', fig, global_step=self.current_epoch)

# Tune Hyperparameters


## ResNet
Here we tune hyperparameters as we train our modified ResNet.

In [None]:
# Delete the logs of the `pbt_tanh` LensResnet experiment on Drive. This is irreversible:
# run it only when the experiment is to be restarted from scratch.
%rm -rf ./drive/MyDrive/Logs/F/LensResnet/pbt_tanh

In [None]:
# __tune_train_checkpoint_begin
def train_LensResnet(config, checkpoint_dir=None, num_epochs=10, num_gpus=torch.cuda.device_count()):
    """Ray Tune trainable: fit one ``LensResnet`` trial with PyTorch Lightning.

    Args:
        config: trial hyperparameters. ``config['bs']`` is read here; the whole dict is
            also handed to ``npyImageData`` and ``LensResnet``.
        checkpoint_dir: directory supplied by Tune when a trial is restored; training
            resumes from the file named ``checkpoint`` inside it.
        num_epochs: value used for the Trainer's ``max_epochs``.
        num_gpus: GPUs for this trial; fractional values are rounded up.

    The validation metrics ``LensResnet/val/loss`` and ``LensResnet/val/auroc`` are
    reported to Tune as ``loss`` and ``auroc`` through ``TuneReportCheckpointCallback``.
    """
    # If fractional GPUs passed in, convert to int.
    gpu_count = math.ceil(num_gpus)

    kwargs = {
        # 'limit_train_batches' : 0.005,
        # 'limit_val_batches' : 0.005,
        # NOTE(review): 8250 looks like a tenth of the dataset size, giving ~10 bar
        # refreshes per epoch — confirm when the dataset changes.
        'progress_bar_refresh_rate' : math.ceil(8250//config['bs']),
        'max_epochs' : num_epochs,
        'prepare_data_per_node' : False,
        'gpus' : gpu_count,
        'logger' : TensorBoardLogger(save_dir=tune.get_trial_dir(), name='', version='.'),
        'callbacks' : [
            TuneReportCheckpointCallback(
                {
                    'loss': 'LensResnet/val/loss',
                    'auroc': 'LensResnet/val/auroc',
                },
            ),
            ModuleDataMonitor(['backbone.layer2', 'backbone.layer4', 'backbone.fc']),
            ConfusedLogitCallback(5),
        ],
        # SWA works with only one optimizer.
        'stochastic_weight_avg' : True,
        'benchmark' : True,
        # 16-bit precision can't be used on cpu, so fall back to 32-bit when no GPU is assigned.
        'precision' : 16 if gpu_count > 0 else 32,
        # 'track_grad_norm': 2,
        # 'gradient_clip_val' : 0.5,
        # 'gradient_clip_algorithm' : 'value',
    }

    dm = npyImageData(config)                                              # Specify image width here
    if checkpoint_dir is not None:
        # Tune restored this trial: continue from the saved Lightning checkpoint.
        kwargs['resume_from_checkpoint'] = os.path.join(checkpoint_dir, 'checkpoint')

    model = LensResnet(config)
    trainer = pl.Trainer(**kwargs)

    trainer.fit(model, dm)
# __tune_train_checkpoint_end__

# __tune_pbt_begin__
def tune_LensResnet_pbt(num_samples=10, num_epochs=10, gpus_per_trial=torch.cuda.device_count()):
    """Run a Population Based Training search over ``train_LensResnet`` with Ray Tune.

    Trials start from ``lr=1e-5`` and ``bs=8``; PBT perturbs both after every training
    iteration and ranks trials by the reported ``loss`` (lower is better). Results are
    written under ``./drive/MyDrive/Logs/J/LensResnet/pbt_tanh_fine``. Each trial
    reserves every CPU core plus ``gpus_per_trial`` GPUs.

    Args:
        num_samples: passed to ``tune.run`` as ``num_samples``.
        num_epochs: epochs per trial, forwarded to ``train_LensResnet``.
        gpus_per_trial: GPUs reserved per trial, also forwarded as ``num_gpus``.

    Returns:
        ``analysis.best_checkpoint`` for the finished experiment (it is also printed),
        so callers can reuse it instead of copying the path from the output.
    """
    analysis = tune.run(
        tune.with_parameters(
            train_LensResnet,
            num_epochs=num_epochs,
            num_gpus=gpus_per_trial
        ),
        # Change the folder name when changing dataset--------------------------------------------------------------------------
        name='J/LensResnet/pbt_tanh_fine',
        metric='loss',
        mode='min',
        # stop=TrialPlateauStopper('auroc'),
        resources_per_trial={'cpu': os.cpu_count(), 'gpu': gpus_per_trial},
        local_dir='./drive/MyDrive/Logs',
        # config={'lr': tune.choice([1e-4, 1e-3, 1e-5, 1e-2, 1e-6, 1e-1, 1e-7]),
        #         'bs': tune.grid_search([8, 16, 32, 64, 128]),
        #         },
        # scheduler = pbtScheduler(max_t=num_epochs, grace_period=2, reduction_factor=2),
        # Can't use PB2 as it requires mutations to be continuous
        config={'lr': 1e-5,
                'bs': 8,
                # Error seen earlier (kept for reference):
                # RuntimeError: stack expects each tensor to be equal size, but got [128] at entry 0 and [120] at entry 585
                },
        scheduler = PopulationBasedTraining(time_attr='training_iteration', quantile_fraction=0.4,
                                            resample_probability=0.2,  perturbation_interval=1,
                                            hyperparam_mutations={
                                                'lr': tune.loguniform(1e-6, 1e-4),
                                                'bs': [8, 16, 32, 64, 128],
                                            },
        ),
        progress_reporter=JupyterNotebookReporter(
            overwrite=False,
            parameter_columns=['lr', 'bs'],
            metric_columns=['loss', 'auroc', 'training_iteration'],
        ),
        fail_fast = True,
        # reuse_actors=True,
        num_samples=num_samples,
        # resume='PROMPT',
    )
    # Previously bound to a local that vanished on return; hand it back to the caller instead.
    best_checkpoint = analysis.best_checkpoint
    print('Best checkpoint path found is: ', best_checkpoint)
    return best_checkpoint

# __tune_pbt_end__

if __name__ == '__main__':
    import argparse

    # parse_known_args ignores any extra arguments the notebook kernel was started with.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '--smoke-test', action='store_true', help='Finish quickly for testing')
    known_args, _ = cli.parse_known_args()

    # Both modes run a single PBT sample on every visible GPU; only the epoch count
    # differs (6 for the smoke test, 5 for the regular pbt scheduler run).
    epochs = 6 if known_args.smoke_test else 5
    tune_LensResnet_pbt(num_samples=1, num_epochs=epochs, gpus_per_trial=torch.cuda.device_count())

Trial name,status,loc,lr,bs
train_LensResnet_93609_00000,PENDING,,1e-05,8


[2m[36m(pid=2129)[0m Using native 16bit precision.
[2m[36m(pid=2129)[0m GPU available: True, used: True
[2m[36m(pid=2129)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=2129)[0m IPU available: False, using: 0 IPUs
[2m[36m(pid=2129)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2m[36m(pid=2129)[0m 
[2m[36m(pid=2129)[0m   | Name          | Type             | Params
[2m[36m(pid=2129)[0m ---------------------------------------------------
[2m[36m(pid=2129)[0m 0 | backbone      | ResNet           | 11.2 M
[2m[36m(pid=2129)[0m 1 | train_metrics | MetricCollection | 0     
[2m[36m(pid=2129)[0m 2 | val_metrics   | MetricCollection | 0     
[2m[36m(pid=2129)[0m ---------------------------------------------------
[2m[36m(pid=2129)[0m 11.2 M    Trainable params
[2m[36m(pid=2129)[0m 0         Non-trainable params
[2m[36m(pid=2129)[0m 11.2 M    Total params
[2m[36m(pid=2129)[0m 44.687    Total estimated model params size (MB)
[2m[36m(pid=21

[2m[36m(pid=2129)[0m Validation sanity check: 0it [00:00, ?it/s]Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(pid=2129)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
[2m[36m(pid=2129)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Epoch 0:   0%|          | 0/10313 [00:00<00:02, 4120.14it/s]  
Epoch 0:  10%|▉         | 1031/10313 [01:32<13:48, 11.21it/s, loss=1.63, v_num=., LensResnet/train/AUROC=0.695]
Epoch 0:  10%|▉         | 1031/10313 [01:48<16:17,  9.49it/s, loss=1.63, v_num=., LensResnet/train/AUROC=0.695]
Epoch 0:  20%|█▉        | 2062/10313 [03:06<12:25, 11.07it/s, loss=1.28, v_num=., LensResnet/train/AUROC=0.714]
Epoch 0:  20%|█▉        | 2062/10313 [03:18<13:14, 10.38it/s, loss=1.28, v_num=., LensResnet/train/AUROC=0.714]
Epoch 0:  30%|██▉       | 3093/10313 [04:45<11:06, 10.84it/s, loss=0.945, v_num=., LensResnet/train/AUROC=0.735]
Epoch 0:  30%|██▉       | 3093/10313 [04:58<11:37, 10.36it/s, loss=0.945, v_num=., LensResnet/train/AUROC=0.735]
Epoch 0:  40%|███▉      | 4124/10313 [06:29<09:44, 10.60it/s, loss=0.958, v_num=., LensResnet/train/AUROC=0.754]
Epoch 0:  50%|████▉     | 5155/10313 [08:16<08:16, 10.39it/s, loss=1.02, v_num=., LensResnet/train/AUROC=0.773] 
Epoch 0:  60%|█████▉    | 6186/10313 

[2m[36m(pid=2129)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_LensResnet_93609_00000:
  auroc: 0.9341036677360535
  date: 2021-09-02_21-23-39
  done: false
  experiment_id: 657e38bdbb97442d81533414ecfad8f8
  hostname: 77ab158a08cd
  iterations_since_restore: 1
  loss: 0.4412280321121216
  node_ip: 172.28.0.2
  pid: 2129
  should_checkpoint: true
  time_since_restore: 1048.4524285793304
  time_this_iter_s: 1048.4524285793304
  time_total_s: 1048.4524285793304
  timestamp: 1630617819
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: '93609_00000'
  


Trial name,status,loc,lr,bs,loss,auroc,training_iteration
train_LensResnet_93609_00000,RUNNING,172.28.0.2:2129,1e-05,8,0.441228,0.934104,1


[2m[36m(pid=2129)[0m   "The signature of `Callback.on_train_epoch_end` has changed in v1.3."


[2m[36m(pid=2129)[0m Epoch 0: 100%|██████████| 10313/10313 [17:09<00:00, 10.02it/s, loss=0.573, v_num=., LensResnet/train/AUROC=0.823, LensResnet/val/auroc=0.934]
                                                             [A
Epoch 1:   0%|          | 0/10313 [00:00<00:05, 1873.29it/s, loss=0.573, v_num=., LensResnet/train/AUROC=0.823, LensResnet/val/auroc=0.934]
Epoch 1:  10%|▉         | 1031/10313 [02:16<20:30,  7.54it/s, loss=0.699, v_num=., LensResnet/train/AUROC=0.833, LensResnet/val/auroc=0.934]
Epoch 1:  20%|█▉        | 2062/10313 [04:40<18:43,  7.34it/s, loss=0.555, v_num=., LensResnet/train/AUROC=0.841, LensResnet/val/auroc=0.934]
Epoch 1:  30%|██▉       | 3093/10313 [07:03<16:28,  7.30it/s, loss=0.515, v_num=., LensResnet/train/AUROC=0.849, LensResnet/val/auroc=0.934]
Epoch 1:  40%|███▉      | 4124/10313 [09:32<14:18,  7.21it/s, loss=0.378, v_num=., LensResnet/train/AUROC=0.855, LensResnet/val/auroc=0.934]
Epoch 1:  50%|████▉     | 5155/10313 [12:06<12:06,  7.10it/s, lo

[2m[36m(pid=2129)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_LensResnet_93609_00000:
  auroc: 0.9561334848403931
  date: 2021-09-02_21-47-49
  done: false
  experiment_id: 657e38bdbb97442d81533414ecfad8f8
  hostname: 77ab158a08cd
  iterations_since_restore: 2
  loss: 0.3701208233833313
  node_ip: 172.28.0.2
  pid: 2129
  should_checkpoint: true
  time_since_restore: 2498.2101757526398
  time_this_iter_s: 1449.7577471733093
  time_total_s: 2498.2101757526398
  timestamp: 1630619269
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: '93609_00000'
  


Trial name,status,loc,lr,bs,loss,auroc,training_iteration
train_LensResnet_93609_00000,RUNNING,172.28.0.2:2129,1e-05,8,0.370121,0.956133,2


[2m[36m(pid=2129)[0m Epoch 1: 100%|██████████| 10313/10313 [24:08<00:00,  7.12it/s, loss=0.396, v_num=., LensResnet/train/AUROC=0.883, LensResnet/val/auroc=0.956]
                                                             [A
Epoch 2:   0%|          | 0/10313 [00:00<00:04, 2420.26it/s, loss=0.396, v_num=., LensResnet/train/AUROC=0.883, LensResnet/val/auroc=0.956]




Epoch 2:  10%|▉         | 1031/10313 [03:08<28:15,  5.47it/s, loss=0.321, v_num=., LensResnet/train/AUROC=0.888, LensResnet/val/auroc=0.956]
Epoch 2:  20%|█▉        | 2062/10313 [06:18<25:13,  5.45it/s, loss=0.468, v_num=., LensResnet/train/AUROC=0.892, LensResnet/val/auroc=0.956]
Epoch 2:  30%|██▉       | 3093/10313 [09:28<22:07,  5.44it/s, loss=0.388, v_num=., LensResnet/train/AUROC=0.895, LensResnet/val/auroc=0.956]
Epoch 2:  40%|███▉      | 4124/10313 [12:44<19:06,  5.40it/s, loss=0.594, v_num=., LensResnet/train/AUROC=0.899, LensResnet/val/auroc=0.956]
Epoch 2:  50%|████▉     | 5155/10313 [16:03<16:03,  5.35it/s, loss=0.348, v_num=., LensResnet/train/AUROC=0.902, LensResnet/val/auroc=0.956]
Epoch 2:  60%|█████▉    | 6186/10313 [19:25<12:57,  5.31it/s, loss=0.404, v_num=., LensResnet/train/AUROC=0.905, LensResnet/val/auroc=0.956]
Epoch 2:  70%|██████▉   | 7217/10313 [22:49<09:47,  5.27it/s, loss=0.374, v_num=., LensResnet/train/AUROC=0.908, LensResnet/val/auroc=0.956]
Epoch 2:  80%

In [None]:
# Print the error log of one specific earlier trial; the trial directory is hard-coded,
# so this only works if that run's folder still exists on the mounted Drive.
!cat /content/drive/MyDrive/Logs/F/LensResnet/pbt_tanh_finetune/train_LensResnet_ed4b1_00000_0_2021-08-30_02-56-52/error.txt

## Stage 1
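Here we use Ray Tune's Population Based Training (PBT) to tune the learning rate and batch size, over a grid of `n_fmaps` values, while training our modified DCGAN.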

In [11]:
# Recursively delete earlier Tune results under F/Stage1/pbt_tanh — the experiment
# name and log directory used by tune_Stage1_pbt below. This is irreversible.
%rm -rf ./drive/MyDrive/Logs/F/Stage1/pbt_tanh/

In [12]:
# __tune_train_checkpoint_begin
def train_Stage1(config, checkpoint_dir=None, num_epochs=10, num_gpus=torch.cuda.device_count()):
    """Ray Tune trainable: fit one ``Stage1`` trial with PyTorch Lightning.

    Only 10% of the training and validation batches are used per epoch
    (``limit_train_batches`` / ``limit_val_batches``).

    Args:
        config: trial hyperparameters. ``config['bs']`` is read here; the whole dict is
            also handed to ``npyImageData`` and ``Stage1``.
        checkpoint_dir: directory supplied by Tune when a trial is restored; training
            resumes from the file named ``checkpoint`` inside it.
        num_epochs: value used for the Trainer's ``max_epochs``.
        num_gpus: GPUs for this trial; fractional values are rounded up.

    Generator/discriminator losses, both FID values and both AUROC values are reported
    to Tune through ``TuneReportCheckpointCallback`` under the short names listed below.
    """
    # If fractional GPUs passed in, convert to int.
    gpu_count = math.ceil(num_gpus)

    kwargs = {
        'limit_train_batches' : 0.1,
        'limit_val_batches' : 0.1,
        # NOTE(review): 8250*0.1 appears to scale the refresh interval to the 10% batch
        # limit above — confirm when the dataset or the limits change.
        'progress_bar_refresh_rate' : math.ceil(8250*0.1//config['bs']),
        'max_epochs' : num_epochs,
        'prepare_data_per_node' : False,
        'gpus' : gpu_count,
        'logger' : TensorBoardLogger(save_dir=tune.get_trial_dir(), name='', version='.'),
        'callbacks' : [
            TuneReportCheckpointCallback(
                {
                    'loss_G': 'Stage1/G/train/loss',
                    'loss_D': 'Stage1/D/train/loss',
                    # Switch up the FID values when training on different dataset -----------------------------------------------
                    'FID': 'Stage1/val/FID_F',
                    'FID_cross': 'Stage1/val/FID_J',
                    'auroc': 'Stage1/LensResnet(F)/val/auroc',
                    'auroc_cross': 'Stage1/LensResnet(J)/val/auroc',
                },
            ),
            ModuleDataMonitor(True),
        ],
        # SWA works with only one optimizer, so it stays disabled here.
        # 'stochastic_weight_avg' : True,
        'benchmark' : True,
        # 16-bit precision can't be used on cpu, so fall back to 32-bit when no GPU is assigned.
        'precision' : 16 if gpu_count > 0 else 32,
        # 'gradient_clip_val' : 0.5,
        # 'gradient_clip_algorithm' : 'value',
    }

    dm = npyImageData(config, 64)                                              # Specify image width here
    if checkpoint_dir is not None:
        # Tune restored this trial: continue from the saved Lightning checkpoint.
        kwargs['resume_from_checkpoint'] = os.path.join(checkpoint_dir, 'checkpoint')

    model = Stage1(config)
    trainer = pl.Trainer(**kwargs)

    trainer.fit(model, dm)
# __tune_train_checkpoint_end__

# __tune_pbt_begin__
def tune_Stage1_pbt(num_samples=10, num_epochs=10, gpus_per_trial=torch.cuda.device_count()):
    """Run a Population Based Training search over ``train_Stage1`` with Ray Tune.

    The initial population is a grid over ``n_fmaps`` (8, 16, 32, 64) with ``lr=1e-4``
    and ``bs=8``; PBT perturbs ``lr`` and ``bs`` after every training iteration and
    ranks trials by the reported ``auroc`` (higher is better). Results are written
    under ``./drive/MyDrive/Logs/F/Stage1/pbt_tanh``. Each trial reserves every CPU
    core plus ``gpus_per_trial`` GPUs.

    Args:
        num_samples: passed to ``tune.run`` as ``num_samples``.
        num_epochs: epochs per trial, forwarded to ``train_Stage1``.
        gpus_per_trial: GPUs reserved per trial, also forwarded as ``num_gpus``.

    Returns:
        ``analysis.best_checkpoint`` for the finished experiment (it is also printed),
        so callers can reuse it instead of copying the path from the output.
    """
    analysis = tune.run(
        tune.with_parameters(
            train_Stage1,
            num_epochs=num_epochs,
            num_gpus=gpus_per_trial
        ),
        # Change the folder name when changing dataset--------------------------------------------------------------------------
        name='F/Stage1/pbt_tanh',
        metric='auroc',
        mode='max',
        # stop=TrialPlateauStopper('FID'),
        resources_per_trial={'cpu': os.cpu_count(), 'gpu': gpus_per_trial},
        local_dir='./drive/MyDrive/Logs',
        # config={'lr': tune.choice([1e-4, 1e-3, 1e-5, 1e-2, 1e-6, 1e-1, 1e-7]),
        #         'bs': tune.grid_search([8, 16, 32, 64, 128]),
        #         },
        # scheduler = pbtScheduler(max_t=num_epochs, grace_period=2, reduction_factor=2),
        # Can't use PB2 as it requires mutations to be continuous
        config={'lr': 1e-4,
                'n_fmaps': tune.grid_search([8, 16, 32, 64]),
                'bs': 8,
                },
        # config = {'lr': 2.340983544823817e-05, 'n_fmaps': 32, 'bs': 8},
        scheduler = PopulationBasedTraining(time_attr='training_iteration', quantile_fraction=0.25,
                                            resample_probability=0.25,  perturbation_interval=1,
                                            hyperparam_mutations={
                                                'lr': tune.loguniform(1e-7, 1e-1),
                                                'bs': [8, 16, 32, 64, 128],
                                            },
        ),
        progress_reporter=JupyterNotebookReporter(
            overwrite=False,
            parameter_columns=['lr', 'n_fmaps', 'bs'],
            metric_columns=['loss_G', 'loss_D', 'FID', 'auroc', 'FID_cross',
                            'auroc_cross', 'training_iteration'],
        ),
        fail_fast = True,
        # reuse_actors=True,
        num_samples=num_samples,
        # resume='PROMPT',
        # restore=BEST_J_STAGE1,
    )
    # Previously bound to a local that vanished on return; hand it back to the caller instead.
    best_checkpoint = analysis.best_checkpoint
    print('Best checkpoint path found is: ', best_checkpoint)
    return best_checkpoint

# __tune_pbt_end__

if __name__ == '__main__':
    import argparse

    # parse_known_args ignores any extra arguments the notebook kernel was started with.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '--smoke-test', action='store_true', help='Finish quickly for testing')
    known_args, _ = cli.parse_known_args()

    # Both modes run a single sample of the search on every visible GPU; only the epoch
    # count differs (6 for the smoke test, 10 for the regular pbt scheduler run).
    epochs = 6 if known_args.smoke_test else 10
    tune_Stage1_pbt(num_samples=1, num_epochs=epochs, gpus_per_trial=torch.cuda.device_count())

Trial name,status,loc,lr,n_fmaps,bs
train_Stage1_70527_00000,RUNNING,,0.0001,8,8
train_Stage1_70527_00001,PENDING,,0.0001,16,8
train_Stage1_70527_00002,PENDING,,0.0001,32,8
train_Stage1_70527_00003,PENDING,,0.0001,64,8


[2m[36m(pid=787)[0m Using native 16bit precision.
[2m[36m(pid=787)[0m GPU available: True, used: True
[2m[36m(pid=787)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=787)[0m IPU available: False, using: 0 IPUs
[2m[36m(pid=787)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Trial name,status,loc,lr,n_fmaps,bs
train_Stage1_70527_00000,RUNNING,,0.0001,8,8
train_Stage1_70527_00001,PENDING,,0.0001,16,8
train_Stage1_70527_00002,PENDING,,0.0001,32,8
train_Stage1_70527_00003,PENDING,,0.0001,64,8


[2m[36m(pid=787)[0m 
[2m[36m(pid=787)[0m   | Name          | Type               | Params
[2m[36m(pid=787)[0m -----------------------------------------------------
[2m[36m(pid=787)[0m 0 | generator     | DCGANGenerator     | 146 K 
[2m[36m(pid=787)[0m 1 | discriminator | DCGANDiscriminator | 44.4 K
[2m[36m(pid=787)[0m 2 | criterion     | BCEWithLogitsLoss  | 0     
[2m[36m(pid=787)[0m 3 | modelF        | ResNet             | 11.2 M
[2m[36m(pid=787)[0m 4 | lastF         | Sequential         | 1.5 K 
[2m[36m(pid=787)[0m 5 | modelJ        | ResNet             | 11.2 M
[2m[36m(pid=787)[0m 6 | lastJ         | Sequential         | 1.5 K 
[2m[36m(pid=787)[0m 7 | val_metrics   | MetricCollection   | 22.3 M
[2m[36m(pid=787)[0m -----------------------------------------------------
[2m[36m(pid=787)[0m 22.5 M    Trainable params
[2m[36m(pid=787)[0m 0         Non-trainable params
[2m[36m(pid=787)[0m 22.5 M    Total params
[2m[36m(pid=787)[0m 90.136    

[2m[36m(pid=787)[0m Validation sanity check: 0it [00:00, ?it/s]Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(pid=787)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
[2m[36m(pid=787)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Epoch 0:   0%|          | 0/1030 [00:00<00:00, 2839.75it/s]  
Epoch 0:  10%|█         | 103/1030 [00:07<01:06, 13.91it/s, loss=1.39, v_num=.]
Epoch 0:  20%|██        | 206/1030 [00:14<00:58, 14.19it/s, loss=1.85, v_num=.]
Epoch 0:  30%|███       | 309/1030 [00:21<00:50, 14.26it/s, loss=2.22, v_num=.]
Epoch 0:  40%|████      | 412/1030 [00:28<00:43, 14.31it/s, loss=2.36, v_num=.]
Epoch 0:  50%|█████     | 515/1030 [00:35<00:35, 14.36it/s, loss=2.53, v_num=.]
Epoch 0:  60%|██████    | 618/1030 [00:43<00:28, 14.39it/s, loss=2.83, v_num=.]
Epoch 0:  70%|███████   | 721/1030 [00:50<00:21, 14.41it/s, loss=2.95, v_num=.]
Epoch 0:  80%|████████  | 824/1030 [00:57<00:14, 14.42it/s, loss=3.09, v_num=.]
Epoch 0:  90%|█████████ | 927/1030 [01:04<00:07, 14.44it/s, loss=2.99, v_num=.]
Epoch 0: 100%|██████████| 1030/1030 [01:04<00:00, 15.92it/s, loss=2.99, v_num=.]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/93 [00:00<?, ?it/s][A
[2m[36m(pid=787)[0m 
Validating: 100%|████████

[2m[36m(pid=787)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_Stage1_70527_00000:
  FID: 191.125
  FID_cross: 277.25
  auroc: 0.38902196288108826
  auroc_cross: 0.39847883582115173
  date: 2021-09-07_23-02-12
  done: false
  experiment_id: f4cb33f5595841f8bba99795ccc25185
  hostname: e89b1a816e20
  iterations_since_restore: 1
  loss_D: 0.003334502223879099
  loss_G: 5.904624938964844
  node_ip: 172.28.0.2
  pid: 787
  should_checkpoint: true
  time_since_restore: 81.98236560821533
  time_this_iter_s: 81.98236560821533
  time_total_s: 81.98236560821533
  timestamp: 1631055732
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: '70527_00000'
  


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00000,RUNNING,172.28.0.2:787,0.0001,8,8,5.90462,0.0033345,191.125,0.389022,277.25,0.398479,1.0
train_Stage1_70527_00001,PENDING,,0.0001,16,8,,,,,,,
train_Stage1_70527_00002,PENDING,,0.0001,32,8,,,,,,,
train_Stage1_70527_00003,PENDING,,0.0001,64,8,,,,,,,


[2m[36m(pid=787)[0m 2021-09-07 23:02:12,412	INFO trainable.py:76 -- Checkpoint size is 91896386 bytes
[2m[36m(pid=957)[0m Using native 16bit precision.
[2m[36m(pid=957)[0m GPU available: True, used: True
[2m[36m(pid=957)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=957)[0m IPU available: False, using: 0 IPUs
[2m[36m(pid=957)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2m[36m(pid=957)[0m 
[2m[36m(pid=957)[0m   | Name          | Type               | Params
[2m[36m(pid=957)[0m -----------------------------------------------------
[2m[36m(pid=957)[0m 0 | generator     | DCGANGenerator     | 377 K 
[2m[36m(pid=957)[0m 1 | discriminator | DCGANDiscriminator | 174 K 
[2m[36m(pid=957)[0m 2 | criterion     | BCEWithLogitsLoss  | 0     
[2m[36m(pid=957)[0m 3 | modelF        | ResNet             | 11.2 M
[2m[36m(pid=957)[0m 4 | lastF         | Sequential         | 1.5 K 
[2m[36m(pid=957)[0m 5 | modelJ        | ResNet             | 11.2 M


[2m[36m(pid=957)[0m Validation sanity check: 0it [00:00, ?it/s]Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(pid=957)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Epoch 0:   0%|          | 0/1030 [00:00<00:00, 2451.38it/s] 
Epoch 0:  10%|█         | 103/1030 [00:08<01:11, 12.93it/s, loss=2.03, v_num=.]
Epoch 0:  20%|██        | 206/1030 [00:15<01:03, 13.05it/s, loss=2.45, v_num=.]
Epoch 0:  30%|███       | 309/1030 [00:23<00:55, 13.09it/s, loss=2.84, v_num=.]
Epoch 0:  40%|████      | 412/1030 [00:31<00:47, 13.08it/s, loss=3, v_num=.]   
Epoch 0:  50%|█████     | 515/1030 [00:39<00:39, 13.09it/s, loss=3.14, v_num=.]
Epoch 0:  60%|██████    | 618/1030 [00:47<00:31, 13.06it/s, loss=3.18, v_num=.]
Epoch 0:  70%|███████   | 721/1030 [00:55<00:23, 13.04it/s, loss=3.32, v_num=.]
Epoch 0:  80%|████████  | 824/1030 [01:03<00:15, 13.01it/s, loss=3.48, v_num=.]
Epoch 0:  90%|█████████ | 927/1030 [01:11<00:07, 13.01it/s, loss=3.49, v_num=.]
Epoch 0: 100%|██████████| 1030/1030 [01:11<00:00, 14.36it/s, loss=3.49, v_num=.]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/93 [00:00<?, ?it/s][A
[2m[36m(pid=957)[0m 
Validating: 100%|█████████

[2m[36m(pid=957)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_Stage1_70527_00001:
  FID: 30.765625
  FID_cross: 40.96875
  auroc: 0.4423384666442871
  auroc_cross: 0.42358049750328064
  date: 2021-09-07_23-03-45
  done: false
  experiment_id: 2dfd70ad19c84d218b50ad9c615885cf
  hostname: e89b1a816e20
  iterations_since_restore: 1
  loss_D: 0.0015495466068387032
  loss_G: 6.983859062194824
  node_ip: 172.28.0.2
  pid: 957
  should_checkpoint: true
  time_since_restore: 89.65257692337036
  time_this_iter_s: 89.65257692337036
  time_total_s: 89.65257692337036
  timestamp: 1631055825
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: '70527_00001'
  


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00001,RUNNING,172.28.0.2:957,0.0001,16,8,6.98386,0.00154955,30.7656,0.442338,40.9688,0.42358,1.0
train_Stage1_70527_00000,PAUSED,,0.0001,8,8,5.90462,0.0033345,191.125,0.389022,277.25,0.398479,1.0
train_Stage1_70527_00002,PENDING,,0.0001,32,8,,,,,,,
train_Stage1_70527_00003,PENDING,,0.0001,64,8,,,,,,,


[2m[36m(pid=957)[0m 2021-09-07 23:03:45,402	INFO trainable.py:76 -- Checkpoint size is 96244802 bytes
[2m[36m(pid=957)[0m 2021-09-07 23:03:45,795	INFO trainable.py:76 -- Checkpoint size is 96244802 bytes


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00002,RUNNING,,0.0001,32,8,,,,,,,
train_Stage1_70527_00000,PAUSED,,0.0001,8,8,5.90462,0.0033345,191.125,0.389022,277.25,0.398479,1.0
train_Stage1_70527_00001,PAUSED,,0.0001,16,8,6.98386,0.00154955,30.7656,0.442338,40.9688,0.42358,1.0
train_Stage1_70527_00003,PENDING,,0.0001,64,8,,,,,,,


[2m[36m(pid=1044)[0m Using native 16bit precision.
[2m[36m(pid=1044)[0m GPU available: True, used: True
[2m[36m(pid=1044)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=1044)[0m IPU available: False, using: 0 IPUs
[2m[36m(pid=1044)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00002,RUNNING,,0.0001,32,8,,,,,,,
train_Stage1_70527_00000,PAUSED,,0.0001,8,8,5.90462,0.0033345,191.125,0.389022,277.25,0.398479,1.0
train_Stage1_70527_00001,PAUSED,,0.0001,16,8,6.98386,0.00154955,30.7656,0.442338,40.9688,0.42358,1.0
train_Stage1_70527_00003,PENDING,,0.0001,64,8,,,,,,,


[2m[36m(pid=1044)[0m 
[2m[36m(pid=1044)[0m   | Name          | Type               | Params
[2m[36m(pid=1044)[0m -----------------------------------------------------
[2m[36m(pid=1044)[0m 0 | generator     | DCGANGenerator     | 1.1 M 
[2m[36m(pid=1044)[0m 1 | discriminator | DCGANDiscriminator | 693 K 
[2m[36m(pid=1044)[0m 2 | criterion     | BCEWithLogitsLoss  | 0     
[2m[36m(pid=1044)[0m 3 | modelF        | ResNet             | 11.2 M
[2m[36m(pid=1044)[0m 4 | lastF         | Sequential         | 1.5 K 
[2m[36m(pid=1044)[0m 5 | modelJ        | ResNet             | 11.2 M
[2m[36m(pid=1044)[0m 6 | lastJ         | Sequential         | 1.5 K 
[2m[36m(pid=1044)[0m 7 | val_metrics   | MetricCollection   | 22.3 M
[2m[36m(pid=1044)[0m -----------------------------------------------------
[2m[36m(pid=1044)[0m 24.1 M    Trainable params
[2m[36m(pid=1044)[0m 0         Non-trainable params
[2m[36m(pid=1044)[0m 24.1 M    Total params
[2m[36m(pid=1044

[2m[36m(pid=1044)[0m Validation sanity check: 0it [00:00, ?it/s]Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(pid=1044)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
[2m[36m(pid=1044)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


[2m[36m(pid=1044)[0m                                                               Training: -1it [00:00, ?it/s]Training:   0%|          | 0/1030 [00:00<00:00, 24385.49it/s]Epoch 0:   0%|          | 0/1030 [00:00<00:00, 3975.64it/s]  


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00002,RUNNING,,0.0001,32,8,,,,,,,
train_Stage1_70527_00000,PAUSED,,0.0001,8,8,5.90462,0.0033345,191.125,0.389022,277.25,0.398479,1.0
train_Stage1_70527_00001,PAUSED,,0.0001,16,8,6.98386,0.00154955,30.7656,0.442338,40.9688,0.42358,1.0
train_Stage1_70527_00003,PENDING,,0.0001,64,8,,,,,,,


Epoch 0:  10%|█         | 103/1030 [00:09<01:27, 10.57it/s, loss=2.59, v_num=.]
Epoch 0:  20%|██        | 206/1030 [00:19<01:16, 10.70it/s, loss=2.99, v_num=.]
Epoch 0:  30%|███       | 309/1030 [00:28<01:07, 10.76it/s, loss=2.37, v_num=.]
Epoch 0:  40%|████      | 412/1030 [00:38<00:57, 10.76it/s, loss=3.47, v_num=.]
Epoch 0:  50%|█████     | 515/1030 [00:48<00:48, 10.72it/s, loss=2.3, v_num=.] 
Epoch 0:  60%|██████    | 618/1030 [00:57<00:38, 10.70it/s, loss=1.72, v_num=.]
Epoch 0:  70%|███████   | 721/1030 [01:07<00:28, 10.68it/s, loss=1.67, v_num=.]
Epoch 0:  80%|████████  | 824/1030 [01:17<00:19, 10.66it/s, loss=1.65, v_num=.]
Epoch 0:  90%|█████████ | 927/1030 [01:27<00:09, 10.64it/s, loss=1.66, v_num=.]
Epoch 0: 100%|██████████| 1030/1030 [01:27<00:00, 11.75it/s, loss=1.66, v_num=.]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/93 [00:00<?, ?it/s][A
[2m[36m(pid=1044)[0m 
Validating: 100%|██████████| 93/93 [00:07<00:00, 11.91it/s][A


[2m[36m(pid=1044)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_Stage1_70527_00002:
  FID: 0.70361328125
  FID_cross: 1.5791015625
  auroc: 0.5101351737976074
  auroc_cross: 0.42421388626098633
  date: 2021-09-07_23-05-35
  done: false
  experiment_id: 92ef339ced7541c68ae490a4eb574118
  hostname: e89b1a816e20
  iterations_since_restore: 1
  loss_D: 0.41784751415252686
  loss_G: 2.7400169372558594
  node_ip: 172.28.0.2
  pid: 1044
  should_checkpoint: true
  time_since_restore: 106.1958703994751
  time_this_iter_s: 106.1958703994751
  time_total_s: 106.1958703994751
  timestamp: 1631055935
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: '70527_00002'
  


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00002,RUNNING,172.28.0.2:1044,0.0001,32,8,2.74002,0.417848,0.703613,0.510135,1.5791,0.424214,1.0
train_Stage1_70527_00000,PAUSED,,0.0001,8,8,5.90462,0.0033345,191.125,0.389022,277.25,0.398479,1.0
train_Stage1_70527_00001,PAUSED,,0.0001,16,8,6.98386,0.00154955,30.7656,0.442338,40.9688,0.42358,1.0
train_Stage1_70527_00003,PENDING,,0.0001,64,8,,,,,,,


[2m[36m(pid=1044)[0m 2021-09-07 23:05:35,957	INFO trainable.py:76 -- Checkpoint size is 111134338 bytes
[2m[36m(pid=1044)[0m 2021-09-07 23:05:36,834	INFO trainable.py:76 -- Checkpoint size is 111134338 bytes


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00003,RUNNING,,0.0001,64,8,,,,,,,
train_Stage1_70527_00001,PAUSED,,0.0001,16,8,6.98386,0.00154955,30.7656,0.442338,40.9688,0.42358,1.0
train_Stage1_70527_00002,PAUSED,,0.0001,32,8,2.74002,0.417848,0.703613,0.510135,1.5791,0.424214,1.0
train_Stage1_70527_00000,PENDING,,0.0001,8,8,5.90462,0.0033345,191.125,0.389022,277.25,0.398479,1.0


[2m[36m(pid=1127)[0m Using native 16bit precision.
[2m[36m(pid=1127)[0m GPU available: True, used: True
[2m[36m(pid=1127)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=1127)[0m IPU available: False, using: 0 IPUs
[2m[36m(pid=1127)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2m[36m(pid=1127)[0m 
[2m[36m(pid=1127)[0m   | Name          | Type               | Params
[2m[36m(pid=1127)[0m -----------------------------------------------------
[2m[36m(pid=1127)[0m 0 | generator     | DCGANGenerator     | 3.6 M 
[2m[36m(pid=1127)[0m 1 | discriminator | DCGANDiscriminator | 2.8 M 
[2m[36m(pid=1127)[0m 2 | criterion     | BCEWithLogitsLoss  | 0     
[2m[36m(pid=1127)[0m 3 | modelF        | ResNet             | 11.2 M
[2m[36m(pid=1127)[0m 4 | lastF         | Sequential         | 1.5 K 
[2m[36m(pid=1127)[0m 5 | modelJ        | ResNet             | 11.2 M
[2m[36m(pid=1127)[0m 6 | lastJ         | Sequential         | 1.5 K 
[2m[36m(pid=1127)

[2m[36m(pid=1127)[0m Validation sanity check: 0it [00:00, ?it/s]Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(pid=1127)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00003,RUNNING,,0.0001,64,8,,,,,,,
train_Stage1_70527_00001,PAUSED,,0.0001,16,8,6.98386,0.00154955,30.7656,0.442338,40.9688,0.42358,1.0
train_Stage1_70527_00002,PAUSED,,0.0001,32,8,2.74002,0.417848,0.703613,0.510135,1.5791,0.424214,1.0
train_Stage1_70527_00000,PENDING,,0.0001,8,8,5.90462,0.0033345,191.125,0.389022,277.25,0.398479,1.0


[2m[36m(pid=1127)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Epoch 0:   0%|          | 0/1030 [00:00<00:00, 3248.88it/s]  
Epoch 0:  10%|█         | 103/1030 [00:13<02:04,  7.46it/s, loss=3.27, v_num=.]
Epoch 0:  20%|██        | 206/1030 [00:27<01:49,  7.51it/s, loss=3.86, v_num=.]
Epoch 0:  30%|███       | 309/1030 [00:41<01:36,  7.50it/s, loss=3.77, v_num=.]
Epoch 0:  40%|████      | 412/1030 [00:54<01:22,  7.52it/s, loss=3.96, v_num=.]
Epoch 0:  50%|█████     | 515/1030 [01:08<01:08,  7.52it/s, loss=4.02, v_num=.]
Epoch 0:  60%|██████    | 618/1030 [01:22<00:54,  7.53it/s, loss=4.08, v_num=.]
Epoch 0:  70%|███████   | 721/1030 [01:35<00:41,  7.53it/s, loss=3.88, v_num=.]
Epoch 0:  80%|████████  | 824/1030 [01:49<00:27,  7.55it/s, loss=2.56, v_num=.]
Epoch 0:  90%|█████████ | 927/1030 [02:02<00:13,  7.56it/s, loss=2.88, v_num=.]
Epoch 0: 100%|██████████| 1030/1030 [02:03<00:00,  8.36it/s, loss=2.88, v_num=.]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/93 [00:00<?, ?it/s][A
[2m[36m(pid=1127)[0m 
Validating: 100%|███████

[2m[36m(pid=1127)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_Stage1_70527_00003:
  FID: 13.28125
  FID_cross: 5.4296875
  auroc: 0.4093356430530548
  auroc_cross: 0.4226904809474945
  date: 2021-09-07_23-08-01
  done: false
  experiment_id: 9b670fdf81a34eecb2b1085a111512ff
  hostname: e89b1a816e20
  iterations_since_restore: 1
  loss_D: 0.06155066192150116
  loss_G: 6.270906448364258
  node_ip: 172.28.0.2
  pid: 1127
  should_checkpoint: true
  time_since_restore: 142.04967093467712
  time_this_iter_s: 142.04967093467712
  time_total_s: 142.04967093467712
  timestamp: 1631056081
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: '70527_00003'
  


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00003,RUNNING,172.28.0.2:1127,0.0001,64,8,6.27091,0.0615507,13.2812,0.409336,5.42969,0.42269,1
train_Stage1_70527_00001,PAUSED,,0.0001,16,8,6.98386,0.00154955,30.7656,0.442338,40.9688,0.42358,1
train_Stage1_70527_00002,PAUSED,,0.0001,32,8,2.74002,0.417848,0.703613,0.510135,1.5791,0.424214,1
train_Stage1_70527_00000,PENDING,,0.0001,8,8,5.90462,0.0033345,191.125,0.389022,277.25,0.398479,1


[2m[36m(pid=1127)[0m 2021-09-07 23:08:02,607	INFO trainable.py:76 -- Checkpoint size is 165685954 bytes
2021-09-07 23:08:02,924	INFO trainable.py:76 -- Checkpoint size is 91896386 bytes
[2m[36m(pid=1205)[0m 2021-09-07 23:08:06,494	INFO trainable.py:383 -- Restored on 172.28.0.2 from checkpoint: /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00000_0_n_fmaps=8_2021-09-07_23-00-47/checkpoint_tmp1a4be0/./
[2m[36m(pid=1205)[0m 2021-09-07 23:08:06,494	INFO trainable.py:390 -- Current state after restoring: {'_iteration': 1, '_timesteps_total': None, '_time_total': 81.98236560821533, '_episodes_total': None}


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00000,RUNNING,,0.0001,8,8,5.90462,0.0033345,191.125,0.389022,277.25,0.398479,1
train_Stage1_70527_00002,PAUSED,,0.0001,32,8,2.74002,0.417848,0.703613,0.510135,1.5791,0.424214,1
train_Stage1_70527_00003,PAUSED,,0.0001,64,8,6.27091,0.0615507,13.2812,0.409336,5.42969,0.42269,1
train_Stage1_70527_00001,PENDING,,0.0001,16,8,6.98386,0.00154955,30.7656,0.442338,40.9688,0.42358,1


[2m[36m(pid=1205)[0m Using native 16bit precision.
[2m[36m(pid=1205)[0m GPU available: True, used: True
[2m[36m(pid=1205)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=1205)[0m IPU available: False, using: 0 IPUs
[2m[36m(pid=1205)[0m Restoring states from the checkpoint file at /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00000_0_n_fmaps=8_2021-09-07_23-00-47/checkpoint_tmp1a4be0/./checkpoint
[2m[36m(pid=1205)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00000,RUNNING,,0.0001,8,8,5.90462,0.0033345,191.125,0.389022,277.25,0.398479,1
train_Stage1_70527_00002,PAUSED,,0.0001,32,8,2.74002,0.417848,0.703613,0.510135,1.5791,0.424214,1
train_Stage1_70527_00003,PAUSED,,0.0001,64,8,6.27091,0.0615507,13.2812,0.409336,5.42969,0.42269,1
train_Stage1_70527_00001,PENDING,,0.0001,16,8,6.98386,0.00154955,30.7656,0.442338,40.9688,0.42358,1


[2m[36m(pid=1205)[0m Restored all states from the checkpoint file at /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00000_0_n_fmaps=8_2021-09-07_23-00-47/checkpoint_tmp1a4be0/./checkpoint
[2m[36m(pid=1205)[0m 
[2m[36m(pid=1205)[0m   | Name          | Type               | Params
[2m[36m(pid=1205)[0m -----------------------------------------------------
[2m[36m(pid=1205)[0m 0 | generator     | DCGANGenerator     | 146 K 
[2m[36m(pid=1205)[0m 1 | discriminator | DCGANDiscriminator | 44.4 K
[2m[36m(pid=1205)[0m 2 | criterion     | BCEWithLogitsLoss  | 0     
[2m[36m(pid=1205)[0m 3 | modelF        | ResNet             | 11.2 M
[2m[36m(pid=1205)[0m 4 | lastF         | Sequential         | 1.5 K 
[2m[36m(pid=1205)[0m 5 | modelJ        | ResNet             | 11.2 M
[2m[36m(pid=1205)[0m 6 | lastJ         | Sequential         | 1.5 K 
[2m[36m(pid=1205)[0m 7 | val_metrics   | MetricCollection   | 22.3 M
[2m[36m(pid=1205)[0m ----------------

[2m[36m(pid=1205)[0m Validation sanity check: 0it [00:00, ?it/s]Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(pid=1205)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Epoch 1:   0%|          | 0/1030 [00:00<00:00, 2114.06it/s] 
Epoch 1:  10%|█         | 103/1030 [00:07<01:04, 14.28it/s, loss=3.07, v_num=.]
Epoch 1:  20%|██        | 206/1030 [00:14<00:57, 14.38it/s, loss=3.19, v_num=.]
Epoch 1:  30%|███       | 309/1030 [00:21<00:49, 14.51it/s, loss=3.32, v_num=.]
Epoch 1:  40%|████      | 412/1030 [00:28<00:42, 14.57it/s, loss=3.25, v_num=.]
Epoch 1:  50%|█████     | 515/1030 [00:35<00:35, 14.55it/s, loss=3.29, v_num=.]
Epoch 1:  60%|██████    | 618/1030 [00:42<00:28, 14.54it/s, loss=3.53, v_num=.]
Epoch 1:  70%|███████   | 721/1030 [00:49<00:21, 14.54it/s, loss=3.57, v_num=.]
Epoch 1:  80%|████████  | 824/1030 [00:56<00:14, 14.55it/s, loss=3.63, v_num=.]
Epoch 1:  90%|█████████ | 927/1030 [01:03<00:07, 14.56it/s, loss=3.85, v_num=.]
Epoch 1: 100%|██████████| 1030/1030 [01:04<00:00, 16.06it/s, loss=3.85, v_num=.]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/93 [00:00<?, ?it/s][A
[2m[36m(pid=1205)[0m 
Validating: 100%|████████

[2m[36m(pid=1205)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_Stage1_70527_00000:
  FID: 41.53125
  FID_cross: 73.5625
  auroc: 0.5105860829353333
  auroc_cross: 0.5025189518928528
  date: 2021-09-07_23-09-29
  done: false
  experiment_id: f4cb33f5595841f8bba99795ccc25185
  hostname: e89b1a816e20
  iterations_since_restore: 1
  loss_D: 0.0005351308500394225
  loss_G: 7.743613243103027
  node_ip: 172.28.0.2
  pid: 1205
  should_checkpoint: true
  time_since_restore: 83.0351984500885
  time_this_iter_s: 83.0351984500885
  time_total_s: 165.01756405830383
  timestamp: 1631056169
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: '70527_00000'
  


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00000,RUNNING,172.28.0.2:1205,0.0001,8,8,7.74361,0.000535131,41.5312,0.510586,73.5625,0.502519,2
train_Stage1_70527_00002,PAUSED,,0.0001,32,8,2.74002,0.417848,0.703613,0.510135,1.5791,0.424214,1
train_Stage1_70527_00003,PAUSED,,0.0001,64,8,6.27091,0.0615507,13.2812,0.409336,5.42969,0.42269,1
train_Stage1_70527_00001,PENDING,,0.0001,16,8,6.98386,0.00154955,30.7656,0.442338,40.9688,0.42358,1


[2m[36m(pid=1205)[0m 2021-09-07 23:09:29,919	INFO trainable.py:76 -- Checkpoint size is 91896386 bytes
[2m[36m(pid=1205)[0m 2021-09-07 23:09:30,230	INFO trainable.py:76 -- Checkpoint size is 91896386 bytes
2021-09-07 23:09:32,411	INFO trainable.py:76 -- Checkpoint size is 96244802 bytes


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00001,RUNNING,,0.0001,16,8,6.98386,0.00154955,30.7656,0.442338,40.9688,0.42358,1
train_Stage1_70527_00000,PAUSED,,0.0001,8,8,7.74361,0.000535131,41.5312,0.510586,73.5625,0.502519,2
train_Stage1_70527_00003,PAUSED,,0.0001,64,8,6.27091,0.0615507,13.2812,0.409336,5.42969,0.42269,1
train_Stage1_70527_00002,PENDING,,0.0001,32,8,2.74002,0.417848,0.703613,0.510135,1.5791,0.424214,1


[2m[36m(pid=1294)[0m 2021-09-07 23:09:34,867	INFO trainable.py:383 -- Restored on 172.28.0.2 from checkpoint: /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00001_1_n_fmaps=16_2021-09-07_23-00-47/checkpoint_tmp91a526/./
[2m[36m(pid=1294)[0m 2021-09-07 23:09:34,867	INFO trainable.py:390 -- Current state after restoring: {'_iteration': 1, '_timesteps_total': None, '_time_total': 89.65257692337036, '_episodes_total': None}
[2m[36m(pid=1294)[0m Using native 16bit precision.
[2m[36m(pid=1294)[0m GPU available: True, used: True
[2m[36m(pid=1294)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=1294)[0m IPU available: False, using: 0 IPUs
[2m[36m(pid=1294)[0m Restoring states from the checkpoint file at /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00001_1_n_fmaps=16_2021-09-07_23-00-47/checkpoint_tmp91a526/./checkpoint
[2m[36m(pid=1294)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00001,RUNNING,,0.0001,16,8,6.98386,0.00154955,30.7656,0.442338,40.9688,0.42358,1
train_Stage1_70527_00000,PAUSED,,0.0001,8,8,7.74361,0.000535131,41.5312,0.510586,73.5625,0.502519,2
train_Stage1_70527_00003,PAUSED,,0.0001,64,8,6.27091,0.0615507,13.2812,0.409336,5.42969,0.42269,1
train_Stage1_70527_00002,PENDING,,0.0001,32,8,2.74002,0.417848,0.703613,0.510135,1.5791,0.424214,1


[2m[36m(pid=1294)[0m Restored all states from the checkpoint file at /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00001_1_n_fmaps=16_2021-09-07_23-00-47/checkpoint_tmp91a526/./checkpoint
[2m[36m(pid=1294)[0m 
[2m[36m(pid=1294)[0m   | Name          | Type               | Params
[2m[36m(pid=1294)[0m -----------------------------------------------------
[2m[36m(pid=1294)[0m 0 | generator     | DCGANGenerator     | 377 K 
[2m[36m(pid=1294)[0m 1 | discriminator | DCGANDiscriminator | 174 K 
[2m[36m(pid=1294)[0m 2 | criterion     | BCEWithLogitsLoss  | 0     
[2m[36m(pid=1294)[0m 3 | modelF        | ResNet             | 11.2 M
[2m[36m(pid=1294)[0m 4 | lastF         | Sequential         | 1.5 K 
[2m[36m(pid=1294)[0m 5 | modelJ        | ResNet             | 11.2 M
[2m[36m(pid=1294)[0m 6 | lastJ         | Sequential         | 1.5 K 
[2m[36m(pid=1294)[0m 7 | val_metrics   | MetricCollection   | 22.3 M
[2m[36m(pid=1294)[0m ---------------

[2m[36m(pid=1294)[0m Validation sanity check: 0it [00:00, ?it/s]Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(pid=1294)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Epoch 1:   0%|          | 0/1030 [00:00<00:00, 3387.97it/s]  
Epoch 1:  10%|█         | 103/1030 [00:07<01:10, 13.07it/s, loss=3.59, v_num=.]
Epoch 1:  20%|██        | 206/1030 [00:15<01:02, 13.09it/s, loss=3.66, v_num=.]
Epoch 1:  30%|███       | 309/1030 [00:23<00:55, 13.09it/s, loss=3.86, v_num=.]
Epoch 1:  40%|████      | 412/1030 [00:31<00:47, 13.10it/s, loss=4.03, v_num=.]
Epoch 1:  50%|█████     | 515/1030 [00:39<00:39, 13.11it/s, loss=4.12, v_num=.]
Epoch 1:  60%|██████    | 618/1030 [00:47<00:31, 13.08it/s, loss=4.1, v_num=.] 
Epoch 1:  70%|███████   | 721/1030 [00:55<00:23, 13.05it/s, loss=4.09, v_num=.]
Epoch 1:  80%|████████  | 824/1030 [01:03<00:15, 13.03it/s, loss=4.28, v_num=.]
Epoch 1:  90%|█████████ | 927/1030 [01:11<00:07, 13.01it/s, loss=4.37, v_num=.]
Epoch 1: 100%|██████████| 1030/1030 [01:11<00:00, 14.36it/s, loss=4.37, v_num=.]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/93 [00:00<?, ?it/s][A
[2m[36m(pid=1294)[0m 
Validating: 100%|███████

[2m[36m(pid=1294)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_Stage1_70527_00001:
  FID: 32.9375
  FID_cross: 42.15625
  auroc: 0.4622346758842468
  auroc_cross: 0.46624955534935
  date: 2021-09-07_23-11-05
  done: false
  experiment_id: 2dfd70ad19c84d218b50ad9c615885cf
  hostname: e89b1a816e20
  iterations_since_restore: 1
  loss_D: 0.00020558886171784252
  loss_G: 8.990370750427246
  node_ip: 172.28.0.2
  pid: 1294
  should_checkpoint: true
  time_since_restore: 90.62591862678528
  time_this_iter_s: 90.62591862678528
  time_total_s: 180.27849555015564
  timestamp: 1631056265
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: '70527_00001'
  


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00001,RUNNING,172.28.0.2:1294,0.0001,16,8,8.99037,0.000205589,32.9375,0.462235,42.1562,0.46625,2
train_Stage1_70527_00000,PAUSED,,0.0001,8,8,7.74361,0.000535131,41.5312,0.510586,73.5625,0.502519,2
train_Stage1_70527_00003,PAUSED,,0.0001,64,8,6.27091,0.0615507,13.2812,0.409336,5.42969,0.42269,1
train_Stage1_70527_00002,PENDING,,0.0001,32,8,2.74002,0.417848,0.703613,0.510135,1.5791,0.424214,1


[2m[36m(pid=1294)[0m 2021-09-07 23:11:05,987	INFO trainable.py:76 -- Checkpoint size is 96244802 bytes
2021-09-07 23:11:06,530	INFO trainable.py:76 -- Checkpoint size is 111134338 bytes


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00002,RUNNING,,0.0001,32,8,2.74002,0.417848,0.703613,0.510135,1.5791,0.424214,1
train_Stage1_70527_00000,PAUSED,,0.0001,8,8,7.74361,0.000535131,41.5312,0.510586,73.5625,0.502519,2
train_Stage1_70527_00001,PAUSED,,0.0001,16,8,8.99037,0.000205589,32.9375,0.462235,42.1562,0.46625,2
train_Stage1_70527_00003,PENDING,,0.0001,64,8,6.27091,0.0615507,13.2812,0.409336,5.42969,0.42269,1


[2m[36m(pid=1376)[0m 2021-09-07 23:11:10,953	INFO trainable.py:383 -- Restored on 172.28.0.2 from checkpoint: /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00002_2_n_fmaps=32_2021-09-07_23-02-12/checkpoint_tmpc5718a/./
[2m[36m(pid=1376)[0m 2021-09-07 23:11:10,953	INFO trainable.py:390 -- Current state after restoring: {'_iteration': 1, '_timesteps_total': None, '_time_total': 106.1958703994751, '_episodes_total': None}
[2m[36m(pid=1376)[0m Using native 16bit precision.
[2m[36m(pid=1376)[0m GPU available: True, used: True
[2m[36m(pid=1376)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=1376)[0m IPU available: False, using: 0 IPUs
[2m[36m(pid=1376)[0m Restoring states from the checkpoint file at /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00002_2_n_fmaps=32_2021-09-07_23-02-12/checkpoint_tmpc5718a/./checkpoint
[2m[36m(pid=1376)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00002,RUNNING,,0.0001,32,8,2.74002,0.417848,0.703613,0.510135,1.5791,0.424214,1
train_Stage1_70527_00000,PAUSED,,0.0001,8,8,7.74361,0.000535131,41.5312,0.510586,73.5625,0.502519,2
train_Stage1_70527_00001,PAUSED,,0.0001,16,8,8.99037,0.000205589,32.9375,0.462235,42.1562,0.46625,2
train_Stage1_70527_00003,PENDING,,0.0001,64,8,6.27091,0.0615507,13.2812,0.409336,5.42969,0.42269,1


[2m[36m(pid=1376)[0m Restored all states from the checkpoint file at /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00002_2_n_fmaps=32_2021-09-07_23-02-12/checkpoint_tmpc5718a/./checkpoint
[2m[36m(pid=1376)[0m 
[2m[36m(pid=1376)[0m   | Name          | Type               | Params
[2m[36m(pid=1376)[0m -----------------------------------------------------
[2m[36m(pid=1376)[0m 0 | generator     | DCGANGenerator     | 1.1 M 
[2m[36m(pid=1376)[0m 1 | discriminator | DCGANDiscriminator | 693 K 
[2m[36m(pid=1376)[0m 2 | criterion     | BCEWithLogitsLoss  | 0     
[2m[36m(pid=1376)[0m 3 | modelF        | ResNet             | 11.2 M
[2m[36m(pid=1376)[0m 4 | lastF         | Sequential         | 1.5 K 
[2m[36m(pid=1376)[0m 5 | modelJ        | ResNet             | 11.2 M
[2m[36m(pid=1376)[0m 6 | lastJ         | Sequential         | 1.5 K 
[2m[36m(pid=1376)[0m 7 | val_metrics   | MetricCollection   | 22.3 M
[2m[36m(pid=1376)[0m ---------------

[2m[36m(pid=1376)[0m Validation sanity check: 0it [00:00, ?it/s]Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(pid=1376)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
[2m[36m(pid=1376)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Epoch 1:   0%|          | 0/1030 [00:00<00:00, 3802.63it/s]  
Epoch 1:  10%|█         | 103/1030 [00:09<01:28, 10.48it/s, loss=1.95, v_num=.]
Epoch 1:  20%|██        | 206/1030 [00:19<01:17, 10.59it/s, loss=1.38, v_num=.]
Epoch 1:  30%|███       | 309/1030 [00:29<01:07, 10.63it/s, loss=1.5, v_num=.] 
Epoch 1:  40%|████      | 412/1030 [00:38<00:58, 10.62it/s, loss=1.52, v_num=.]
Epoch 1:  50%|█████     | 515/1030 [00:48<00:48, 10.58it/s, loss=1.56, v_num=.]
Epoch 1:  60%|██████    | 618/1030 [00:58<00:39, 10.54it/s, loss=1.4, v_num=.] 
Epoch 1:  70%|███████   | 721/1030 [01:08<00:29, 10.52it/s, loss=1.3, v_num=.]
Epoch 1:  80%|████████  | 824/1030 [01:18<00:19, 10.49it/s, loss=1.56, v_num=.]
Epoch 1:  90%|█████████ | 927/1030 [01:28<00:09, 10.48it/s, loss=1.56, v_num=.]
Epoch 1: 100%|██████████| 1030/1030 [01:29<00:00, 11.57it/s, loss=1.56, v_num=.]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/93 [00:00<?, ?it/s][A
[2m[36m(pid=1376)[0m 
Validating: 100%|████████

[2m[36m(pid=1376)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_Stage1_70527_00002:
  FID: 10.59375
  FID_cross: 16.515625
  auroc: 0.45046722888946533
  auroc_cross: 0.45313936471939087
  date: 2021-09-07_23-12-58
  done: false
  experiment_id: 92ef339ced7541c68ae490a4eb574118
  hostname: e89b1a816e20
  iterations_since_restore: 1
  loss_D: 0.3209291994571686
  loss_G: 2.821678638458252
  node_ip: 172.28.0.2
  pid: 1376
  should_checkpoint: true
  time_since_restore: 107.50416731834412
  time_this_iter_s: 107.50416731834412
  time_total_s: 213.7000377178192
  timestamp: 1631056378
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: '70527_00002'
  


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00002,RUNNING,172.28.0.2:1376,0.0001,32,8,2.82168,0.320929,10.5938,0.450467,16.5156,0.453139,2
train_Stage1_70527_00000,PAUSED,,0.0001,8,8,7.74361,0.000535131,41.5312,0.510586,73.5625,0.502519,2
train_Stage1_70527_00001,PAUSED,,0.0001,16,8,8.99037,0.000205589,32.9375,0.462235,42.1562,0.46625,2
train_Stage1_70527_00003,PENDING,,0.0001,64,8,6.27091,0.0615507,13.2812,0.409336,5.42969,0.42269,1


[2m[36m(pid=1376)[0m 2021-09-07 23:12:58,998	INFO trainable.py:76 -- Checkpoint size is 111134338 bytes
2021-09-07 23:12:59,843	INFO trainable.py:76 -- Checkpoint size is 165685954 bytes


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00003,RUNNING,,0.0001,64,8,6.27091,0.0615507,13.2812,0.409336,5.42969,0.42269,1
train_Stage1_70527_00001,PAUSED,,0.0001,16,8,8.99037,0.000205589,32.9375,0.462235,42.1562,0.46625,2
train_Stage1_70527_00002,PAUSED,,0.0001,32,8,2.82168,0.320929,10.5938,0.450467,16.5156,0.453139,2
train_Stage1_70527_00000,PENDING,,0.0001,8,8,7.74361,0.000535131,41.5312,0.510586,73.5625,0.502519,2


[2m[36m(pid=1461)[0m 2021-09-07 23:13:04,166	INFO trainable.py:383 -- Restored on 172.28.0.2 from checkpoint: /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00003_3_n_fmaps=64_2021-09-07_23-03-45/checkpoint_tmp70f321/./
[2m[36m(pid=1461)[0m 2021-09-07 23:13:04,166	INFO trainable.py:390 -- Current state after restoring: {'_iteration': 1, '_timesteps_total': None, '_time_total': 142.04967093467712, '_episodes_total': None}
[2m[36m(pid=1461)[0m Using native 16bit precision.
[2m[36m(pid=1461)[0m GPU available: True, used: True
[2m[36m(pid=1461)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=1461)[0m IPU available: False, using: 0 IPUs
[2m[36m(pid=1461)[0m Restoring states from the checkpoint file at /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00003_3_n_fmaps=64_2021-09-07_23-03-45/checkpoint_tmp70f321/./checkpoint
[2m[36m(pid=1461)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00003,RUNNING,,0.0001,64,8,6.27091,0.0615507,13.2812,0.409336,5.42969,0.42269,1
train_Stage1_70527_00001,PAUSED,,0.0001,16,8,8.99037,0.000205589,32.9375,0.462235,42.1562,0.46625,2
train_Stage1_70527_00002,PAUSED,,0.0001,32,8,2.82168,0.320929,10.5938,0.450467,16.5156,0.453139,2
train_Stage1_70527_00000,PENDING,,0.0001,8,8,7.74361,0.000535131,41.5312,0.510586,73.5625,0.502519,2


[2m[36m(pid=1461)[0m Restored all states from the checkpoint file at /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00003_3_n_fmaps=64_2021-09-07_23-03-45/checkpoint_tmp70f321/./checkpoint
[2m[36m(pid=1461)[0m 
[2m[36m(pid=1461)[0m   | Name          | Type               | Params
[2m[36m(pid=1461)[0m -----------------------------------------------------
[2m[36m(pid=1461)[0m 0 | generator     | DCGANGenerator     | 3.6 M 
[2m[36m(pid=1461)[0m 1 | discriminator | DCGANDiscriminator | 2.8 M 
[2m[36m(pid=1461)[0m 2 | criterion     | BCEWithLogitsLoss  | 0     
[2m[36m(pid=1461)[0m 3 | modelF        | ResNet             | 11.2 M
[2m[36m(pid=1461)[0m 4 | lastF         | Sequential         | 1.5 K 
[2m[36m(pid=1461)[0m 5 | modelJ        | ResNet             | 11.2 M
[2m[36m(pid=1461)[0m 6 | lastJ         | Sequential         | 1.5 K 
[2m[36m(pid=1461)[0m 7 | val_metrics   | MetricCollection   | 22.3 M
[2m[36m(pid=1461)[0m ---------------

[2m[36m(pid=1461)[0m Validation sanity check: 0it [00:00, ?it/s]Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(pid=1461)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
[2m[36m(pid=1461)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


[2m[36m(pid=1461)[0m                                                               
Epoch 1:   0%|          | 0/1030 [00:00<00:00, 4288.65it/s]  
Epoch 1:  10%|█         | 103/1030 [00:14<02:05,  7.36it/s, loss=1.97, v_num=.]
Epoch 1:  20%|██        | 206/1030 [00:27<01:50,  7.45it/s, loss=2.04, v_num=.]
Epoch 1:  30%|███       | 309/1030 [00:41<01:36,  7.48it/s, loss=1.78, v_num=.]
Epoch 1:  40%|████      | 412/1030 [00:55<01:22,  7.49it/s, loss=2.16, v_num=.]
Epoch 1:  50%|█████     | 515/1030 [01:08<01:08,  7.49it/s, loss=1.68, v_num=.]
Epoch 1:  60%|██████    | 618/1030 [01:22<00:54,  7.50it/s, loss=1.8, v_num=.] 
Epoch 1:  70%|███████   | 721/1030 [01:36<00:41,  7.49it/s, loss=1.91, v_num=.]
Epoch 1:  80%|████████  | 824/1030 [01:50<00:27,  7.49it/s, loss=1.95, v_num=.]
Epoch 1:  90%|█████████ | 927/1030 [02:03<00:13,  7.49it/s, loss=1.95, v_num=.]
Epoch 1: 100%|██████████| 1030/1030 [02:04<00:00,  8.28it/s, loss=1.95, v_num=.]
Validating: 0it [00:00, ?it/s][A
Validating:   

[2m[36m(pid=1461)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_Stage1_70527_00003:
  FID: 5.00390625
  FID_cross: 7.92578125
  auroc: 0.47338634729385376
  auroc_cross: 0.47230350971221924
  date: 2021-09-07_23-15-28
  done: false
  experiment_id: 9b670fdf81a34eecb2b1085a111512ff
  hostname: e89b1a816e20
  iterations_since_restore: 1
  loss_D: 0.4107634723186493
  loss_G: 3.6034457683563232
  node_ip: 172.28.0.2
  pid: 1461
  should_checkpoint: true
  time_since_restore: 143.85066890716553
  time_this_iter_s: 143.85066890716553
  time_total_s: 285.90033984184265
  timestamp: 1631056528
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: '70527_00003'
  


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00003,RUNNING,172.28.0.2:1461,0.0001,64,8,3.60345,0.410763,5.00391,0.473386,7.92578,0.472304,2
train_Stage1_70527_00001,PAUSED,,0.0001,16,8,8.99037,0.000205589,32.9375,0.462235,42.1562,0.46625,2
train_Stage1_70527_00002,PAUSED,,0.0001,32,8,2.82168,0.320929,10.5938,0.450467,16.5156,0.453139,2
train_Stage1_70527_00000,PENDING,,0.0001,8,8,7.74361,0.000535131,41.5312,0.510586,73.5625,0.502519,2


[2m[36m(pid=1461)[0m 2021-09-07 23:15:29,919	INFO trainable.py:76 -- Checkpoint size is 165685954 bytes
2021-09-07 23:15:31,083	INFO trainable.py:76 -- Checkpoint size is 91896386 bytes


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00000,RUNNING,,0.0001,8,8,7.74361,0.000535131,41.5312,0.510586,73.5625,0.502519,2
train_Stage1_70527_00002,PAUSED,,0.0001,32,8,2.82168,0.320929,10.5938,0.450467,16.5156,0.453139,2
train_Stage1_70527_00003,PAUSED,,0.0001,64,8,3.60345,0.410763,5.00391,0.473386,7.92578,0.472304,2
train_Stage1_70527_00001,PENDING,,0.0001,16,8,8.99037,0.000205589,32.9375,0.462235,42.1562,0.46625,2


[2m[36m(pid=1542)[0m 2021-09-07 23:15:34,469	INFO trainable.py:383 -- Restored on 172.28.0.2 from checkpoint: /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00000_0_n_fmaps=8_2021-09-07_23-00-47/checkpoint_tmp59431f/./
[2m[36m(pid=1542)[0m 2021-09-07 23:15:34,469	INFO trainable.py:390 -- Current state after restoring: {'_iteration': 2, '_timesteps_total': None, '_time_total': 165.01756405830383, '_episodes_total': None}
[2m[36m(pid=1542)[0m Using native 16bit precision.
[2m[36m(pid=1542)[0m GPU available: True, used: True
[2m[36m(pid=1542)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=1542)[0m IPU available: False, using: 0 IPUs
[2m[36m(pid=1542)[0m Restoring states from the checkpoint file at /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00000_0_n_fmaps=8_2021-09-07_23-00-47/checkpoint_tmp59431f/./checkpoint
[2m[36m(pid=1542)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00000,RUNNING,,0.0001,8,8,7.74361,0.000535131,41.5312,0.510586,73.5625,0.502519,2
train_Stage1_70527_00002,PAUSED,,0.0001,32,8,2.82168,0.320929,10.5938,0.450467,16.5156,0.453139,2
train_Stage1_70527_00003,PAUSED,,0.0001,64,8,3.60345,0.410763,5.00391,0.473386,7.92578,0.472304,2
train_Stage1_70527_00001,PENDING,,0.0001,16,8,8.99037,0.000205589,32.9375,0.462235,42.1562,0.46625,2


[2m[36m(pid=1542)[0m Restored all states from the checkpoint file at /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00000_0_n_fmaps=8_2021-09-07_23-00-47/checkpoint_tmp59431f/./checkpoint
[2m[36m(pid=1542)[0m 
[2m[36m(pid=1542)[0m   | Name          | Type               | Params
[2m[36m(pid=1542)[0m -----------------------------------------------------
[2m[36m(pid=1542)[0m 0 | generator     | DCGANGenerator     | 146 K 
[2m[36m(pid=1542)[0m 1 | discriminator | DCGANDiscriminator | 44.4 K
[2m[36m(pid=1542)[0m 2 | criterion     | BCEWithLogitsLoss  | 0     
[2m[36m(pid=1542)[0m 3 | modelF        | ResNet             | 11.2 M
[2m[36m(pid=1542)[0m 4 | lastF         | Sequential         | 1.5 K 
[2m[36m(pid=1542)[0m 5 | modelJ        | ResNet             | 11.2 M
[2m[36m(pid=1542)[0m 6 | lastJ         | Sequential         | 1.5 K 
[2m[36m(pid=1542)[0m 7 | val_metrics   | MetricCollection   | 22.3 M
[2m[36m(pid=1542)[0m ----------------

[2m[36m(pid=1542)[0m Validation sanity check: 0it [00:00, ?it/s]Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(pid=1542)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
[2m[36m(pid=1542)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Epoch 2:   0%|          | 0/1030 [00:00<00:00, 4373.62it/s]  
Epoch 2:  10%|█         | 103/1030 [00:10<01:31, 10.12it/s, loss=3.92, v_num=.]
Epoch 2:  20%|██        | 206/1030 [00:17<01:09, 11.89it/s, loss=3.85, v_num=.]
Epoch 2:  30%|███       | 309/1030 [00:24<00:57, 12.54it/s, loss=3.94, v_num=.]
Epoch 2:  40%|████      | 412/1030 [00:32<00:47, 12.90it/s, loss=4.02, v_num=.]
Epoch 2:  50%|█████     | 515/1030 [00:39<00:39, 13.17it/s, loss=4.18, v_num=.]
Epoch 2:  60%|██████    | 618/1030 [00:46<00:30, 13.35it/s, loss=4.23, v_num=.]
Epoch 2:  70%|███████   | 721/1030 [00:53<00:22, 13.49it/s, loss=4.04, v_num=.]
Epoch 2:  80%|████████  | 824/1030 [01:00<00:15, 13.57it/s, loss=4.17, v_num=.]
Epoch 2:  90%|█████████ | 927/1030 [01:08<00:07, 13.65it/s, loss=4.24, v_num=.]
Epoch 2: 100%|██████████| 1030/1030 [01:08<00:00, 15.06it/s, loss=4.24, v_num=.]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/93 [00:00<?, ?it/s][A
[2m[36m(pid=1542)[0m 
Validating: 100%|███████

[2m[36m(pid=1542)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_Stage1_70527_00000:
  FID: 39.0
  FID_cross: 55.5625
  auroc: 0.4847951829433441
  auroc_cross: 0.46438685059547424
  date: 2021-09-07_23-17-01
  done: false
  experiment_id: f4cb33f5595841f8bba99795ccc25185
  hostname: e89b1a816e20
  iterations_since_restore: 1
  loss_D: 0.00027429067995399237
  loss_G: 8.61932373046875
  node_ip: 172.28.0.2
  pid: 1542
  should_checkpoint: true
  time_since_restore: 87.4700243473053
  time_this_iter_s: 87.4700243473053
  time_total_s: 252.48758840560913
  timestamp: 1631056621
  timesteps_since_restore: 0
  training_iteration: 3
  trial_id: '70527_00000'
  


[2m[36m(pid=1542)[0m 2021-09-07 23:17:02,231	INFO trainable.py:76 -- Checkpoint size is 91896386 bytes


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00000,RUNNING,172.28.0.2:1542,0.0001,8,8,8.61932,0.000274291,39.0,0.484795,55.5625,0.464387,3
train_Stage1_70527_00002,PAUSED,,0.0001,32,8,2.82168,0.320929,10.5938,0.450467,16.5156,0.453139,2
train_Stage1_70527_00003,PAUSED,,0.0001,64,8,3.60345,0.410763,5.00391,0.473386,7.92578,0.472304,2
train_Stage1_70527_00001,PENDING,,0.0001,16,8,8.99037,0.000205589,32.9375,0.462235,42.1562,0.46625,2


[2m[36m(pid=1542)[0m 2021-09-07 23:17:02,799	INFO trainable.py:76 -- Checkpoint size is 91896386 bytes
2021-09-07 23:17:03,484	INFO trainable.py:76 -- Checkpoint size is 96244802 bytes
[2m[36m(pid=1631)[0m 2021-09-07 23:17:06,917	INFO trainable.py:383 -- Restored on 172.28.0.2 from checkpoint: /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00001_1_n_fmaps=16_2021-09-07_23-00-47/checkpoint_tmpf3e6f6/./
[2m[36m(pid=1631)[0m 2021-09-07 23:17:06,917	INFO trainable.py:390 -- Current state after restoring: {'_iteration': 2, '_timesteps_total': None, '_time_total': 180.27849555015564, '_episodes_total': None}


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00001,RUNNING,,0.0001,16,8,8.99037,0.000205589,32.9375,0.462235,42.1562,0.46625,2
train_Stage1_70527_00000,PAUSED,,0.0001,8,8,8.61932,0.000274291,39.0,0.484795,55.5625,0.464387,3
train_Stage1_70527_00003,PAUSED,,0.0001,64,8,3.60345,0.410763,5.00391,0.473386,7.92578,0.472304,2
train_Stage1_70527_00002,PENDING,,0.0001,32,8,2.82168,0.320929,10.5938,0.450467,16.5156,0.453139,2


[2m[36m(pid=1631)[0m Using native 16bit precision.
[2m[36m(pid=1631)[0m GPU available: True, used: True
[2m[36m(pid=1631)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=1631)[0m IPU available: False, using: 0 IPUs
[2m[36m(pid=1631)[0m Restoring states from the checkpoint file at /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00001_1_n_fmaps=16_2021-09-07_23-00-47/checkpoint_tmpf3e6f6/./checkpoint
[2m[36m(pid=1631)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00001,RUNNING,,0.0001,16,8,8.99037,0.000205589,32.9375,0.462235,42.1562,0.46625,2
train_Stage1_70527_00000,PAUSED,,0.0001,8,8,8.61932,0.000274291,39.0,0.484795,55.5625,0.464387,3
train_Stage1_70527_00003,PAUSED,,0.0001,64,8,3.60345,0.410763,5.00391,0.473386,7.92578,0.472304,2
train_Stage1_70527_00002,PENDING,,0.0001,32,8,2.82168,0.320929,10.5938,0.450467,16.5156,0.453139,2


[2m[36m(pid=1631)[0m Restored all states from the checkpoint file at /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00001_1_n_fmaps=16_2021-09-07_23-00-47/checkpoint_tmpf3e6f6/./checkpoint
[2m[36m(pid=1631)[0m 
[2m[36m(pid=1631)[0m   | Name          | Type               | Params
[2m[36m(pid=1631)[0m -----------------------------------------------------
[2m[36m(pid=1631)[0m 0 | generator     | DCGANGenerator     | 377 K 
[2m[36m(pid=1631)[0m 1 | discriminator | DCGANDiscriminator | 174 K 
[2m[36m(pid=1631)[0m 2 | criterion     | BCEWithLogitsLoss  | 0     
[2m[36m(pid=1631)[0m 3 | modelF        | ResNet             | 11.2 M
[2m[36m(pid=1631)[0m 4 | lastF         | Sequential         | 1.5 K 
[2m[36m(pid=1631)[0m 5 | modelJ        | ResNet             | 11.2 M
[2m[36m(pid=1631)[0m 6 | lastJ         | Sequential         | 1.5 K 
[2m[36m(pid=1631)[0m 7 | val_metrics   | MetricCollection   | 22.3 M
[2m[36m(pid=1631)[0m ---------------

[2m[36m(pid=1631)[0m Validation sanity check: 0it [00:00, ?it/s]Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(pid=1631)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
[2m[36m(pid=1631)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Epoch 2:   0%|          | 0/1030 [00:00<00:00, 4169.29it/s]  
Epoch 2:  10%|█         | 103/1030 [00:12<01:47,  8.63it/s, loss=4.34, v_num=.]
Epoch 2:  20%|██        | 206/1030 [00:19<01:19, 10.35it/s, loss=4.5, v_num=.] 
Epoch 2:  30%|███       | 309/1030 [00:27<01:04, 11.12it/s, loss=4.49, v_num=.]
Epoch 2:  40%|████      | 412/1030 [00:35<00:53, 11.51it/s, loss=4.5, v_num=.] 
Epoch 2:  50%|█████     | 515/1030 [00:43<00:43, 11.74it/s, loss=4.42, v_num=.]
Epoch 2:  60%|██████    | 618/1030 [00:52<00:34, 11.89it/s, loss=4.56, v_num=.]
Epoch 2:  70%|███████   | 721/1030 [01:00<00:25, 12.01it/s, loss=4.58, v_num=.]
Epoch 2:  80%|████████  | 824/1030 [01:08<00:17, 12.09it/s, loss=4.64, v_num=.]
Epoch 2:  90%|█████████ | 927/1030 [01:16<00:08, 12.12it/s, loss=4.81, v_num=.]
Epoch 2: 100%|██████████| 1030/1030 [01:17<00:00, 13.38it/s, loss=4.81, v_num=.]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/93 [00:00<?, ?it/s][A
[2m[36m(pid=1631)[0m 
Validating: 100%|███████

[2m[36m(pid=1631)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


[2m[36m(pid=1631)[0m Epoch 2: 100%|██████████| 1030/1030 [01:27<00:00, 11.74it/s, loss=4.81, v_num=.]
Result for train_Stage1_70527_00001:
  FID: 470.75
  FID_cross: 557.5
  auroc: 0.49846595525741577
  auroc_cross: 0.5
  date: 2021-09-07_23-18-43
  done: false
  experiment_id: 2dfd70ad19c84d218b50ad9c615885cf
  hostname: e89b1a816e20
  iterations_since_restore: 1
  loss_D: 8.804814569884911e-05
  loss_G: 9.50886344909668
  node_ip: 172.28.0.2
  pid: 1631
  should_checkpoint: true
  time_since_restore: 96.24924945831299
  time_this_iter_s: 96.24924945831299
  time_total_s: 276.5277450084686
  timestamp: 1631056723
  timesteps_since_restore: 0
  training_iteration: 3
  trial_id: '70527_00001'
  


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00001,RUNNING,172.28.0.2:1631,0.0001,16,8,9.50886,8.80481e-05,470.75,0.498466,557.5,0.5,3
train_Stage1_70527_00000,PAUSED,,0.0001,8,8,8.61932,0.000274291,39.0,0.484795,55.5625,0.464387,3
train_Stage1_70527_00003,PAUSED,,0.0001,64,8,3.60345,0.410763,5.00391,0.473386,7.92578,0.472304,2
train_Stage1_70527_00002,PENDING,,0.0001,32,8,2.82168,0.320929,10.5938,0.450467,16.5156,0.453139,2


[2m[36m(pid=1631)[0m 2021-09-07 23:18:43,540	INFO trainable.py:76 -- Checkpoint size is 96244802 bytes
[2m[36m(pid=1631)[0m 2021-09-07 23:18:43,903	INFO trainable.py:76 -- Checkpoint size is 96244802 bytes
2021-09-07 23:18:44,506	INFO trainable.py:76 -- Checkpoint size is 111134338 bytes


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00002,RUNNING,,0.0001,32,8,2.82168,0.320929,10.5938,0.450467,16.5156,0.453139,2
train_Stage1_70527_00000,PAUSED,,0.0001,8,8,8.61932,0.000274291,39.0,0.484795,55.5625,0.464387,3
train_Stage1_70527_00001,PAUSED,,0.0001,16,8,9.50886,8.80481e-05,470.75,0.498466,557.5,0.5,3
train_Stage1_70527_00003,PENDING,,0.0001,64,8,3.60345,0.410763,5.00391,0.473386,7.92578,0.472304,2


[2m[36m(pid=1716)[0m 2021-09-07 23:18:48,590	INFO trainable.py:383 -- Restored on 172.28.0.2 from checkpoint: /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00002_2_n_fmaps=32_2021-09-07_23-02-12/checkpoint_tmpcc3cc8/./
[2m[36m(pid=1716)[0m 2021-09-07 23:18:48,590	INFO trainable.py:390 -- Current state after restoring: {'_iteration': 2, '_timesteps_total': None, '_time_total': 213.7000377178192, '_episodes_total': None}
[2m[36m(pid=1716)[0m Using native 16bit precision.
[2m[36m(pid=1716)[0m GPU available: True, used: True
[2m[36m(pid=1716)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=1716)[0m IPU available: False, using: 0 IPUs
[2m[36m(pid=1716)[0m Restoring states from the checkpoint file at /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00002_2_n_fmaps=32_2021-09-07_23-02-12/checkpoint_tmpcc3cc8/./checkpoint
[2m[36m(pid=1716)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00002,RUNNING,,0.0001,32,8,2.82168,0.320929,10.5938,0.450467,16.5156,0.453139,2
train_Stage1_70527_00000,PAUSED,,0.0001,8,8,8.61932,0.000274291,39.0,0.484795,55.5625,0.464387,3
train_Stage1_70527_00001,PAUSED,,0.0001,16,8,9.50886,8.80481e-05,470.75,0.498466,557.5,0.5,3
train_Stage1_70527_00003,PENDING,,0.0001,64,8,3.60345,0.410763,5.00391,0.473386,7.92578,0.472304,2


[2m[36m(pid=1716)[0m Restored all states from the checkpoint file at /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00002_2_n_fmaps=32_2021-09-07_23-02-12/checkpoint_tmpcc3cc8/./checkpoint
[2m[36m(pid=1716)[0m 
[2m[36m(pid=1716)[0m   | Name          | Type               | Params
[2m[36m(pid=1716)[0m -----------------------------------------------------
[2m[36m(pid=1716)[0m 0 | generator     | DCGANGenerator     | 1.1 M 
[2m[36m(pid=1716)[0m 1 | discriminator | DCGANDiscriminator | 693 K 
[2m[36m(pid=1716)[0m 2 | criterion     | BCEWithLogitsLoss  | 0     
[2m[36m(pid=1716)[0m 3 | modelF        | ResNet             | 11.2 M
[2m[36m(pid=1716)[0m 4 | lastF         | Sequential         | 1.5 K 
[2m[36m(pid=1716)[0m 5 | modelJ        | ResNet             | 11.2 M
[2m[36m(pid=1716)[0m 6 | lastJ         | Sequential         | 1.5 K 
[2m[36m(pid=1716)[0m 7 | val_metrics   | MetricCollection   | 22.3 M
[2m[36m(pid=1716)[0m ---------------

[2m[36m(pid=1716)[0m Validation sanity check: 0it [00:00, ?it/s]Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(pid=1716)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
[2m[36m(pid=1716)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Epoch 2:   0%|          | 0/1030 [00:00<00:00, 2949.58it/s]  
Epoch 2:  10%|█         | 103/1030 [00:15<02:20,  6.61it/s, loss=1.29, v_num=.]
Epoch 2:  20%|██        | 206/1030 [00:25<01:40,  8.19it/s, loss=1.16, v_num=.]
Epoch 2:  30%|███       | 309/1030 [00:34<01:21,  8.87it/s, loss=1.36, v_num=.]
Epoch 2:  40%|████      | 412/1030 [00:45<01:07,  9.16it/s, loss=1.6, v_num=.] 
Epoch 2:  50%|█████     | 515/1030 [00:54<00:54,  9.38it/s, loss=1.81, v_num=.]
Epoch 2:  60%|██████    | 618/1030 [01:04<00:43,  9.53it/s, loss=1.19, v_num=.]
Epoch 2:  70%|███████   | 721/1030 [01:14<00:32,  9.63it/s, loss=1.19, v_num=.]
Epoch 2:  70%|███████   | 721/1030 [01:14<00:32,  9.63it/s, loss=1.53, v_num=.]
Epoch 2:  80%|████████  | 824/1030 [01:24<00:21,  9.71it/s, loss=1.17, v_num=.]
Epoch 2:  90%|█████████ | 927/1030 [01:34<00:10,  9.77it/s, loss=1.54, v_num=.]
Epoch 2: 100%|██████████| 1030/1030 [01:35<00:00, 10.80it/s, loss=1.54, v_num=.]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|      

[2m[36m(pid=1716)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."
2021-09-07 23:20:43,076	INFO pbt.py:543 -- [exploit] transferring weights from trial train_Stage1_70527_00001 (score 0.49846595525741577) -> train_Stage1_70527_00002 (score 0.4679776728153229)
2021-09-07 23:20:43,083	INFO pbt.py:558 -- [explore] perturbed config from {'lr': 0.0001, 'bs': 8} -> {'lr': 0.0007956865814995524, 'bs': 128}


Result for train_Stage1_70527_00002:
  FID: 3.328125
  FID_cross: 6.390625
  auroc: 0.4679776728153229
  auroc_cross: 0.46242690086364746
  date: 2021-09-07_23-20-43
  done: false
  experiment_id: 92ef339ced7541c68ae490a4eb574118
  hostname: e89b1a816e20
  iterations_since_restore: 1
  loss_D: 1.017324447631836
  loss_G: 1.1361310482025146
  node_ip: 172.28.0.2
  pid: 1716
  should_checkpoint: true
  time_since_restore: 114.46495294570923
  time_this_iter_s: 114.46495294570923
  time_total_s: 328.16499066352844
  timestamp: 1631056843
  timesteps_since_restore: 0
  training_iteration: 3
  trial_id: '70527_00002'
  


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00002,RUNNING,172.28.0.2:1716,0.000795687,16,128,1.13613,1.01732,3.32812,0.467978,6.39062,0.462427,3
train_Stage1_70527_00000,PAUSED,,0.0001,8,8,8.61932,0.000274291,39.0,0.484795,55.5625,0.464387,3
train_Stage1_70527_00001,PAUSED,,0.0001,16,8,9.50886,8.80481e-05,470.75,0.498466,557.5,0.5,3
train_Stage1_70527_00003,PENDING,,0.0001,64,8,3.60345,0.410763,5.00391,0.473386,7.92578,0.472304,2


[2m[36m(pid=1716)[0m 2021-09-07 23:20:44,060	INFO trainable.py:383 -- Restored on 172.28.0.2 from checkpoint: /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00002_2_n_fmaps=32_2021-09-07_23-02-12/checkpoint_tmp563492/./
[2m[36m(pid=1716)[0m 2021-09-07 23:20:44,060	INFO trainable.py:390 -- Current state after restoring: {'_iteration': 3, '_timesteps_total': None, '_time_total': 276.5277450084686, '_episodes_total': None}
[2m[36m(pid=1716)[0m 2021-09-07 23:20:45,184	INFO trainable.py:76 -- Checkpoint size is 96244802 bytes
2021-09-07 23:20:46,035	INFO trainable.py:76 -- Checkpoint size is 165685954 bytes


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00003,RUNNING,,0.0001,64,8,3.60345,0.410763,5.00391,0.473386,7.92578,0.472304,2
train_Stage1_70527_00001,PAUSED,,0.0001,16,8,9.50886,8.80481e-05,470.75,0.498466,557.5,0.5,3
train_Stage1_70527_00002,PAUSED,,0.000795687,16,128,1.13613,1.01732,3.32812,0.467978,6.39062,0.462427,3
train_Stage1_70527_00000,PENDING,,0.0001,8,8,8.61932,0.000274291,39.0,0.484795,55.5625,0.464387,3


[2m[36m(pid=1795)[0m 2021-09-07 23:20:50,329	INFO trainable.py:383 -- Restored on 172.28.0.2 from checkpoint: /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00003_3_n_fmaps=64_2021-09-07_23-03-45/checkpoint_tmpdb8ea7/./
[2m[36m(pid=1795)[0m 2021-09-07 23:20:50,330	INFO trainable.py:390 -- Current state after restoring: {'_iteration': 2, '_timesteps_total': None, '_time_total': 285.90033984184265, '_episodes_total': None}
[2m[36m(pid=1795)[0m Using native 16bit precision.
[2m[36m(pid=1795)[0m GPU available: True, used: True
[2m[36m(pid=1795)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=1795)[0m IPU available: False, using: 0 IPUs
[2m[36m(pid=1795)[0m Restoring states from the checkpoint file at /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00003_3_n_fmaps=64_2021-09-07_23-03-45/checkpoint_tmpdb8ea7/./checkpoint
[2m[36m(pid=1795)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00003,RUNNING,,0.0001,64,8,3.60345,0.410763,5.00391,0.473386,7.92578,0.472304,2
train_Stage1_70527_00001,PAUSED,,0.0001,16,8,9.50886,8.80481e-05,470.75,0.498466,557.5,0.5,3
train_Stage1_70527_00002,PAUSED,,0.000795687,16,128,1.13613,1.01732,3.32812,0.467978,6.39062,0.462427,3
train_Stage1_70527_00000,PENDING,,0.0001,8,8,8.61932,0.000274291,39.0,0.484795,55.5625,0.464387,3


[2m[36m(pid=1795)[0m Restored all states from the checkpoint file at /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00003_3_n_fmaps=64_2021-09-07_23-03-45/checkpoint_tmpdb8ea7/./checkpoint
[2m[36m(pid=1795)[0m 
[2m[36m(pid=1795)[0m   | Name          | Type               | Params
[2m[36m(pid=1795)[0m -----------------------------------------------------
[2m[36m(pid=1795)[0m 0 | generator     | DCGANGenerator     | 3.6 M 
[2m[36m(pid=1795)[0m 1 | discriminator | DCGANDiscriminator | 2.8 M 
[2m[36m(pid=1795)[0m 2 | criterion     | BCEWithLogitsLoss  | 0     
[2m[36m(pid=1795)[0m 3 | modelF        | ResNet             | 11.2 M
[2m[36m(pid=1795)[0m 4 | lastF         | Sequential         | 1.5 K 
[2m[36m(pid=1795)[0m 5 | modelJ        | ResNet             | 11.2 M
[2m[36m(pid=1795)[0m 6 | lastJ         | Sequential         | 1.5 K 
[2m[36m(pid=1795)[0m 7 | val_metrics   | MetricCollection   | 22.3 M
[2m[36m(pid=1795)[0m ---------------

[2m[36m(pid=1795)[0m Validation sanity check: 0it [00:00, ?it/s]Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(pid=1795)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
[2m[36m(pid=1795)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Epoch 2:   0%|          | 0/1030 [00:00<00:00, 4728.64it/s]  
Epoch 2:  10%|█         | 103/1030 [00:15<02:14,  6.88it/s, loss=1.5, v_num=.]
Epoch 2:  20%|██        | 206/1030 [00:28<01:54,  7.18it/s, loss=1.75, v_num=.]
Epoch 2:  30%|███       | 309/1030 [00:42<01:38,  7.31it/s, loss=1.87, v_num=.]
Epoch 2:  40%|████      | 412/1030 [00:56<01:23,  7.36it/s, loss=1.64, v_num=.]
Epoch 2:  50%|█████     | 515/1030 [01:09<01:09,  7.40it/s, loss=1.27, v_num=.]
Epoch 2:  60%|██████    | 618/1030 [01:23<00:55,  7.43it/s, loss=1.85, v_num=.]
Epoch 2:  70%|███████   | 721/1030 [01:37<00:41,  7.43it/s, loss=1.64, v_num=.]
Epoch 2:  80%|████████  | 824/1030 [01:50<00:27,  7.44it/s, loss=1.59, v_num=.]
Epoch 2:  90%|█████████ | 927/1030 [02:04<00:13,  7.45it/s, loss=1.26, v_num=.]
Epoch 2: 100%|██████████| 1030/1030 [02:05<00:00,  8.24it/s, loss=1.26, v_num=.]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/93 [00:00<?, ?it/s][A
[2m[36m(pid=1795)[0m 
Validating: 100%|████████

[2m[36m(pid=1795)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_Stage1_70527_00003:
  FID: 11.265625
  FID_cross: 19.703125
  auroc: 0.47999101877212524
  auroc_cross: 0.47273218631744385
  date: 2021-09-07_23-23-16
  done: false
  experiment_id: 9b670fdf81a34eecb2b1085a111512ff
  hostname: e89b1a816e20
  iterations_since_restore: 1
  loss_D: 0.7751947045326233
  loss_G: 2.6513383388519287
  node_ip: 172.28.0.2
  pid: 1795
  should_checkpoint: true
  time_since_restore: 145.84448266029358
  time_this_iter_s: 145.84448266029358
  time_total_s: 431.74482250213623
  timestamp: 1631056996
  timesteps_since_restore: 0
  training_iteration: 3
  trial_id: '70527_00003'
  


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00003,RUNNING,172.28.0.2:1795,0.0001,64,8,2.65134,0.775195,11.2656,0.479991,19.7031,0.472732,3
train_Stage1_70527_00001,PAUSED,,0.0001,16,8,9.50886,8.80481e-05,470.75,0.498466,557.5,0.5,3
train_Stage1_70527_00002,PAUSED,,0.000795687,16,128,1.13613,1.01732,3.32812,0.467978,6.39062,0.462427,3
train_Stage1_70527_00000,PENDING,,0.0001,8,8,8.61932,0.000274291,39.0,0.484795,55.5625,0.464387,3


[2m[36m(pid=1795)[0m Epoch 2: 100%|██████████| 1030/1030 [02:17<00:00,  7.50it/s, loss=1.26, v_num=.]


[2m[36m(pid=1795)[0m 2021-09-07 23:23:17,796	INFO trainable.py:76 -- Checkpoint size is 165685954 bytes
2021-09-07 23:23:19,784	INFO trainable.py:76 -- Checkpoint size is 91896386 bytes


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00000,RUNNING,,0.0001,8,8,8.61932,0.000274291,39.0,0.484795,55.5625,0.464387,3
train_Stage1_70527_00002,PAUSED,,0.000795687,16,128,1.13613,1.01732,3.32812,0.467978,6.39062,0.462427,3
train_Stage1_70527_00003,PAUSED,,0.0001,64,8,2.65134,0.775195,11.2656,0.479991,19.7031,0.472732,3
train_Stage1_70527_00001,PENDING,,0.0001,16,8,9.50886,8.80481e-05,470.75,0.498466,557.5,0.5,3


[2m[36m(pid=1876)[0m 2021-09-07 23:23:23,590	INFO trainable.py:383 -- Restored on 172.28.0.2 from checkpoint: /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00000_0_n_fmaps=8_2021-09-07_23-00-47/checkpoint_tmpe882d9/./
[2m[36m(pid=1876)[0m 2021-09-07 23:23:23,590	INFO trainable.py:390 -- Current state after restoring: {'_iteration': 3, '_timesteps_total': None, '_time_total': 252.48758840560913, '_episodes_total': None}
[2m[36m(pid=1876)[0m Using native 16bit precision.
[2m[36m(pid=1876)[0m GPU available: True, used: True
[2m[36m(pid=1876)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=1876)[0m IPU available: False, using: 0 IPUs
[2m[36m(pid=1876)[0m Restoring states from the checkpoint file at /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00000_0_n_fmaps=8_2021-09-07_23-00-47/checkpoint_tmpe882d9/./checkpoint
[2m[36m(pid=1876)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00000,RUNNING,,0.0001,8,8,8.61932,0.000274291,39.0,0.484795,55.5625,0.464387,3
train_Stage1_70527_00002,PAUSED,,0.000795687,16,128,1.13613,1.01732,3.32812,0.467978,6.39062,0.462427,3
train_Stage1_70527_00003,PAUSED,,0.0001,64,8,2.65134,0.775195,11.2656,0.479991,19.7031,0.472732,3
train_Stage1_70527_00001,PENDING,,0.0001,16,8,9.50886,8.80481e-05,470.75,0.498466,557.5,0.5,3


[2m[36m(pid=1876)[0m Restored all states from the checkpoint file at /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00000_0_n_fmaps=8_2021-09-07_23-00-47/checkpoint_tmpe882d9/./checkpoint
[2m[36m(pid=1876)[0m 
[2m[36m(pid=1876)[0m   | Name          | Type               | Params
[2m[36m(pid=1876)[0m -----------------------------------------------------
[2m[36m(pid=1876)[0m 0 | generator     | DCGANGenerator     | 146 K 
[2m[36m(pid=1876)[0m 1 | discriminator | DCGANDiscriminator | 44.4 K
[2m[36m(pid=1876)[0m 2 | criterion     | BCEWithLogitsLoss  | 0     
[2m[36m(pid=1876)[0m 3 | modelF        | ResNet             | 11.2 M
[2m[36m(pid=1876)[0m 4 | lastF         | Sequential         | 1.5 K 
[2m[36m(pid=1876)[0m 5 | modelJ        | ResNet             | 11.2 M
[2m[36m(pid=1876)[0m 6 | lastJ         | Sequential         | 1.5 K 
[2m[36m(pid=1876)[0m 7 | val_metrics   | MetricCollection   | 22.3 M
[2m[36m(pid=1876)[0m ----------------

[2m[36m(pid=1876)[0m Validation sanity check: 0it [00:00, ?it/s]Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(pid=1876)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Epoch 3:   0%|          | 0/1030 [00:00<00:00, 4328.49it/s]  
Epoch 3:  10%|█         | 103/1030 [00:09<01:20, 11.49it/s, loss=4.27, v_num=.]
Epoch 3:  20%|██        | 206/1030 [00:16<01:04, 12.76it/s, loss=4.21, v_num=.]
Epoch 3:  30%|███       | 309/1030 [00:23<00:54, 13.22it/s, loss=4.35, v_num=.]
Epoch 3:  40%|████      | 412/1030 [00:30<00:45, 13.44it/s, loss=4.4, v_num=.] 
Epoch 3:  50%|█████     | 515/1030 [00:37<00:37, 13.60it/s, loss=4.4, v_num=.]
Epoch 3:  60%|██████    | 618/1030 [00:45<00:30, 13.70it/s, loss=4.4, v_num=.]
Epoch 3:  70%|███████   | 721/1030 [00:52<00:22, 13.76it/s, loss=4.4, v_num=.]
Epoch 3:  80%|████████  | 824/1030 [00:59<00:14, 13.80it/s, loss=2.98, v_num=.]
Epoch 3:  90%|█████████ | 927/1030 [01:07<00:07, 13.81it/s, loss=2.02, v_num=.]
Epoch 3: 100%|██████████| 1030/1030 [01:07<00:00, 15.22it/s, loss=2.02, v_num=.]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/93 [00:00<?, ?it/s][A
[2m[36m(pid=1876)[0m 
Validating: 100%|██████████

[2m[36m(pid=1876)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_Stage1_70527_00000:
  FID: 26.984375
  FID_cross: 46.25
  auroc: 0.5033348798751831
  auroc_cross: 0.4764401912689209
  date: 2021-09-07_23-24-51
  done: false
  experiment_id: f4cb33f5595841f8bba99795ccc25185
  hostname: e89b1a816e20
  iterations_since_restore: 1
  loss_D: 0.0869472473859787
  loss_G: 4.641657829284668
  node_ip: 172.28.0.2
  pid: 1876
  should_checkpoint: true
  time_since_restore: 87.42844676971436
  time_this_iter_s: 87.42844676971436
  time_total_s: 339.9160351753235
  timestamp: 1631057091
  timesteps_since_restore: 0
  training_iteration: 4
  trial_id: '70527_00000'
  


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00000,RUNNING,172.28.0.2:1876,0.0001,8,8,4.64166,0.0869472,26.9844,0.503335,46.25,0.47644,4
train_Stage1_70527_00002,PAUSED,,0.000795687,16,128,1.13613,1.01732,3.32812,0.467978,6.39062,0.462427,3
train_Stage1_70527_00003,PAUSED,,0.0001,64,8,2.65134,0.775195,11.2656,0.479991,19.7031,0.472732,3
train_Stage1_70527_00001,PENDING,,0.0001,16,8,9.50886,8.80481e-05,470.75,0.498466,557.5,0.5,3


[2m[36m(pid=1876)[0m 2021-09-07 23:24:51,438	INFO trainable.py:76 -- Checkpoint size is 91896386 bytes
[2m[36m(pid=1876)[0m 2021-09-07 23:24:51,792	INFO trainable.py:76 -- Checkpoint size is 91896386 bytes
2021-09-07 23:24:54,598	INFO trainable.py:76 -- Checkpoint size is 96244802 bytes


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00001,RUNNING,,0.0001,16,8,9.50886,8.80481e-05,470.75,0.498466,557.5,0.5,3
train_Stage1_70527_00000,PAUSED,,0.0001,8,8,4.64166,0.0869472,26.9844,0.503335,46.25,0.47644,4
train_Stage1_70527_00003,PAUSED,,0.0001,64,8,2.65134,0.775195,11.2656,0.479991,19.7031,0.472732,3
train_Stage1_70527_00002,PENDING,,0.000795687,16,128,1.13613,1.01732,3.32812,0.467978,6.39062,0.462427,3


[2m[36m(pid=1959)[0m 2021-09-07 23:24:56,595	INFO trainable.py:383 -- Restored on 172.28.0.2 from checkpoint: /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00001_1_n_fmaps=16_2021-09-07_23-00-47/checkpoint_tmp3b2a68/./
[2m[36m(pid=1959)[0m 2021-09-07 23:24:56,595	INFO trainable.py:390 -- Current state after restoring: {'_iteration': 3, '_timesteps_total': None, '_time_total': 276.5277450084686, '_episodes_total': None}
[2m[36m(pid=1959)[0m Using native 16bit precision.
[2m[36m(pid=1959)[0m GPU available: True, used: True
[2m[36m(pid=1959)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=1959)[0m IPU available: False, using: 0 IPUs
[2m[36m(pid=1959)[0m Restoring states from the checkpoint file at /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00001_1_n_fmaps=16_2021-09-07_23-00-47/checkpoint_tmp3b2a68/./checkpoint
[2m[36m(pid=1959)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00001,RUNNING,,0.0001,16,8,9.50886,8.80481e-05,470.75,0.498466,557.5,0.5,3
train_Stage1_70527_00000,PAUSED,,0.0001,8,8,4.64166,0.0869472,26.9844,0.503335,46.25,0.47644,4
train_Stage1_70527_00003,PAUSED,,0.0001,64,8,2.65134,0.775195,11.2656,0.479991,19.7031,0.472732,3
train_Stage1_70527_00002,PENDING,,0.000795687,16,128,1.13613,1.01732,3.32812,0.467978,6.39062,0.462427,3


[2m[36m(pid=1959)[0m Restored all states from the checkpoint file at /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00001_1_n_fmaps=16_2021-09-07_23-00-47/checkpoint_tmp3b2a68/./checkpoint
[2m[36m(pid=1959)[0m 
[2m[36m(pid=1959)[0m   | Name          | Type               | Params
[2m[36m(pid=1959)[0m -----------------------------------------------------
[2m[36m(pid=1959)[0m 0 | generator     | DCGANGenerator     | 377 K 
[2m[36m(pid=1959)[0m 1 | discriminator | DCGANDiscriminator | 174 K 
[2m[36m(pid=1959)[0m 2 | criterion     | BCEWithLogitsLoss  | 0     
[2m[36m(pid=1959)[0m 3 | modelF        | ResNet             | 11.2 M
[2m[36m(pid=1959)[0m 4 | lastF         | Sequential         | 1.5 K 
[2m[36m(pid=1959)[0m 5 | modelJ        | ResNet             | 11.2 M
[2m[36m(pid=1959)[0m 6 | lastJ         | Sequential         | 1.5 K 
[2m[36m(pid=1959)[0m 7 | val_metrics   | MetricCollection   | 22.3 M
[2m[36m(pid=1959)[0m ---------------

[2m[36m(pid=1959)[0m Validation sanity check: 0it [00:00, ?it/s]Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(pid=1959)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
[2m[36m(pid=1959)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Epoch 3:   0%|          | 0/1030 [00:00<00:00, 3919.91it/s]  
Epoch 3:  10%|█         | 103/1030 [00:21<03:09,  4.89it/s, loss=4.8, v_num=.]
Epoch 3:  20%|██        | 206/1030 [00:30<01:59,  6.90it/s, loss=4.75, v_num=.]
Epoch 3:  30%|███       | 309/1030 [00:37<01:28,  8.17it/s, loss=4.65, v_num=.]
Epoch 3:  40%|████      | 412/1030 [00:46<01:09,  8.94it/s, loss=4.79, v_num=.]
Epoch 3:  50%|█████     | 515/1030 [00:54<00:54,  9.50it/s, loss=4.96, v_num=.]
Epoch 3:  60%|██████    | 618/1030 [01:02<00:41,  9.90it/s, loss=5.07, v_num=.]
Epoch 3:  70%|███████   | 721/1030 [01:10<00:30, 10.22it/s, loss=5.15, v_num=.]
Epoch 3:  80%|████████  | 824/1030 [01:18<00:19, 10.47it/s, loss=5.19, v_num=.]
Epoch 3:  90%|█████████ | 927/1030 [01:27<00:09, 10.66it/s, loss=5.2, v_num=.] 
Epoch 3: 100%|██████████| 1030/1030 [01:27<00:00, 11.78it/s, loss=5.2, v_num=.]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/93 [00:00<?, ?it/s][A
[2m[36m(pid=1959)[0m 
Validating: 100%|█████████

[2m[36m(pid=1959)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_Stage1_70527_00001:
  FID: 175.375
  FID_cross: 186.25
  auroc: 0.48585206270217896
  auroc_cross: 0.4794314205646515
  date: 2021-09-07_23-26-43
  done: false
  experiment_id: 2dfd70ad19c84d218b50ad9c615885cf
  hostname: e89b1a816e20
  iterations_since_restore: 1
  loss_D: 4.2646443034755066e-05
  loss_G: 10.380890846252441
  node_ip: 172.28.0.2
  pid: 1959
  should_checkpoint: true
  time_since_restore: 106.37760639190674
  time_this_iter_s: 106.37760639190674
  time_total_s: 382.90535140037537
  timestamp: 1631057203
  timesteps_since_restore: 0
  training_iteration: 4
  trial_id: '70527_00001'
  


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00001,RUNNING,172.28.0.2:1959,0.0001,16,8,10.3809,4.26464e-05,175.375,0.485852,186.25,0.479431,4
train_Stage1_70527_00000,PAUSED,,0.0001,8,8,4.64166,0.0869472,26.9844,0.503335,46.25,0.47644,4
train_Stage1_70527_00003,PAUSED,,0.0001,64,8,2.65134,0.775195,11.2656,0.479991,19.7031,0.472732,3
train_Stage1_70527_00002,PENDING,,0.000795687,16,128,1.13613,1.01732,3.32812,0.467978,6.39062,0.462427,3


[2m[36m(pid=1959)[0m 2021-09-07 23:26:43,488	INFO trainable.py:76 -- Checkpoint size is 96244802 bytes
2021-09-07 23:26:43,938	INFO trainable.py:76 -- Checkpoint size is 96244802 bytes
[2m[36m(pid=2050)[0m 2021-09-07 23:26:47,429	INFO trainable.py:383 -- Restored on 172.28.0.2 from checkpoint: /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00002_2_n_fmaps=32_2021-09-07_23-02-12/checkpoint_tmp5395d0/./
[2m[36m(pid=2050)[0m 2021-09-07 23:26:47,430	INFO trainable.py:390 -- Current state after restoring: {'_iteration': 3, '_timesteps_total': None, '_time_total': 276.5277450084686, '_episodes_total': None}


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00002,RUNNING,,0.000795687,16,128,1.13613,1.01732,3.32812,0.467978,6.39062,0.462427,3
train_Stage1_70527_00000,PAUSED,,0.0001,8,8,4.64166,0.0869472,26.9844,0.503335,46.25,0.47644,4
train_Stage1_70527_00001,PAUSED,,0.0001,16,8,10.3809,4.26464e-05,175.375,0.485852,186.25,0.479431,4
train_Stage1_70527_00003,PENDING,,0.0001,64,8,2.65134,0.775195,11.2656,0.479991,19.7031,0.472732,3


[2m[36m(pid=2050)[0m Using native 16bit precision.
[2m[36m(pid=2050)[0m GPU available: True, used: True
[2m[36m(pid=2050)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=2050)[0m IPU available: False, using: 0 IPUs
[2m[36m(pid=2050)[0m Restoring states from the checkpoint file at /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00002_2_n_fmaps=32_2021-09-07_23-02-12/checkpoint_tmp5395d0/./checkpoint
[2m[36m(pid=2050)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00002,RUNNING,,0.000795687,16,128,1.13613,1.01732,3.32812,0.467978,6.39062,0.462427,3
train_Stage1_70527_00000,PAUSED,,0.0001,8,8,4.64166,0.0869472,26.9844,0.503335,46.25,0.47644,4
train_Stage1_70527_00001,PAUSED,,0.0001,16,8,10.3809,4.26464e-05,175.375,0.485852,186.25,0.479431,4
train_Stage1_70527_00003,PENDING,,0.0001,64,8,2.65134,0.775195,11.2656,0.479991,19.7031,0.472732,3


[2m[36m(pid=2050)[0m Restored all states from the checkpoint file at /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00002_2_n_fmaps=32_2021-09-07_23-02-12/checkpoint_tmp5395d0/./checkpoint
[2m[36m(pid=2050)[0m 
[2m[36m(pid=2050)[0m   | Name          | Type               | Params
[2m[36m(pid=2050)[0m -----------------------------------------------------
[2m[36m(pid=2050)[0m 0 | generator     | DCGANGenerator     | 377 K 
[2m[36m(pid=2050)[0m 1 | discriminator | DCGANDiscriminator | 174 K 
[2m[36m(pid=2050)[0m 2 | criterion     | BCEWithLogitsLoss  | 0     
[2m[36m(pid=2050)[0m 3 | modelF        | ResNet             | 11.2 M
[2m[36m(pid=2050)[0m 4 | lastF         | Sequential         | 1.5 K 
[2m[36m(pid=2050)[0m 5 | modelJ        | ResNet             | 11.2 M
[2m[36m(pid=2050)[0m 6 | lastJ         | Sequential         | 1.5 K 
[2m[36m(pid=2050)[0m 7 | val_metrics   | MetricCollection   | 22.3 M
[2m[36m(pid=2050)[0m ---------------

[2m[36m(pid=2050)[0m Validation sanity check: 0it [00:00, ?it/s]Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(pid=2050)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
[2m[36m(pid=2050)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Epoch 3:   0%|          | 0/63 [00:00<00:00, 4025.24it/s]  
Epoch 3:  10%|▉         | 6/63 [00:04<00:37,  1.53it/s, loss=4.76, v_num=.]
Epoch 3:  19%|█▉        | 12/63 [00:07<00:31,  1.63it/s, loss=4.76, v_num=.]
Epoch 3:  29%|██▊       | 18/63 [00:11<00:27,  1.66it/s, loss=4.76, v_num=.]
Epoch 3:  38%|███▊      | 24/63 [00:14<00:23,  1.68it/s, loss=4.76, v_num=.]
Epoch 3:  48%|████▊     | 30/63 [00:18<00:19,  1.68it/s, loss=4.77, v_num=.]
Epoch 3:  57%|█████▋    | 36/63 [00:22<00:16,  1.68it/s, loss=4.78, v_num=.]
Epoch 3:  67%|██████▋   | 42/63 [00:25<00:12,  1.67it/s, loss=4.77, v_num=.]
Epoch 3:  76%|███████▌  | 48/63 [00:29<00:09,  1.66it/s, loss=4.78, v_num=.]
Epoch 3:  76%|███████▌  | 48/63 [00:45<00:13,  1.09it/s, loss=4.78, v_num=.]
Epoch 3:  86%|████████▌ | 54/63 [00:46<00:07,  1.19it/s, loss=4.8, v_num=.] 
Epoch 3:  95%|█████████▌| 60/63 [00:48<00:02,  1.26it/s, loss=4.8, v_num=.]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/5 [00:00<?, ?it/s][A
[2m[36

[2m[36m(pid=2050)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_Stage1_70527_00002:
  FID: 510.75
  FID_cross: 559.0
  auroc: 0.4882422685623169
  auroc_cross: 0.47150999307632446
  date: 2021-09-07_23-27-56
  done: false
  experiment_id: 2dfd70ad19c84d218b50ad9c615885cf
  hostname: e89b1a816e20
  iterations_since_restore: 1
  loss_D: 9.20077072805725e-05
  loss_G: 9.53413200378418
  node_ip: 172.28.0.2
  pid: 2050
  should_checkpoint: true
  time_since_restore: 68.75642156600952
  time_this_iter_s: 68.75642156600952
  time_total_s: 345.28416657447815
  timestamp: 1631057276
  timesteps_since_restore: 0
  training_iteration: 4
  trial_id: '70527_00002'
  


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00002,RUNNING,172.28.0.2:2050,0.000795687,16,128,9.53413,9.20077e-05,510.75,0.488242,559.0,0.47151,4
train_Stage1_70527_00000,PAUSED,,0.0001,8,8,4.64166,0.0869472,26.9844,0.503335,46.25,0.47644,4
train_Stage1_70527_00001,PAUSED,,0.0001,16,8,10.3809,4.26464e-05,175.375,0.485852,186.25,0.479431,4
train_Stage1_70527_00003,PENDING,,0.0001,64,8,2.65134,0.775195,11.2656,0.479991,19.7031,0.472732,3


[2m[36m(pid=2050)[0m 2021-09-07 23:27:57,099	INFO trainable.py:76 -- Checkpoint size is 96244802 bytes
2021-09-07 23:27:57,751	INFO trainable.py:76 -- Checkpoint size is 165685954 bytes


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00003,RUNNING,,0.0001,64,8,2.65134,0.775195,11.2656,0.479991,19.7031,0.472732,3
train_Stage1_70527_00001,PAUSED,,0.0001,16,8,10.3809,4.26464e-05,175.375,0.485852,186.25,0.479431,4
train_Stage1_70527_00002,PAUSED,,0.000795687,16,128,9.53413,9.20077e-05,510.75,0.488242,559.0,0.47151,4
train_Stage1_70527_00000,PENDING,,0.0001,8,8,4.64166,0.0869472,26.9844,0.503335,46.25,0.47644,4


[2m[36m(pid=2127)[0m 2021-09-07 23:28:02,437	INFO trainable.py:383 -- Restored on 172.28.0.2 from checkpoint: /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00003_3_n_fmaps=64_2021-09-07_23-03-45/checkpoint_tmpf4fa00/./
[2m[36m(pid=2127)[0m 2021-09-07 23:28:02,437	INFO trainable.py:390 -- Current state after restoring: {'_iteration': 3, '_timesteps_total': None, '_time_total': 431.74482250213623, '_episodes_total': None}
[2m[36m(pid=2127)[0m Using native 16bit precision.
[2m[36m(pid=2127)[0m GPU available: True, used: True
[2m[36m(pid=2127)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=2127)[0m IPU available: False, using: 0 IPUs
[2m[36m(pid=2127)[0m Restoring states from the checkpoint file at /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00003_3_n_fmaps=64_2021-09-07_23-03-45/checkpoint_tmpf4fa00/./checkpoint
[2m[36m(pid=2127)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00003,RUNNING,,0.0001,64,8,2.65134,0.775195,11.2656,0.479991,19.7031,0.472732,3
train_Stage1_70527_00001,PAUSED,,0.0001,16,8,10.3809,4.26464e-05,175.375,0.485852,186.25,0.479431,4
train_Stage1_70527_00002,PAUSED,,0.000795687,16,128,9.53413,9.20077e-05,510.75,0.488242,559.0,0.47151,4
train_Stage1_70527_00000,PENDING,,0.0001,8,8,4.64166,0.0869472,26.9844,0.503335,46.25,0.47644,4


[2m[36m(pid=2127)[0m Restored all states from the checkpoint file at /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00003_3_n_fmaps=64_2021-09-07_23-03-45/checkpoint_tmpf4fa00/./checkpoint
[2m[36m(pid=2127)[0m 
[2m[36m(pid=2127)[0m   | Name          | Type               | Params
[2m[36m(pid=2127)[0m -----------------------------------------------------
[2m[36m(pid=2127)[0m 0 | generator     | DCGANGenerator     | 3.6 M 
[2m[36m(pid=2127)[0m 1 | discriminator | DCGANDiscriminator | 2.8 M 
[2m[36m(pid=2127)[0m 2 | criterion     | BCEWithLogitsLoss  | 0     
[2m[36m(pid=2127)[0m 3 | modelF        | ResNet             | 11.2 M
[2m[36m(pid=2127)[0m 4 | lastF         | Sequential         | 1.5 K 
[2m[36m(pid=2127)[0m 5 | modelJ        | ResNet             | 11.2 M
[2m[36m(pid=2127)[0m 6 | lastJ         | Sequential         | 1.5 K 
[2m[36m(pid=2127)[0m 7 | val_metrics   | MetricCollection   | 22.3 M
[2m[36m(pid=2127)[0m ---------------

[2m[36m(pid=2127)[0m Validation sanity check: 0it [00:00, ?it/s]
[2m[36m(pid=2127)[0m Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(pid=2127)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
[2m[36m(pid=2127)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Epoch 3:   0%|          | 0/1030 [00:00<00:00, 4396.55it/s]  
Epoch 3:  10%|█         | 103/1030 [00:14<02:11,  7.04it/s, loss=1.75, v_num=.]
Epoch 3:  20%|██        | 206/1030 [00:28<01:53,  7.29it/s, loss=1.62, v_num=.]
Epoch 3:  30%|███       | 309/1030 [00:42<01:37,  7.38it/s, loss=1.64, v_num=.]
Epoch 3:  40%|████      | 412/1030 [00:55<01:23,  7.41it/s, loss=1.69, v_num=.]
Epoch 3:  50%|█████     | 515/1030 [01:09<01:09,  7.44it/s, loss=1.77, v_num=.]
Epoch 3:  60%|██████    | 618/1030 [01:22<00:55,  7.46it/s, loss=1.23, v_num=.]
Epoch 3:  70%|███████   | 721/1030 [01:36<00:41,  7.46it/s, loss=1.57, v_num=.]
Epoch 3:  80%|████████  | 824/1030 [01:50<00:27,  7.47it/s, loss=1.59, v_num=.]
Epoch 3:  90%|█████████ | 927/1030 [02:04<00:13,  7.48it/s, loss=1.66, v_num=.]
Epoch 3: 100%|██████████| 1030/1030 [02:04<00:00,  8.27it/s, loss=1.66, v_num=.]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/93 [00:00<?, ?it/s][A
[2m[36m(pid=2127)[0m 
Validating: 100%|███████

[2m[36m(pid=2127)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_Stage1_70527_00003:
  FID: 13.140625
  FID_cross: 25.421875
  auroc: 0.4979512691497803
  auroc_cross: 0.4798106849193573
  date: 2021-09-07_23-30-28
  done: false
  experiment_id: 9b670fdf81a34eecb2b1085a111512ff
  hostname: e89b1a816e20
  iterations_since_restore: 1
  loss_D: 0.7078417539596558
  loss_G: 0.9946161508560181
  node_ip: 172.28.0.2
  pid: 2127
  should_checkpoint: true
  time_since_restore: 145.9379494190216
  time_this_iter_s: 145.9379494190216
  time_total_s: 577.6827719211578
  timestamp: 1631057428
  timesteps_since_restore: 0
  training_iteration: 4
  trial_id: '70527_00003'
  


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00003,RUNNING,172.28.0.2:2127,0.0001,64,8,0.994616,0.707842,13.1406,0.497951,25.4219,0.479811,4
train_Stage1_70527_00001,PAUSED,,0.0001,16,8,10.3809,4.26464e-05,175.375,0.485852,186.25,0.479431,4
train_Stage1_70527_00002,PAUSED,,0.000795687,16,128,9.53413,9.20077e-05,510.75,0.488242,559.0,0.47151,4
train_Stage1_70527_00000,PENDING,,0.0001,8,8,4.64166,0.0869472,26.9844,0.503335,46.25,0.47644,4


[2m[36m(pid=2127)[0m Epoch 3: 100%|██████████| 1030/1030 [02:17<00:00,  7.50it/s, loss=1.66, v_num=.]


[2m[36m(pid=2127)[0m 2021-09-07 23:30:30,369	INFO trainable.py:76 -- Checkpoint size is 165685954 bytes
2021-09-07 23:30:31,422	INFO trainable.py:76 -- Checkpoint size is 91896386 bytes


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00000,RUNNING,,0.0001,8,8,4.64166,0.0869472,26.9844,0.503335,46.25,0.47644,4
train_Stage1_70527_00002,PAUSED,,0.000795687,16,128,9.53413,9.20077e-05,510.75,0.488242,559.0,0.47151,4
train_Stage1_70527_00003,PAUSED,,0.0001,64,8,0.994616,0.707842,13.1406,0.497951,25.4219,0.479811,4
train_Stage1_70527_00001,PENDING,,0.0001,16,8,10.3809,4.26464e-05,175.375,0.485852,186.25,0.479431,4


[2m[36m(pid=2236)[0m 2021-09-07 23:30:34,894	INFO trainable.py:383 -- Restored on 172.28.0.2 from checkpoint: /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00000_0_n_fmaps=8_2021-09-07_23-00-47/checkpoint_tmpda87e5/./
[2m[36m(pid=2236)[0m 2021-09-07 23:30:34,894	INFO trainable.py:390 -- Current state after restoring: {'_iteration': 4, '_timesteps_total': None, '_time_total': 339.9160351753235, '_episodes_total': None}
[2m[36m(pid=2236)[0m Using native 16bit precision.
[2m[36m(pid=2236)[0m GPU available: True, used: True
[2m[36m(pid=2236)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=2236)[0m IPU available: False, using: 0 IPUs
[2m[36m(pid=2236)[0m Restoring states from the checkpoint file at /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00000_0_n_fmaps=8_2021-09-07_23-00-47/checkpoint_tmpda87e5/./checkpoint
[2m[36m(pid=2236)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00000,RUNNING,,0.0001,8,8,4.64166,0.0869472,26.9844,0.503335,46.25,0.47644,4
train_Stage1_70527_00002,PAUSED,,0.000795687,16,128,9.53413,9.20077e-05,510.75,0.488242,559.0,0.47151,4
train_Stage1_70527_00003,PAUSED,,0.0001,64,8,0.994616,0.707842,13.1406,0.497951,25.4219,0.479811,4
train_Stage1_70527_00001,PENDING,,0.0001,16,8,10.3809,4.26464e-05,175.375,0.485852,186.25,0.479431,4


[2m[36m(pid=2236)[0m Restored all states from the checkpoint file at /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00000_0_n_fmaps=8_2021-09-07_23-00-47/checkpoint_tmpda87e5/./checkpoint
[2m[36m(pid=2236)[0m 
[2m[36m(pid=2236)[0m   | Name          | Type               | Params
[2m[36m(pid=2236)[0m -----------------------------------------------------
[2m[36m(pid=2236)[0m 0 | generator     | DCGANGenerator     | 146 K 
[2m[36m(pid=2236)[0m 1 | discriminator | DCGANDiscriminator | 44.4 K
[2m[36m(pid=2236)[0m 2 | criterion     | BCEWithLogitsLoss  | 0     
[2m[36m(pid=2236)[0m 3 | modelF        | ResNet             | 11.2 M
[2m[36m(pid=2236)[0m 4 | lastF         | Sequential         | 1.5 K 
[2m[36m(pid=2236)[0m 5 | modelJ        | ResNet             | 11.2 M
[2m[36m(pid=2236)[0m 6 | lastJ         | Sequential         | 1.5 K 
[2m[36m(pid=2236)[0m 7 | val_metrics   | MetricCollection   | 22.3 M
[2m[36m(pid=2236)[0m ----------------

[2m[36m(pid=2236)[0m Validation sanity check: 0it [00:00, ?it/s]Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(pid=2236)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
[2m[36m(pid=2236)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Epoch 4:   0%|          | 0/1030 [00:00<00:00, 2657.99it/s]  
Epoch 4:  10%|█         | 103/1030 [00:09<01:23, 11.07it/s, loss=1.92, v_num=.]
Epoch 4:  20%|██        | 206/1030 [00:16<01:06, 12.48it/s, loss=2.11, v_num=.]
Epoch 4:  30%|███       | 309/1030 [00:23<00:55, 13.04it/s, loss=2.11, v_num=.]
Epoch 4:  40%|████      | 412/1030 [00:30<00:46, 13.34it/s, loss=2.15, v_num=.]
Epoch 4:  50%|█████     | 515/1030 [00:38<00:38, 13.54it/s, loss=2.23, v_num=.]
Epoch 4:  60%|██████    | 618/1030 [00:45<00:30, 13.65it/s, loss=1.9, v_num=.] 
Epoch 4:  70%|███████   | 721/1030 [00:52<00:22, 13.75it/s, loss=2.05, v_num=.]
Epoch 4:  80%|████████  | 824/1030 [00:59<00:14, 13.80it/s, loss=1.79, v_num=.]
Epoch 4:  90%|█████████ | 927/1030 [01:07<00:07, 13.84it/s, loss=1.97, v_num=.]
Epoch 4: 100%|██████████| 1030/1030 [01:07<00:00, 15.27it/s, loss=1.97, v_num=.]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/93 [00:00<?, ?it/s][A
[2m[36m(pid=2236)[0m 
Validating: 100%|███████

[2m[36m(pid=2236)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_Stage1_70527_00000:
  FID: 44.71875
  FID_cross: 50.96875
  auroc: 0.5
  auroc_cross: 0.48799726366996765
  date: 2021-09-07_23-32-01
  done: false
  experiment_id: f4cb33f5595841f8bba99795ccc25185
  hostname: e89b1a816e20
  iterations_since_restore: 1
  loss_D: 0.0607372522354126
  loss_G: 3.9784183502197266
  node_ip: 172.28.0.2
  pid: 2236
  should_checkpoint: true
  time_since_restore: 86.46307849884033
  time_this_iter_s: 86.46307849884033
  time_total_s: 426.3791136741638
  timestamp: 1631057521
  timesteps_since_restore: 0
  training_iteration: 5
  trial_id: '70527_00000'
  


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00000,RUNNING,172.28.0.2:2236,0.0001,8,8,3.97842,0.0607373,44.7188,0.5,50.9688,0.487997,5
train_Stage1_70527_00002,PAUSED,,0.000795687,16,128,9.53413,9.20077e-05,510.75,0.488242,559.0,0.47151,4
train_Stage1_70527_00003,PAUSED,,0.0001,64,8,0.994616,0.707842,13.1406,0.497951,25.4219,0.479811,4
train_Stage1_70527_00001,PENDING,,0.0001,16,8,10.3809,4.26464e-05,175.375,0.485852,186.25,0.479431,4


[2m[36m(pid=2236)[0m 2021-09-07 23:32:01,977	INFO trainable.py:76 -- Checkpoint size is 91896386 bytes
[2m[36m(pid=2236)[0m 2021-09-07 23:32:02,342	INFO trainable.py:76 -- Checkpoint size is 91896386 bytes
2021-09-07 23:32:02,872	INFO trainable.py:76 -- Checkpoint size is 96244802 bytes
[2m[36m(pid=2335)[0m 2021-09-07 23:32:06,357	INFO trainable.py:383 -- Restored on 172.28.0.2 from checkpoint: /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00001_1_n_fmaps=16_2021-09-07_23-00-47/checkpoint_tmpa5b01b/./
[2m[36m(pid=2335)[0m 2021-09-07 23:32:06,357	INFO trainable.py:390 -- Current state after restoring: {'_iteration': 4, '_timesteps_total': None, '_time_total': 382.90535140037537, '_episodes_total': None}


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00001,RUNNING,,0.0001,16,8,10.3809,4.26464e-05,175.375,0.485852,186.25,0.479431,4
train_Stage1_70527_00000,PAUSED,,0.0001,8,8,3.97842,0.0607373,44.7188,0.5,50.9688,0.487997,5
train_Stage1_70527_00003,PAUSED,,0.0001,64,8,0.994616,0.707842,13.1406,0.497951,25.4219,0.479811,4
train_Stage1_70527_00002,PENDING,,0.000795687,16,128,9.53413,9.20077e-05,510.75,0.488242,559.0,0.47151,4


[2m[36m(pid=2335)[0m Using native 16bit precision.
[2m[36m(pid=2335)[0m GPU available: True, used: True
[2m[36m(pid=2335)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=2335)[0m IPU available: False, using: 0 IPUs
[2m[36m(pid=2335)[0m Restoring states from the checkpoint file at /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00001_1_n_fmaps=16_2021-09-07_23-00-47/checkpoint_tmpa5b01b/./checkpoint
[2m[36m(pid=2335)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00001,RUNNING,,0.0001,16,8,10.3809,4.26464e-05,175.375,0.485852,186.25,0.479431,4
train_Stage1_70527_00000,PAUSED,,0.0001,8,8,3.97842,0.0607373,44.7188,0.5,50.9688,0.487997,5
train_Stage1_70527_00003,PAUSED,,0.0001,64,8,0.994616,0.707842,13.1406,0.497951,25.4219,0.479811,4
train_Stage1_70527_00002,PENDING,,0.000795687,16,128,9.53413,9.20077e-05,510.75,0.488242,559.0,0.47151,4


[2m[36m(pid=2335)[0m Restored all states from the checkpoint file at /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00001_1_n_fmaps=16_2021-09-07_23-00-47/checkpoint_tmpa5b01b/./checkpoint
[2m[36m(pid=2335)[0m 
[2m[36m(pid=2335)[0m   | Name          | Type               | Params
[2m[36m(pid=2335)[0m -----------------------------------------------------
[2m[36m(pid=2335)[0m 0 | generator     | DCGANGenerator     | 377 K 
[2m[36m(pid=2335)[0m 1 | discriminator | DCGANDiscriminator | 174 K 
[2m[36m(pid=2335)[0m 2 | criterion     | BCEWithLogitsLoss  | 0     
[2m[36m(pid=2335)[0m 3 | modelF        | ResNet             | 11.2 M
[2m[36m(pid=2335)[0m 4 | lastF         | Sequential         | 1.5 K 
[2m[36m(pid=2335)[0m 5 | modelJ        | ResNet             | 11.2 M
[2m[36m(pid=2335)[0m 6 | lastJ         | Sequential         | 1.5 K 
[2m[36m(pid=2335)[0m 7 | val_metrics   | MetricCollection   | 22.3 M
[2m[36m(pid=2335)[0m ---------------

[2m[36m(pid=2335)[0m Validation sanity check: 0it [00:00, ?it/s]Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(pid=2335)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
[2m[36m(pid=2335)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Epoch 4:   0%|          | 0/1030 [00:00<00:00, 4382.76it/s]  
Epoch 4:  10%|█         | 103/1030 [00:09<01:21, 11.44it/s, loss=5.32, v_num=.]
Epoch 4:  20%|██        | 206/1030 [00:17<01:08, 12.09it/s, loss=5.4, v_num=.] 
Epoch 4:  30%|███       | 309/1030 [00:25<00:58, 12.30it/s, loss=5.56, v_num=.]
Epoch 4:  40%|████      | 412/1030 [00:33<00:49, 12.45it/s, loss=5.68, v_num=.]
Epoch 4:  50%|█████     | 515/1030 [00:41<00:41, 12.53it/s, loss=5.73, v_num=.]
Epoch 4:  60%|██████    | 618/1030 [00:49<00:32, 12.55it/s, loss=5.74, v_num=.]
Epoch 4:  70%|███████   | 721/1030 [00:57<00:24, 12.55it/s, loss=5.78, v_num=.]
Epoch 4:  80%|████████  | 824/1030 [01:05<00:16, 12.57it/s, loss=5.77, v_num=.]
Epoch 4:  90%|█████████ | 927/1030 [01:13<00:08, 12.59it/s, loss=5.68, v_num=.]
Epoch 4: 100%|██████████| 1030/1030 [01:14<00:00, 13.90it/s, loss=5.68, v_num=.]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/93 [00:00<?, ?it/s][A
[2m[36m(pid=2335)[0m 
Validating: 100%|███████

[2m[36m(pid=2335)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."
2021-09-07 23:33:40,390	INFO pbt.py:543 -- [exploit] transferring weights from trial train_Stage1_70527_00000 (score 0.5) -> train_Stage1_70527_00001 (score 0.485434353351593)
2021-09-07 23:33:40,394	INFO pbt.py:558 -- [explore] perturbed config from {'lr': 0.0001, 'bs': 8} -> {'lr': 8e-05, 'bs': 8}


Result for train_Stage1_70527_00001:
  FID: 80.25
  FID_cross: 69.5
  auroc: 0.485434353351593
  auroc_cross: 0.4727035164833069
  date: 2021-09-07_23-33-40
  done: false
  experiment_id: 2dfd70ad19c84d218b50ad9c615885cf
  hostname: e89b1a816e20
  iterations_since_restore: 1
  loss_D: 1.1011910828528926e-05
  loss_G: 11.378917694091797
  node_ip: 172.28.0.2
  pid: 2335
  should_checkpoint: true
  time_since_restore: 94.0211112499237
  time_this_iter_s: 94.0211112499237
  time_total_s: 476.9264626502991
  timestamp: 1631057620
  timesteps_since_restore: 0
  training_iteration: 5
  trial_id: '70527_00001'
  


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00001,RUNNING,172.28.0.2:2335,8e-05,8,8,11.3789,1.10119e-05,80.25,0.485434,69.5,0.472704,5
train_Stage1_70527_00000,PAUSED,,0.0001,8,8,3.97842,0.0607373,44.7188,0.5,50.9688,0.487997,5
train_Stage1_70527_00003,PAUSED,,0.0001,64,8,0.994616,0.707842,13.1406,0.497951,25.4219,0.479811,4
train_Stage1_70527_00002,PENDING,,0.000795687,16,128,9.53413,9.20077e-05,510.75,0.488242,559.0,0.47151,4


[2m[36m(pid=2335)[0m 2021-09-07 23:33:41,043	INFO trainable.py:383 -- Restored on 172.28.0.2 from checkpoint: /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00001_1_n_fmaps=16_2021-09-07_23-00-47/checkpoint_tmpe548cd/./
[2m[36m(pid=2335)[0m 2021-09-07 23:33:41,043	INFO trainable.py:390 -- Current state after restoring: {'_iteration': 5, '_timesteps_total': None, '_time_total': 426.3791136741638, '_episodes_total': None}
[2m[36m(pid=2335)[0m 2021-09-07 23:33:42,117	INFO trainable.py:76 -- Checkpoint size is 91896386 bytes
2021-09-07 23:33:42,720	INFO trainable.py:76 -- Checkpoint size is 96244802 bytes


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00002,RUNNING,,0.000795687,16,128,9.53413,9.20077e-05,510.75,0.488242,559.0,0.47151,4
train_Stage1_70527_00000,PAUSED,,0.0001,8,8,3.97842,0.0607373,44.7188,0.5,50.9688,0.487997,5
train_Stage1_70527_00001,PAUSED,,8e-05,8,8,11.3789,1.10119e-05,80.25,0.485434,69.5,0.472704,5
train_Stage1_70527_00003,PENDING,,0.0001,64,8,0.994616,0.707842,13.1406,0.497951,25.4219,0.479811,4


[2m[36m(pid=2434)[0m 2021-09-07 23:33:46,425	INFO trainable.py:383 -- Restored on 172.28.0.2 from checkpoint: /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00002_2_n_fmaps=32_2021-09-07_23-02-12/checkpoint_tmp596c9e/./
[2m[36m(pid=2434)[0m 2021-09-07 23:33:46,426	INFO trainable.py:390 -- Current state after restoring: {'_iteration': 4, '_timesteps_total': None, '_time_total': 345.28416657447815, '_episodes_total': None}
[2m[36m(pid=2434)[0m Using native 16bit precision.
[2m[36m(pid=2434)[0m GPU available: True, used: True
[2m[36m(pid=2434)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=2434)[0m IPU available: False, using: 0 IPUs
[2m[36m(pid=2434)[0m Restoring states from the checkpoint file at /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00002_2_n_fmaps=32_2021-09-07_23-02-12/checkpoint_tmp596c9e/./checkpoint
[2m[36m(pid=2434)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00002,RUNNING,,0.000795687,16,128,9.53413,9.20077e-05,510.75,0.488242,559.0,0.47151,4
train_Stage1_70527_00000,PAUSED,,0.0001,8,8,3.97842,0.0607373,44.7188,0.5,50.9688,0.487997,5
train_Stage1_70527_00001,PAUSED,,8e-05,8,8,11.3789,1.10119e-05,80.25,0.485434,69.5,0.472704,5
train_Stage1_70527_00003,PENDING,,0.0001,64,8,0.994616,0.707842,13.1406,0.497951,25.4219,0.479811,4


[2m[36m(pid=2434)[0m Restored all states from the checkpoint file at /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00002_2_n_fmaps=32_2021-09-07_23-02-12/checkpoint_tmp596c9e/./checkpoint
[2m[36m(pid=2434)[0m 
[2m[36m(pid=2434)[0m   | Name          | Type               | Params
[2m[36m(pid=2434)[0m -----------------------------------------------------
[2m[36m(pid=2434)[0m 0 | generator     | DCGANGenerator     | 377 K 
[2m[36m(pid=2434)[0m 1 | discriminator | DCGANDiscriminator | 174 K 
[2m[36m(pid=2434)[0m 2 | criterion     | BCEWithLogitsLoss  | 0     
[2m[36m(pid=2434)[0m 3 | modelF        | ResNet             | 11.2 M
[2m[36m(pid=2434)[0m 4 | lastF         | Sequential         | 1.5 K 
[2m[36m(pid=2434)[0m 5 | modelJ        | ResNet             | 11.2 M
[2m[36m(pid=2434)[0m 6 | lastJ         | Sequential         | 1.5 K 
[2m[36m(pid=2434)[0m 7 | val_metrics   | MetricCollection   | 22.3 M
[2m[36m(pid=2434)[0m ---------------

[2m[36m(pid=2434)[0m Validation sanity check: 0it [00:00, ?it/s]Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(pid=2434)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
[2m[36m(pid=2434)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Epoch 4:   0%|          | 0/63 [00:00<00:00, 3057.07it/s]  
Epoch 4:  10%|▉         | 6/63 [00:09<01:19,  1.40s/it, loss=4.78, v_num=.]
Epoch 4:  19%|█▉        | 12/63 [00:17<01:08,  1.35s/it, loss=4.77, v_num=.]
Epoch 4:  29%|██▊       | 18/63 [00:21<00:49,  1.11s/it, loss=4.78, v_num=.]
Epoch 4:  38%|███▊      | 24/63 [00:24<00:38,  1.01it/s, loss=4.79, v_num=.]
Epoch 4:  48%|████▊     | 30/63 [00:28<00:30,  1.09it/s, loss=4.77, v_num=.]
Epoch 4:  57%|█████▋    | 36/63 [00:32<00:23,  1.15it/s, loss=4.78, v_num=.]
Epoch 4:  67%|██████▋   | 42/63 [00:35<00:17,  1.20it/s, loss=4.79, v_num=.]
Epoch 4:  76%|███████▌  | 48/63 [00:39<00:12,  1.24it/s, loss=4.8, v_num=.] 
Epoch 4:  76%|███████▌  | 48/63 [00:55<00:16,  1.13s/it, loss=4.8, v_num=.]
Epoch 4:  86%|████████▌ | 54/63 [00:55<00:09,  1.01s/it, loss=4.8, v_num=.]
Epoch 4:  95%|█████████▌| 60/63 [00:58<00:02,  1.05it/s, loss=4.8, v_num=.]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/5 [00:00<?, ?it/s][A
[2m[36m(

[2m[36m(pid=2434)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_Stage1_70527_00002:
  FID: 491.75
  FID_cross: 552.0
  auroc: 0.5
  auroc_cross: 0.5
  date: 2021-09-07_23-35-04
  done: false
  experiment_id: 2dfd70ad19c84d218b50ad9c615885cf
  hostname: e89b1a816e20
  iterations_since_restore: 1
  loss_D: 8.635871927253902e-05
  loss_G: 9.649480819702148
  node_ip: 172.28.0.2
  pid: 2434
  should_checkpoint: true
  time_since_restore: 78.14408946037292
  time_this_iter_s: 78.14408946037292
  time_total_s: 423.4282560348511
  timestamp: 1631057704
  timesteps_since_restore: 0
  training_iteration: 5
  trial_id: '70527_00002'
  


[2m[36m(pid=2434)[0m 2021-09-07 23:35:04,943	INFO trainable.py:76 -- Checkpoint size is 96244802 bytes


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00002,RUNNING,172.28.0.2:2434,0.000795687,16,128,9.64948,8.63587e-05,491.75,0.5,552.0,0.5,5
train_Stage1_70527_00000,PAUSED,,0.0001,8,8,3.97842,0.0607373,44.7188,0.5,50.9688,0.487997,5
train_Stage1_70527_00001,PAUSED,,8e-05,8,8,11.3789,1.10119e-05,80.25,0.485434,69.5,0.472704,5
train_Stage1_70527_00003,PENDING,,0.0001,64,8,0.994616,0.707842,13.1406,0.497951,25.4219,0.479811,4


[2m[36m(pid=2434)[0m 2021-09-07 23:35:05,583	INFO trainable.py:76 -- Checkpoint size is 96244802 bytes
2021-09-07 23:35:06,438	INFO trainable.py:76 -- Checkpoint size is 165685954 bytes


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00003,RUNNING,,0.0001,64,8,0.994616,0.707842,13.1406,0.497951,25.4219,0.479811,4
train_Stage1_70527_00001,PAUSED,,8e-05,8,8,11.3789,1.10119e-05,80.25,0.485434,69.5,0.472704,5
train_Stage1_70527_00002,PAUSED,,0.000795687,16,128,9.64948,8.63587e-05,491.75,0.5,552.0,0.5,5
train_Stage1_70527_00000,PENDING,,0.0001,8,8,3.97842,0.0607373,44.7188,0.5,50.9688,0.487997,5


[2m[36m(pid=2527)[0m 2021-09-07 23:35:10,756	INFO trainable.py:383 -- Restored on 172.28.0.2 from checkpoint: /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00003_3_n_fmaps=64_2021-09-07_23-03-45/checkpoint_tmp784906/./
[2m[36m(pid=2527)[0m 2021-09-07 23:35:10,756	INFO trainable.py:390 -- Current state after restoring: {'_iteration': 4, '_timesteps_total': None, '_time_total': 577.6827719211578, '_episodes_total': None}
[2m[36m(pid=2527)[0m Using native 16bit precision.
[2m[36m(pid=2527)[0m GPU available: True, used: True
[2m[36m(pid=2527)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=2527)[0m IPU available: False, using: 0 IPUs
[2m[36m(pid=2527)[0m Restoring states from the checkpoint file at /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00003_3_n_fmaps=64_2021-09-07_23-03-45/checkpoint_tmp784906/./checkpoint
[2m[36m(pid=2527)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00003,RUNNING,,0.0001,64,8,0.994616,0.707842,13.1406,0.497951,25.4219,0.479811,4
train_Stage1_70527_00001,PAUSED,,8e-05,8,8,11.3789,1.10119e-05,80.25,0.485434,69.5,0.472704,5
train_Stage1_70527_00002,PAUSED,,0.000795687,16,128,9.64948,8.63587e-05,491.75,0.5,552.0,0.5,5
train_Stage1_70527_00000,PENDING,,0.0001,8,8,3.97842,0.0607373,44.7188,0.5,50.9688,0.487997,5


[2m[36m(pid=2527)[0m Restored all states from the checkpoint file at /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00003_3_n_fmaps=64_2021-09-07_23-03-45/checkpoint_tmp784906/./checkpoint
[2m[36m(pid=2527)[0m 
[2m[36m(pid=2527)[0m   | Name          | Type               | Params
[2m[36m(pid=2527)[0m -----------------------------------------------------
[2m[36m(pid=2527)[0m 0 | generator     | DCGANGenerator     | 3.6 M 
[2m[36m(pid=2527)[0m 1 | discriminator | DCGANDiscriminator | 2.8 M 
[2m[36m(pid=2527)[0m 2 | criterion     | BCEWithLogitsLoss  | 0     
[2m[36m(pid=2527)[0m 3 | modelF        | ResNet             | 11.2 M
[2m[36m(pid=2527)[0m 4 | lastF         | Sequential         | 1.5 K 
[2m[36m(pid=2527)[0m 5 | modelJ        | ResNet             | 11.2 M
[2m[36m(pid=2527)[0m 6 | lastJ         | Sequential         | 1.5 K 
[2m[36m(pid=2527)[0m 7 | val_metrics   | MetricCollection   | 22.3 M
[2m[36m(pid=2527)[0m ---------------

[2m[36m(pid=2527)[0m Validation sanity check: 0it [00:00, ?it/s]Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(pid=2527)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
[2m[36m(pid=2527)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Epoch 4:   0%|          | 0/1030 [00:00<00:00, 4675.92it/s]  
Epoch 4:  10%|█         | 103/1030 [00:13<02:04,  7.43it/s, loss=1.18, v_num=.]
Epoch 4:  20%|██        | 206/1030 [00:27<01:49,  7.49it/s, loss=1.73, v_num=.]
Epoch 4:  30%|███       | 309/1030 [00:41<01:36,  7.49it/s, loss=1.6, v_num=.] 
Epoch 4:  40%|████      | 412/1030 [00:55<01:22,  7.50it/s, loss=1.56, v_num=.]
Epoch 4:  50%|█████     | 515/1030 [01:08<01:08,  7.51it/s, loss=1.42, v_num=.]
Epoch 4:  60%|██████    | 618/1030 [01:22<00:54,  7.51it/s, loss=1.52, v_num=.]
Epoch 4:  70%|███████   | 721/1030 [01:36<00:41,  7.50it/s, loss=1.15, v_num=.]
Epoch 4:  80%|████████  | 824/1030 [01:49<00:27,  7.51it/s, loss=1.19, v_num=.]
Epoch 4:  90%|█████████ | 927/1030 [02:03<00:13,  7.50it/s, loss=1.33, v_num=.]
Epoch 4: 100%|██████████| 1030/1030 [02:04<00:00,  8.30it/s, loss=1.33, v_num=.]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/93 [00:00<?, ?it/s][A
[2m[36m(pid=2527)[0m 
Validating: 100%|███████

[2m[36m(pid=2527)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."
2021-09-07 23:37:36,637	INFO pbt.py:543 -- [exploit] transferring weights from trial train_Stage1_70527_00002 (score 0.5) -> train_Stage1_70527_00003 (score 0.48099344968795776)
2021-09-07 23:37:36,639	INFO pbt.py:558 -- [explore] perturbed config from {'lr': 0.0007956865814995524, 'bs': 128} -> {'lr': 0.0006365492651996419, 'bs': 128}


Result for train_Stage1_70527_00003:
  FID: 8.15625
  FID_cross: 17.171875
  auroc: 0.48099344968795776
  auroc_cross: 0.4998963177204132
  date: 2021-09-07_23-37-36
  done: false
  experiment_id: 9b670fdf81a34eecb2b1085a111512ff
  hostname: e89b1a816e20
  iterations_since_restore: 1
  loss_D: 1.7420072555541992
  loss_G: 1.999516487121582
  node_ip: 172.28.0.2
  pid: 2527
  should_checkpoint: true
  time_since_restore: 145.85594201087952
  time_this_iter_s: 145.85594201087952
  time_total_s: 723.5387139320374
  timestamp: 1631057856
  timesteps_since_restore: 0
  training_iteration: 5
  trial_id: '70527_00003'
  


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00003,RUNNING,172.28.0.2:2527,0.000636549,16,128,1.99952,1.74201,8.15625,0.480993,17.1719,0.499896,5
train_Stage1_70527_00001,PAUSED,,8e-05,8,8,11.3789,1.10119e-05,80.25,0.485434,69.5,0.472704,5
train_Stage1_70527_00002,PAUSED,,0.000795687,16,128,9.64948,8.63587e-05,491.75,0.5,552.0,0.5,5
train_Stage1_70527_00000,PENDING,,0.0001,8,8,3.97842,0.0607373,44.7188,0.5,50.9688,0.487997,5


[2m[36m(pid=2527)[0m 2021-09-07 23:37:37,330	INFO trainable.py:383 -- Restored on 172.28.0.2 from checkpoint: /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00003_3_n_fmaps=64_2021-09-07_23-03-45/checkpoint_tmp0bd2d7/./
[2m[36m(pid=2527)[0m 2021-09-07 23:37:37,330	INFO trainable.py:390 -- Current state after restoring: {'_iteration': 5, '_timesteps_total': None, '_time_total': 423.4282560348511, '_episodes_total': None}


[2m[36m(pid=2527)[0m Epoch 4: 100%|██████████| 1030/1030 [02:16<00:00,  7.53it/s, loss=1.33, v_num=.]


[2m[36m(pid=2527)[0m 2021-09-07 23:37:38,828	INFO trainable.py:76 -- Checkpoint size is 96244802 bytes
2021-09-07 23:37:41,536	INFO trainable.py:76 -- Checkpoint size is 91896386 bytes


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00000,RUNNING,,0.0001,8,8,3.97842,0.0607373,44.7188,0.5,50.9688,0.487997,5
train_Stage1_70527_00002,PAUSED,,0.000795687,16,128,9.64948,8.63587e-05,491.75,0.5,552.0,0.5,5
train_Stage1_70527_00003,PAUSED,,0.000636549,16,128,1.99952,1.74201,8.15625,0.480993,17.1719,0.499896,5
train_Stage1_70527_00001,PENDING,,8e-05,8,8,11.3789,1.10119e-05,80.25,0.485434,69.5,0.472704,5


[2m[36m(pid=2634)[0m 2021-09-07 23:37:44,928	INFO trainable.py:383 -- Restored on 172.28.0.2 from checkpoint: /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00000_0_n_fmaps=8_2021-09-07_23-00-47/checkpoint_tmpc1466d/./
[2m[36m(pid=2634)[0m 2021-09-07 23:37:44,928	INFO trainable.py:390 -- Current state after restoring: {'_iteration': 5, '_timesteps_total': None, '_time_total': 426.3791136741638, '_episodes_total': None}
[2m[36m(pid=2634)[0m Using native 16bit precision.
[2m[36m(pid=2634)[0m GPU available: True, used: True
[2m[36m(pid=2634)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=2634)[0m IPU available: False, using: 0 IPUs
[2m[36m(pid=2634)[0m Restoring states from the checkpoint file at /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00000_0_n_fmaps=8_2021-09-07_23-00-47/checkpoint_tmpc1466d/./checkpoint
[2m[36m(pid=2634)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00000,RUNNING,,0.0001,8,8,3.97842,0.0607373,44.7188,0.5,50.9688,0.487997,5
train_Stage1_70527_00002,PAUSED,,0.000795687,16,128,9.64948,8.63587e-05,491.75,0.5,552.0,0.5,5
train_Stage1_70527_00003,PAUSED,,0.000636549,16,128,1.99952,1.74201,8.15625,0.480993,17.1719,0.499896,5
train_Stage1_70527_00001,PENDING,,8e-05,8,8,11.3789,1.10119e-05,80.25,0.485434,69.5,0.472704,5


[2m[36m(pid=2634)[0m Restored all states from the checkpoint file at /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00000_0_n_fmaps=8_2021-09-07_23-00-47/checkpoint_tmpc1466d/./checkpoint
[2m[36m(pid=2634)[0m 
[2m[36m(pid=2634)[0m   | Name          | Type               | Params
[2m[36m(pid=2634)[0m -----------------------------------------------------
[2m[36m(pid=2634)[0m 0 | generator     | DCGANGenerator     | 146 K 
[2m[36m(pid=2634)[0m 1 | discriminator | DCGANDiscriminator | 44.4 K
[2m[36m(pid=2634)[0m 2 | criterion     | BCEWithLogitsLoss  | 0     
[2m[36m(pid=2634)[0m 3 | modelF        | ResNet             | 11.2 M
[2m[36m(pid=2634)[0m 4 | lastF         | Sequential         | 1.5 K 
[2m[36m(pid=2634)[0m 5 | modelJ        | ResNet             | 11.2 M
[2m[36m(pid=2634)[0m 6 | lastJ         | Sequential         | 1.5 K 
[2m[36m(pid=2634)[0m 7 | val_metrics   | MetricCollection   | 22.3 M
[2m[36m(pid=2634)[0m ----------------

[2m[36m(pid=2634)[0m Validation sanity check: 0it [00:00, ?it/s]Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(pid=2634)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
[2m[36m(pid=2634)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Epoch 5:   0%|          | 0/1030 [00:00<00:00, 4350.94it/s]  
Epoch 5:  10%|█         | 103/1030 [00:14<02:06,  7.31it/s, loss=1.93, v_num=.]
Epoch 5:  20%|██        | 206/1030 [00:22<01:29,  9.18it/s, loss=2.14, v_num=.]
Epoch 5:  30%|███       | 309/1030 [00:29<01:09, 10.43it/s, loss=2.09, v_num=.]
Epoch 5:  40%|████      | 412/1030 [00:36<00:55, 11.18it/s, loss=2.03, v_num=.]
Epoch 5:  50%|█████     | 515/1030 [00:44<00:44, 11.70it/s, loss=1.99, v_num=.]
Epoch 5:  60%|██████    | 618/1030 [00:51<00:34, 12.06it/s, loss=1.7, v_num=.] 




Epoch 5:  70%|███████   | 721/1030 [00:58<00:25, 12.30it/s, loss=1.94, v_num=.]
Epoch 5:  80%|████████  | 824/1030 [01:05<00:16, 12.53it/s, loss=1.88, v_num=.]
Epoch 5:  90%|█████████ | 927/1030 [01:13<00:08, 12.71it/s, loss=1.83, v_num=.]
Epoch 5: 100%|██████████| 1030/1030 [01:13<00:00, 14.02it/s, loss=1.83, v_num=.]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/93 [00:00<?, ?it/s][A
[2m[36m(pid=2634)[0m 
Validating: 100%|██████████| 93/93 [00:08<00:00, 10.56it/s][A


[2m[36m(pid=2634)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_Stage1_70527_00000:
  FID: 7.26171875
  FID_cross: 15.4453125
  auroc: 0.5
  auroc_cross: 0.5188722014427185
  date: 2021-09-07_23-39-17
  done: false
  experiment_id: f4cb33f5595841f8bba99795ccc25185
  hostname: e89b1a816e20
  iterations_since_restore: 1
  loss_D: 0.1962926983833313
  loss_G: 3.5838091373443604
  node_ip: 172.28.0.2
  pid: 2634
  should_checkpoint: true
  time_since_restore: 92.48306369781494
  time_this_iter_s: 92.48306369781494
  time_total_s: 518.8621773719788
  timestamp: 1631057957
  timesteps_since_restore: 0
  training_iteration: 6
  trial_id: '70527_00000'
  


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00000,RUNNING,172.28.0.2:2634,0.0001,8,8,3.58381,0.196293,7.26172,0.5,15.4453,0.518872,6
train_Stage1_70527_00002,PAUSED,,0.000795687,16,128,9.64948,8.63587e-05,491.75,0.5,552.0,0.5,5
train_Stage1_70527_00003,PAUSED,,0.000636549,16,128,1.99952,1.74201,8.15625,0.480993,17.1719,0.499896,5
train_Stage1_70527_00001,PENDING,,8e-05,8,8,11.3789,1.10119e-05,80.25,0.485434,69.5,0.472704,5


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_Stage1_70527_00000,RUNNING,172.28.0.2:2634,0.0001,8,8,3.58381,0.196293,7.26172,0.5,15.4453,0.518872,6
train_Stage1_70527_00002,PAUSED,,0.000795687,16,128,9.64948,8.63587e-05,491.75,0.5,552.0,0.5,5
train_Stage1_70527_00003,PAUSED,,0.000636549,16,128,1.99952,1.74201,8.15625,0.480993,17.1719,0.499896,5
train_Stage1_70527_00001,PENDING,,8e-05,8,8,11.3789,1.10119e-05,80.25,0.485434,69.5,0.472704,5


2021-09-07 23:39:17,832	ERROR tune.py:557 -- Trials did not complete: [train_Stage1_70527_00000, train_Stage1_70527_00001, train_Stage1_70527_00002, train_Stage1_70527_00003]
2021-09-07 23:39:17,834	INFO tune.py:561 -- Total run time: 2310.31 seconds (2309.96 seconds for the tuning loop).


Best checkpoint path found is:  /content/drive/MyDrive/Logs/F/Stage1/pbt_tanh/train_Stage1_70527_00000_0_n_fmaps=8_2021-09-07_23-00-47/checkpoint_epoch=1-step=1873/


In [None]:
# Upload the TensorBoard event files found under the given log directory to TensorBoard.dev.
# NOTE(review): the Stage1 run in this notebook checkpointed under .../Stage1/pbt_tanh,
# not .../Stage1/test — confirm this is the intended --logdir.
!tensorboard dev upload --logdir /content/drive/MyDrive/Logs/F/Stage1/test

In [None]:
# Flush pending writes and unmount the drive so that logs/checkpoints are persisted.
# Presumably `drive` is `google.colab.drive`, imported in an earlier cell — TODO confirm.
drive.flush_and_unmount()

## LensGAN128

In [41]:
# WARNING: irreversibly deletes the LensGAN128 PBT experiment directory (logs and checkpoints).
# The tuning cell below writes to this same directory and uses resume='PROMPT', so run this
# only when a fresh experiment is wanted rather than a resumed one.
%rm -rf drive/MyDrive/Logs/F/LensGAN128/pbt_tanh/

In [43]:
# __tune_train_checkpoint_begin
def train_LensGAN128(config, checkpoint_dir=None, num_epochs=10, num_gpus=torch.cuda.device_count()):
    """Fit one ``LensGAN128`` trial and report its metrics to Ray Tune.

    Args:
        config: Hyperparameter dict for the trial. ``config['bs']`` is read
            here to size the progress-bar refresh interval; the whole dict is
            forwarded unchanged to ``npyImageData`` and ``LensGAN128``.
        checkpoint_dir: When not ``None``, training resumes from the file
            named ``checkpoint`` inside this directory (presumably supplied
            by Tune when a trial is restored — confirm against the Tune docs).
        num_epochs: Passed to the trainer as ``max_epochs``.
        num_gpus: GPU count for the trainer. Rounded up, because a fractional
            value may be passed in. The default is evaluated once, when the
            function is defined.

    Returns:
        None. Results reach Tune through ``TuneReportCheckpointCallback``.
    """
    # Only this share of the train/val batches is used in each epoch.
    subset_fraction = 0.1
    # NOTE(review): 8250 looks like the training-set size — confirm.
    refresh_every = math.ceil(8250*subset_fraction//config['bs'])
    # If fractional GPUs passed in, convert to int.
    gpu_count = math.ceil(num_gpus)

    # Write TensorBoard events straight into the trial directory.
    tb_logger = TensorBoardLogger(save_dir=tune.get_trial_dir(), name='', version='.')

    # Tune-side metric name -> name of the metric logged by the model.
    # Switch up the FID values when training on a different dataset.
    reported_metrics = {
        'loss_G': 'Stage1/G/train/loss',
        'loss_D': 'Stage1/D/train/loss',
        'FID': 'Stage1/val/FID_F',
        'FID_cross': 'Stage1/val/FID_J',
        'auroc': 'Stage1/LensResnet(F)/val/auroc',
        'auroc_cross': 'Stage1/LensResnet(J)/val/auroc',
    }
    trial_callbacks = [
        TuneReportCheckpointCallback(reported_metrics),
        ModuleDataMonitor(True),
    ]

    # Left disabled by the author: 'stochastic_weight_avg' (works with only one
    # optimizer) and gradient clipping ('gradient_clip_val': 0.5,
    # 'gradient_clip_algorithm': 'value').
    trainer_kwargs = dict(
        limit_train_batches=subset_fraction,
        limit_val_batches=subset_fraction,
        progress_bar_refresh_rate=refresh_every,
        max_epochs=num_epochs,
        prepare_data_per_node=False,
        gpus=gpu_count,
        logger=tb_logger,
        callbacks=trial_callbacks,
        benchmark=True,
        precision=16,
    )

    # Specify image width here (second argument).
    datamodule = npyImageData(config, 128)

    if checkpoint_dir is not None:
        # Let the trainer restore its state from the checkpoint file; the
        # alternative LensGAN128.load_from_checkpoint(..., config=config)
        # was left disabled by the author.
        trainer_kwargs['resume_from_checkpoint'] = os.path.join(checkpoint_dir, 'checkpoint')

    gan = LensGAN128(config)
    pl.Trainer(**trainer_kwargs).fit(gan, datamodule)
# __tune_train_checkpoint_end__

# __tune_pbt_begin__
def tune_LensGAN128_pbt(num_samples=10, num_epochs=10, gpus_per_trial=torch.cuda.device_count()):
    """Tune ``train_LensGAN128`` with Population Based Training and return the best checkpoint.

    The search grid covers ``n_fmaps`` in {8, 16, 32, 64}; every trial starts
    from ``lr=1e-4`` and ``bs=8``, and PBT mutates ``lr`` and ``bs`` after each
    training iteration (``perturbation_interval=1``). Trials are ranked by the
    reported ``FID`` metric, lower being better. Results are stored under
    ``./drive/MyDrive/Logs/F/LensGAN128/pbt_tanh``; because ``resume='PROMPT'``
    is used, the call asks interactively whether to resume an experiment
    already present there.

    Args:
        num_samples: Forwarded to ``tune.run`` as ``num_samples``.
        num_epochs: Forwarded to ``train_LensGAN128`` as ``num_epochs``.
        gpus_per_trial: GPUs requested for each trial and forwarded to
            ``train_LensGAN128`` as ``num_gpus``. The default is evaluated
            once, when the function is defined.

    Returns:
        The value of ``analysis.best_checkpoint`` (it is also printed).
        Previously this was stored in a local variable and lost on return;
        assign the result to a notebook-level name if a later run should
        pass it as ``restore=``.
    """
    analysis = tune.run(
        tune.with_parameters(
            train_LensGAN128,
            num_epochs=num_epochs,
            num_gpus=gpus_per_trial
        ),
        # Change the folder name when changing dataset.
        name='F/LensGAN128/pbt_tanh',
        metric='FID',
        mode='min',
        # stop=TrialPlateauStopper('FID'),
        # Each trial requests every CPU plus the requested GPUs.
        resources_per_trial={'cpu': os.cpu_count(), 'gpu': gpus_per_trial},
        local_dir='./drive/MyDrive/Logs',
        # Alternative search space, left disabled:
        # config={'lr': tune.choice([1e-4, 1e-3, 1e-5, 1e-2, 1e-6, 1e-1, 1e-7]),
        #         'bs': tune.grid_search([8, 16, 32, 64, 128]),
        #         },
        # scheduler = pbtScheduler(max_t=num_epochs, grace_period=2, reduction_factor=2),
        # Author's note: can't use "RB2" (presumably PB2 — confirm) as it
        # requires mutations to be continuous.
        config={'lr': 1e-4,
                'n_fmaps': tune.grid_search([8, 16, 32, 64]),
                'bs': 8,
                },
        # config = {'lr': 2.340983544823817e-05, 'n_fmaps': 32, 'bs': 8},
        scheduler = PopulationBasedTraining(time_attr='training_iteration', quantile_fraction=0.25,
                                            resample_probability=0.25,  perturbation_interval=1,
                                            hyperparam_mutations={
                                                'lr': tune.loguniform(1e-7, 1e-1),
                                                'bs': [8, 16, 32, 64, 128],
                                            },
        ),
        progress_reporter=JupyterNotebookReporter(
            overwrite=False,
            parameter_columns=['lr', 'n_fmaps', 'bs'],
            metric_columns=['loss_G', 'loss_D', 'FID', 'auroc', 'FID_cross', 
                            'auroc_cross', 'training_iteration'],
        ),
        fail_fast = True,
        # reuse_actors=True,
        num_samples=num_samples,
        resume='PROMPT',
        # restore=BEST_J_LensGAN128,
    )
    # Read the property once, then both report and return it so the caller
    # can actually use the result.
    best_checkpoint = analysis.best_checkpoint
    print('Best checkpoint path found is: ', best_checkpoint)
    return best_checkpoint

# __tune_pbt_end__

if __name__ == '__main__':
    import argparse

    # Optional --smoke-test switch; parse_known_args ignores any arguments it
    # does not recognise instead of exiting with an error.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--smoke-test', action='store_true', help='Finish quickly for testing')
    args, _ = parser.parse_known_args()

    # pbt scheduler: a smoke test only shortens the run (6 epochs instead of 10);
    # every other argument is the same in both modes.
    tune_LensGAN128_pbt(
        num_samples=1,
        num_epochs=6 if args.smoke_test else 10,
        gpus_per_trial=torch.cuda.device_count(),
    )

Resume from local directory? (/content/drive/MyDrive/Logs/F/LensGAN128/pbt_tanh) [y/N]: y


2021-09-08 00:53:40,734	INFO tune.py:484 -- TrialRunner resumed, ignoring new add_experiment.


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_LensGAN128_5fa35_00003,PAUSED,,0.0001,64,8,1.65938,0.786239,22.25,0.492572,19.9219,0.49814,2
train_LensGAN128_5fa35_00000,PENDING,,0.00012,32,8,10.6631,2.57638e-05,117.688,0.501384,156.75,0.5,3
train_LensGAN128_5fa35_00002,PENDING,,0.0001,32,8,10.2784,4.06049e-05,95.125,0.509944,170.75,0.5,2
train_LensGAN128_5fa35_00001,PENDING,,0.0001,16,8,9.88384,8.79142e-05,542.5,0.4942,647.5,0.491456,2


2021-09-08 00:53:46,762	INFO trainable.py:76 -- Checkpoint size is 166470866 bytes


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_LensGAN128_5fa35_00000,RUNNING,,0.00012,32,8,10.6631,2.57638e-05,117.688,0.501384,156.75,0.5,3
train_LensGAN128_5fa35_00003,PAUSED,,0.0001,64,8,1.65938,0.786239,22.25,0.492572,19.9219,0.49814,2
train_LensGAN128_5fa35_00002,PENDING,,0.0001,32,8,10.2784,4.06049e-05,95.125,0.509944,170.75,0.5,2
train_LensGAN128_5fa35_00001,PENDING,,0.0001,16,8,9.88384,8.79142e-05,542.5,0.4942,647.5,0.491456,2


[2m[36m(pid=4486)[0m 2021-09-08 00:53:48,299	INFO trainable.py:383 -- Restored on 172.28.0.2 from checkpoint: /content/drive/MyDrive/Logs/F/LensGAN128/pbt_tanh/train_LensGAN128_5fa35_00000_0_n_fmaps=8_2021-09-08_00-04-45/checkpoint_tmpd456ca/./
[2m[36m(pid=4486)[0m 2021-09-08 00:53:48,299	INFO trainable.py:390 -- Current state after restoring: {'_iteration': 3, '_timesteps_total': None, '_time_total': 676.1944332122803, '_episodes_total': None}
[2m[36m(pid=4486)[0m Using native 16bit precision.
[2m[36m(pid=4486)[0m GPU available: True, used: True
[2m[36m(pid=4486)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=4486)[0m IPU available: False, using: 0 IPUs
[2m[36m(pid=4486)[0m Restoring states from the checkpoint file at /content/drive/MyDrive/Logs/F/LensGAN128/pbt_tanh/train_LensGAN128_5fa35_00000_0_n_fmaps=8_2021-09-08_00-04-45/checkpoint_tmpd456ca/./checkpoint
[2m[36m(pid=4486)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_LensGAN128_5fa35_00000,RUNNING,,0.00012,32,8,10.6631,2.57638e-05,117.688,0.501384,156.75,0.5,3
train_LensGAN128_5fa35_00003,PENDING,,0.0001,64,8,1.65938,0.786239,22.25,0.492572,19.9219,0.49814,2
train_LensGAN128_5fa35_00002,PENDING,,0.0001,32,8,10.2784,4.06049e-05,95.125,0.509944,170.75,0.5,2
train_LensGAN128_5fa35_00001,PENDING,,0.0001,16,8,9.88384,8.79142e-05,542.5,0.4942,647.5,0.491456,2


[2m[36m(pid=4486)[0m Restored all states from the checkpoint file at /content/drive/MyDrive/Logs/F/LensGAN128/pbt_tanh/train_LensGAN128_5fa35_00000_0_n_fmaps=8_2021-09-08_00-04-45/checkpoint_tmpd456ca/./checkpoint
[2m[36m(pid=4486)[0m 
[2m[36m(pid=4486)[0m   | Name          | Type               | Params
[2m[36m(pid=4486)[0m -----------------------------------------------------
[2m[36m(pid=4486)[0m 0 | generator     | DCGANGenerator     | 3.6 M 
[2m[36m(pid=4486)[0m 1 | discriminator | DCGANDiscriminator | 2.8 M 
[2m[36m(pid=4486)[0m 2 | criterion     | BCEWithLogitsLoss  | 0     
[2m[36m(pid=4486)[0m 3 | modelF        | ResNet             | 11.2 M
[2m[36m(pid=4486)[0m 4 | lastF         | Sequential         | 1.5 K 
[2m[36m(pid=4486)[0m 5 | modelJ        | ResNet             | 11.2 M
[2m[36m(pid=4486)[0m 6 | lastJ         | Sequential         | 1.5 K 
[2m[36m(pid=4486)[0m 7 | val_metrics   | MetricCollection   | 22.3 M
[2m[36m(pid=4486)[0m --------

[2m[36m(pid=4486)[0m Validation sanity check: 0it [00:00, ?it/s]Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(pid=4486)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_LensGAN128_5fa35_00000,RUNNING,,0.00012,32,8,10.6631,2.57638e-05,117.688,0.501384,156.75,0.5,3
train_LensGAN128_5fa35_00003,PENDING,,0.0001,64,8,1.65938,0.786239,22.25,0.492572,19.9219,0.49814,2
train_LensGAN128_5fa35_00002,PENDING,,0.0001,32,8,10.2784,4.06049e-05,95.125,0.509944,170.75,0.5,2
train_LensGAN128_5fa35_00001,PENDING,,0.0001,16,8,9.88384,8.79142e-05,542.5,0.4942,647.5,0.491456,2


[2m[36m(pid=4486)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Epoch 3:   0%|          | 0/1030 [00:00<00:00, 3251.40it/s]  
Epoch 3:  10%|█         | 103/1030 [00:23<03:27,  4.47it/s, loss=5.61, v_num=.]
Epoch 3:  20%|██        | 206/1030 [00:46<03:03,  4.48it/s, loss=5.77, v_num=.]
Epoch 3:  30%|███       | 309/1030 [01:09<02:40,  4.49it/s, loss=5.83, v_num=.]
Epoch 3:  40%|████      | 412/1030 [01:31<02:17,  4.50it/s, loss=5.89, v_num=.]
Epoch 3:  50%|█████     | 515/1030 [01:54<01:54,  4.49it/s, loss=5.97, v_num=.]
Epoch 3:  60%|██████    | 618/1030 [02:17<01:31,  4.49it/s, loss=6.04, v_num=.]
Epoch 3:  70%|███████   | 721/1030 [02:40<01:08,  4.49it/s, loss=6.04, v_num=.]
Epoch 3:  80%|████████  | 824/1030 [03:03<00:45,  4.49it/s, loss=6.12, v_num=.]
Epoch 3:  90%|█████████ | 927/1030 [03:26<00:22,  4.49it/s, loss=6.13, v_num=.]
Epoch 3: 100%|██████████| 1030/1030 [03:27<00:00,  4.97it/s, loss=6.13, v_num=.]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/93 [00:00<?, ?it/s][A
[2m[36m(pid=4486)[0m 
Validating: 100%|███████

[2m[36m(pid=4486)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


[2m[36m(pid=4486)[0m Epoch 3: 100%|██████████| 1030/1030 [03:37<00:00,  4.74it/s, loss=6.13, v_num=.]
Result for train_LensGAN128_5fa35_00000:
  FID: 89.1875
  FID_cross: 109.6875
  auroc: 0.5095658302307129
  auroc_cross: 0.4978981018066406
  date: 2021-09-08_00-57-35
  done: false
  experiment_id: ea0cffa9423142eda364d8edd8601bb1
  hostname: e89b1a816e20
  iterations_since_restore: 1
  loss_D: 4.8875717766350135e-06
  loss_G: 12.399417877197266
  node_ip: 172.28.0.2
  pid: 4486
  should_checkpoint: true
  time_since_restore: 227.30387330055237
  time_this_iter_s: 227.30387330055237
  time_total_s: 903.4983065128326
  timestamp: 1631062655
  timesteps_since_restore: 0
  training_iteration: 4
  trial_id: 5fa35_00000
  


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_LensGAN128_5fa35_00000,RUNNING,172.28.0.2:4486,0.00012,32,8,12.3994,4.88757e-06,89.1875,0.509566,109.688,0.497898,4
train_LensGAN128_5fa35_00003,PENDING,,0.0001,64,8,1.65938,0.786239,22.25,0.492572,19.9219,0.49814,2
train_LensGAN128_5fa35_00002,PENDING,,0.0001,32,8,10.2784,4.06049e-05,95.125,0.509944,170.75,0.5,2
train_LensGAN128_5fa35_00001,PENDING,,0.0001,16,8,9.88384,8.79142e-05,542.5,0.4942,647.5,0.491456,2


[2m[36m(pid=4486)[0m 2021-09-08 00:57:37,506	INFO trainable.py:76 -- Checkpoint size is 166470866 bytes
2021-09-08 00:57:47,628	INFO trainable.py:76 -- Checkpoint size is 377014546 bytes


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_LensGAN128_5fa35_00003,RUNNING,,0.0001,64,8,1.65938,0.786239,22.25,0.492572,19.9219,0.49814,2
train_LensGAN128_5fa35_00000,PAUSED,,0.00012,32,8,12.3994,4.88757e-06,89.1875,0.509566,109.688,0.497898,4
train_LensGAN128_5fa35_00002,PENDING,,0.0001,32,8,10.2784,4.06049e-05,95.125,0.509944,170.75,0.5,2
train_LensGAN128_5fa35_00001,PENDING,,0.0001,16,8,9.88384,8.79142e-05,542.5,0.4942,647.5,0.491456,2


[2m[36m(pid=4575)[0m 2021-09-08 00:57:50,074	INFO trainable.py:383 -- Restored on 172.28.0.2 from checkpoint: /content/drive/MyDrive/Logs/F/LensGAN128/pbt_tanh/train_LensGAN128_5fa35_00003_3_n_fmaps=64_2021-09-08_00-09-33/checkpoint_tmpd86287/./
[2m[36m(pid=4575)[0m 2021-09-08 00:57:50,074	INFO trainable.py:390 -- Current state after restoring: {'_iteration': 2, '_timesteps_total': None, '_time_total': 821.8101160526276, '_episodes_total': None}
[2m[36m(pid=4575)[0m Using native 16bit precision.
[2m[36m(pid=4575)[0m GPU available: True, used: True
[2m[36m(pid=4575)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=4575)[0m IPU available: False, using: 0 IPUs
[2m[36m(pid=4575)[0m Restoring states from the checkpoint file at /content/drive/MyDrive/Logs/F/LensGAN128/pbt_tanh/train_LensGAN128_5fa35_00003_3_n_fmaps=64_2021-09-08_00-09-33/checkpoint_tmpd86287/./checkpoint
[2m[36m(pid=4575)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_LensGAN128_5fa35_00003,RUNNING,,0.0001,64,8,1.65938,0.786239,22.25,0.492572,19.9219,0.49814,2
train_LensGAN128_5fa35_00000,PAUSED,,0.00012,32,8,12.3994,4.88757e-06,89.1875,0.509566,109.688,0.497898,4
train_LensGAN128_5fa35_00002,PENDING,,0.0001,32,8,10.2784,4.06049e-05,95.125,0.509944,170.75,0.5,2
train_LensGAN128_5fa35_00001,PENDING,,0.0001,16,8,9.88384,8.79142e-05,542.5,0.4942,647.5,0.491456,2


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_LensGAN128_5fa35_00003,RUNNING,,0.0001,64,8,1.65938,0.786239,22.25,0.492572,19.9219,0.49814,2
train_LensGAN128_5fa35_00000,PAUSED,,0.00012,32,8,12.3994,4.88757e-06,89.1875,0.509566,109.688,0.497898,4
train_LensGAN128_5fa35_00002,PENDING,,0.0001,32,8,10.2784,4.06049e-05,95.125,0.509944,170.75,0.5,2
train_LensGAN128_5fa35_00001,PENDING,,0.0001,16,8,9.88384,8.79142e-05,542.5,0.4942,647.5,0.491456,2


[2m[36m(pid=4575)[0m Restored all states from the checkpoint file at /content/drive/MyDrive/Logs/F/LensGAN128/pbt_tanh/train_LensGAN128_5fa35_00003_3_n_fmaps=64_2021-09-08_00-09-33/checkpoint_tmpd86287/./checkpoint
[2m[36m(pid=4575)[0m 
[2m[36m(pid=4575)[0m   | Name          | Type               | Params
[2m[36m(pid=4575)[0m -----------------------------------------------------
[2m[36m(pid=4575)[0m 0 | generator     | DCGANGenerator     | 12.8 M
[2m[36m(pid=4575)[0m 1 | discriminator | DCGANDiscriminator | 11.2 M
[2m[36m(pid=4575)[0m 2 | criterion     | BCEWithLogitsLoss  | 0     
[2m[36m(pid=4575)[0m 3 | modelF        | ResNet             | 11.2 M
[2m[36m(pid=4575)[0m 4 | lastF         | Sequential         | 1.5 K 
[2m[36m(pid=4575)[0m 5 | modelJ        | ResNet             | 11.2 M
[2m[36m(pid=4575)[0m 6 | lastJ         | Sequential         | 1.5 K 
[2m[36m(pid=4575)[0m 7 | val_metrics   | MetricCollection   | 22.3 M
[2m[36m(pid=4575)[0m -------

[2m[36m(pid=4575)[0m Validation sanity check: 0it [00:00, ?it/s]Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(pid=4575)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
[2m[36m(pid=4575)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Epoch 2:   0%|          | 0/1030 [00:00<00:00, 4161.02it/s]  
Epoch 2:  10%|█         | 103/1030 [00:43<06:29,  2.38it/s, loss=1.63, v_num=.]
Epoch 2:  20%|██        | 206/1030 [01:26<05:44,  2.39it/s, loss=1.23, v_num=.]
Epoch 2:  30%|███       | 309/1030 [02:09<05:01,  2.39it/s, loss=1.49, v_num=.]
Epoch 2:  40%|████      | 412/1030 [02:52<04:17,  2.40it/s, loss=1.34, v_num=.]
Epoch 2:  50%|█████     | 515/1030 [03:35<03:34,  2.40it/s, loss=1.44, v_num=.]
Epoch 2:  60%|██████    | 618/1030 [04:18<02:51,  2.40it/s, loss=1.58, v_num=.]
Epoch 2:  70%|███████   | 721/1030 [05:01<02:08,  2.40it/s, loss=1.33, v_num=.]
Epoch 2:  80%|████████  | 824/1030 [05:43<01:25,  2.40it/s, loss=1.58, v_num=.]
Epoch 2:  90%|█████████ | 927/1030 [06:26<00:42,  2.40it/s, loss=1.45, v_num=.]
Epoch 2: 100%|██████████| 1030/1030 [06:28<00:00,  2.66it/s, loss=1.45, v_num=.]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/93 [00:00<?, ?it/s][A
[2m[36m(pid=4575)[0m 
Validating: 100%|███████

[2m[36m(pid=4575)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_LensGAN128_5fa35_00003:
  FID: 33.40625
  FID_cross: 30.1875
  auroc: 0.493870347738266
  auroc_cross: 0.46693044900894165
  date: 2021-09-08_01-04-43
  done: false
  experiment_id: d70fa34cadc242c5a19ba2deaa3a4f7c
  hostname: e89b1a816e20
  iterations_since_restore: 1
  loss_D: 0.8683701753616333
  loss_G: 3.6844687461853027
  node_ip: 172.28.0.2
  pid: 4575
  should_checkpoint: true
  time_since_restore: 413.25469303131104
  time_this_iter_s: 413.25469303131104
  time_total_s: 1235.0648090839386
  timestamp: 1631063083
  timesteps_since_restore: 0
  training_iteration: 3
  trial_id: 5fa35_00003
  


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_LensGAN128_5fa35_00003,RUNNING,172.28.0.2:4575,0.0001,64,8,3.68447,0.86837,33.4062,0.49387,30.1875,0.46693,3
train_LensGAN128_5fa35_00000,PAUSED,,0.00012,32,8,12.3994,4.88757e-06,89.1875,0.509566,109.688,0.497898,4
train_LensGAN128_5fa35_00002,PENDING,,0.0001,32,8,10.2784,4.06049e-05,95.125,0.509944,170.75,0.5,2
train_LensGAN128_5fa35_00001,PENDING,,0.0001,16,8,9.88384,8.79142e-05,542.5,0.4942,647.5,0.491456,2


[2m[36m(pid=4575)[0m 2021-09-08 01:04:44,519	INFO trainable.py:76 -- Checkpoint size is 377014546 bytes


[2m[36m(pid=4575)[0m Epoch 2: 100%|██████████| 1030/1030 [06:47<00:00,  2.53it/s, loss=1.45, v_num=.]


[2m[36m(pid=4575)[0m 2021-09-08 01:04:49,878	INFO trainable.py:76 -- Checkpoint size is 377014546 bytes
2021-09-08 01:04:50,238	INFO trainable.py:76 -- Checkpoint size is 166470866 bytes


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_LensGAN128_5fa35_00002,RUNNING,,0.0001,32,8,10.2784,4.06049e-05,95.125,0.509944,170.75,0.5,2
train_LensGAN128_5fa35_00000,PAUSED,,0.00012,32,8,12.3994,4.88757e-06,89.1875,0.509566,109.688,0.497898,4
train_LensGAN128_5fa35_00003,PAUSED,,0.0001,64,8,3.68447,0.86837,33.4062,0.49387,30.1875,0.46693,3
train_LensGAN128_5fa35_00001,PENDING,,0.0001,16,8,9.88384,8.79142e-05,542.5,0.4942,647.5,0.491456,2


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_LensGAN128_5fa35_00002,RUNNING,,0.0001,32,8,10.2784,4.06049e-05,95.125,0.509944,170.75,0.5,2
train_LensGAN128_5fa35_00000,PAUSED,,0.00012,32,8,12.3994,4.88757e-06,89.1875,0.509566,109.688,0.497898,4
train_LensGAN128_5fa35_00003,PAUSED,,0.0001,64,8,3.68447,0.86837,33.4062,0.49387,30.1875,0.46693,3
train_LensGAN128_5fa35_00001,PENDING,,0.0001,16,8,9.88384,8.79142e-05,542.5,0.4942,647.5,0.491456,2


[2m[36m(pid=4690)[0m 2021-09-08 01:04:56,906	INFO trainable.py:383 -- Restored on 172.28.0.2 from checkpoint: /content/drive/MyDrive/Logs/F/LensGAN128/pbt_tanh/train_LensGAN128_5fa35_00002_2_n_fmaps=32_2021-09-08_00-06-53/checkpoint_tmp4c71ff/./
[2m[36m(pid=4690)[0m 2021-09-08 01:04:56,906	INFO trainable.py:390 -- Current state after restoring: {'_iteration': 2, '_timesteps_total': None, '_time_total': 448.7720639705658, '_episodes_total': None}
[2m[36m(pid=4690)[0m Using native 16bit precision.
[2m[36m(pid=4690)[0m GPU available: True, used: True
[2m[36m(pid=4690)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=4690)[0m IPU available: False, using: 0 IPUs
[2m[36m(pid=4690)[0m Restoring states from the checkpoint file at /content/drive/MyDrive/Logs/F/LensGAN128/pbt_tanh/train_LensGAN128_5fa35_00002_2_n_fmaps=32_2021-09-08_00-06-53/checkpoint_tmp4c71ff/./checkpoint
[2m[36m(pid=4690)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2m[36m(pid=4690)[0m Rest

[2m[36m(pid=4690)[0m Validation sanity check: 0it [00:00, ?it/s]Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(pid=4690)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
[2m[36m(pid=4690)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Epoch 2:   0%|          | 0/1030 [00:00<00:00, 2041.02it/s] 
Epoch 2:  10%|█         | 103/1030 [00:22<03:22,  4.57it/s, loss=5.31, v_num=.]
Epoch 2:  20%|██        | 206/1030 [00:45<03:01,  4.55it/s, loss=5.34, v_num=.]
Epoch 2:  30%|███       | 309/1030 [01:08<02:39,  4.53it/s, loss=5.29, v_num=.]
Epoch 2:  40%|████      | 412/1030 [01:31<02:16,  4.51it/s, loss=5.37, v_num=.]
Epoch 2:  50%|█████     | 515/1030 [01:54<01:54,  4.51it/s, loss=5.41, v_num=.]
Epoch 2:  60%|██████    | 618/1030 [02:17<01:31,  4.51it/s, loss=5.41, v_num=.]
Epoch 2:  70%|███████   | 721/1030 [02:40<01:08,  4.51it/s, loss=5.49, v_num=.]
Epoch 2:  80%|████████  | 824/1030 [03:03<00:45,  4.51it/s, loss=5.51, v_num=.]
Epoch 2:  90%|█████████ | 927/1030 [03:25<00:22,  4.51it/s, loss=5.52, v_num=.]
Epoch 2: 100%|██████████| 1030/1030 [03:26<00:00,  4.99it/s, loss=5.52, v_num=.]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/93 [00:00<?, ?it/s][A
[2m[36m(pid=4690)[0m 
Validating: 100%|████████

[2m[36m(pid=4690)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


[2m[36m(pid=4690)[0m Epoch 2: 100%|██████████| 1030/1030 [03:37<00:00,  4.74it/s, loss=5.52, v_num=.]


2021-09-08 01:08:44,086	INFO pbt.py:543 -- [exploit] transferring weights from trial train_LensGAN128_5fa35_00003 (score -33.40625) -> train_LensGAN128_5fa35_00002 (score -129.625)
2021-09-08 01:08:44,101	INFO pbt.py:558 -- [explore] perturbed config from {'lr': 0.0001, 'bs': 8} -> {'lr': 0.001098640749730333, 'bs': 16}


Result for train_LensGAN128_5fa35_00002:
  FID: 129.625
  FID_cross: 163.5
  auroc: 0.49680691957473755
  auroc_cross: 0.5
  date: 2021-09-08_01-08-44
  done: false
  experiment_id: ea0cffa9423142eda364d8edd8601bb1
  hostname: e89b1a816e20
  iterations_since_restore: 1
  loss_D: 1.7002103049890138e-05
  loss_G: 11.419933319091797
  node_ip: 172.28.0.2
  pid: 4690
  should_checkpoint: true
  time_since_restore: 227.15362787246704
  time_this_iter_s: 227.15362787246704
  time_total_s: 675.9256918430328
  timestamp: 1631063324
  timesteps_since_restore: 0
  training_iteration: 3
  trial_id: 5fa35_00002
  


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_LensGAN128_5fa35_00002,RUNNING,172.28.0.2:4690,0.00109864,64,16,11.4199,1.70021e-05,129.625,0.496807,163.5,0.5,3
train_LensGAN128_5fa35_00000,PAUSED,,0.00012,32,8,12.3994,4.88757e-06,89.1875,0.509566,109.688,0.497898,4
train_LensGAN128_5fa35_00003,PAUSED,,0.0001,64,8,3.68447,0.86837,33.4062,0.49387,30.1875,0.46693,3
train_LensGAN128_5fa35_00001,PENDING,,0.0001,16,8,9.88384,8.79142e-05,542.5,0.4942,647.5,0.491456,2


[2m[36m(pid=4690)[0m 2021-09-08 01:08:47,014	INFO trainable.py:383 -- Restored on 172.28.0.2 from checkpoint: /content/drive/MyDrive/Logs/F/LensGAN128/pbt_tanh/train_LensGAN128_5fa35_00002_2_n_fmaps=32_2021-09-08_00-06-53/checkpoint_tmp991251/./
[2m[36m(pid=4690)[0m 2021-09-08 01:08:47,014	INFO trainable.py:390 -- Current state after restoring: {'_iteration': 3, '_timesteps_total': None, '_time_total': 1235.0648090839386, '_episodes_total': None}


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_LensGAN128_5fa35_00000,PAUSED,,0.00012,32,8,12.3994,4.88757e-06,89.1875,0.509566,109.688,0.497898,4
train_LensGAN128_5fa35_00003,PAUSED,,0.0001,64,8,3.68447,0.86837,33.4062,0.49387,30.1875,0.46693,3
train_LensGAN128_5fa35_00002,PAUSED,,0.00109864,64,16,11.4199,1.70021e-05,129.625,0.496807,163.5,0.5,3
train_LensGAN128_5fa35_00001,PENDING,,0.0001,16,8,9.88384,8.79142e-05,542.5,0.4942,647.5,0.491456,2


2021-09-08 01:08:53,662	INFO trainable.py:76 -- Checkpoint size is 111334034 bytes


[2m[36m(pid=4690)[0m 
[2m[36m(pid=4690)[0m Validating: 100%|██████████| 93/93 [00:21<00:00, 10.57it/s][A


[2m[36m(pid=4690)[0m 2021-09-08 01:08:54,123	INFO trainable.py:76 -- Checkpoint size is 377014546 bytes


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_LensGAN128_5fa35_00001,RUNNING,,0.0001,16,8,9.88384,8.79142e-05,542.5,0.4942,647.5,0.491456,2
train_LensGAN128_5fa35_00000,PAUSED,,0.00012,32,8,12.3994,4.88757e-06,89.1875,0.509566,109.688,0.497898,4
train_LensGAN128_5fa35_00002,PAUSED,,0.00109864,64,16,11.4199,1.70021e-05,129.625,0.496807,163.5,0.5,3
train_LensGAN128_5fa35_00003,PENDING,,0.0001,64,8,3.68447,0.86837,33.4062,0.49387,30.1875,0.46693,3


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_LensGAN128_5fa35_00001,RUNNING,,0.0001,16,8,9.88384,8.79142e-05,542.5,0.4942,647.5,0.491456,2
train_LensGAN128_5fa35_00000,PAUSED,,0.00012,32,8,12.3994,4.88757e-06,89.1875,0.509566,109.688,0.497898,4
train_LensGAN128_5fa35_00002,PAUSED,,0.00109864,64,16,11.4199,1.70021e-05,129.625,0.496807,163.5,0.5,3
train_LensGAN128_5fa35_00003,PENDING,,0.0001,64,8,3.68447,0.86837,33.4062,0.49387,30.1875,0.46693,3


[2m[36m(pid=4788)[0m 2021-09-08 01:09:01,482	INFO trainable.py:383 -- Restored on 172.28.0.2 from checkpoint: /content/drive/MyDrive/Logs/F/LensGAN128/pbt_tanh/train_LensGAN128_5fa35_00001_1_n_fmaps=16_2021-09-08_00-04-46/checkpoint_tmp6f3b0d/./
[2m[36m(pid=4788)[0m 2021-09-08 01:09:01,482	INFO trainable.py:390 -- Current state after restoring: {'_iteration': 2, '_timesteps_total': None, '_time_total': 315.5841588973999, '_episodes_total': None}
[2m[36m(pid=4788)[0m Using native 16bit precision.
[2m[36m(pid=4788)[0m GPU available: True, used: True
[2m[36m(pid=4788)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=4788)[0m IPU available: False, using: 0 IPUs
[2m[36m(pid=4788)[0m Restoring states from the checkpoint file at /content/drive/MyDrive/Logs/F/LensGAN128/pbt_tanh/train_LensGAN128_5fa35_00001_1_n_fmaps=16_2021-09-08_00-04-46/checkpoint_tmp6f3b0d/./checkpoint
[2m[36m(pid=4788)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2m[36m(pid=4788)[0m Rest

[2m[36m(pid=4788)[0m Validation sanity check: 0it [00:00, ?it/s]Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(pid=4788)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
[2m[36m(pid=4788)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Epoch 2:   0%|          | 0/1030 [00:00<00:00, 2618.17it/s]  
Epoch 2:  10%|█         | 103/1030 [00:15<02:16,  6.79it/s, loss=4.85, v_num=.]
Epoch 2:  20%|██        | 206/1030 [00:30<02:03,  6.69it/s, loss=4.89, v_num=.]
Epoch 2:  30%|███       | 309/1030 [00:46<01:48,  6.66it/s, loss=4.92, v_num=.]
Epoch 2:  40%|████      | 412/1030 [01:01<01:32,  6.66it/s, loss=4.84, v_num=.]
Epoch 2:  50%|█████     | 515/1030 [01:17<01:17,  6.67it/s, loss=4.88, v_num=.]
Epoch 2:  60%|██████    | 618/1030 [01:32<01:01,  6.68it/s, loss=5, v_num=.]   
Epoch 2:  70%|███████   | 721/1030 [01:48<00:46,  6.68it/s, loss=5.18, v_num=.]
Epoch 2:  80%|████████  | 824/1030 [02:03<00:30,  6.68it/s, loss=5.34, v_num=.]
Epoch 2:  90%|█████████ | 927/1030 [02:18<00:15,  6.68it/s, loss=5.39, v_num=.]
Epoch 2: 100%|██████████| 1030/1030 [02:19<00:00,  7.39it/s, loss=5.39, v_num=.]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/93 [00:00<?, ?it/s][A
[2m[36m(pid=4788)[0m 
Validating: 100%|███████

[2m[36m(pid=4788)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."
2021-09-08 01:11:45,030	INFO pbt.py:543 -- [exploit] transferring weights from trial train_LensGAN128_5fa35_00003 (score -33.40625) -> train_LensGAN128_5fa35_00001 (score -198.75)
2021-09-08 01:11:45,032	INFO pbt.py:558 -- [explore] perturbed config from {'lr': 0.0001, 'bs': 8} -> {'lr': 0.00012, 'bs': 8}


Result for train_LensGAN128_5fa35_00001:
  FID: 198.75
  FID_cross: 389.75
  auroc: 0.4966365396976471
  auroc_cross: 0.4956653118133545
  date: 2021-09-08_01-11-45
  done: false
  experiment_id: 64477961159b4495814f37f65b8b4591
  hostname: e89b1a816e20
  iterations_since_restore: 1
  loss_D: 3.0204355425667018e-05
  loss_G: 10.818380355834961
  node_ip: 172.28.0.2
  pid: 4788
  should_checkpoint: true
  time_since_restore: 163.53430032730103
  time_this_iter_s: 163.53430032730103
  time_total_s: 479.1184592247009
  timestamp: 1631063505
  timesteps_since_restore: 0
  training_iteration: 3
  trial_id: 5fa35_00001
  


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_LensGAN128_5fa35_00001,RUNNING,172.28.0.2:4788,0.00012,64,8,10.8184,3.02044e-05,198.75,0.496637,389.75,0.495665,3
train_LensGAN128_5fa35_00000,PAUSED,,0.00012,32,8,12.3994,4.88757e-06,89.1875,0.509566,109.688,0.497898,4
train_LensGAN128_5fa35_00002,PAUSED,,0.00109864,64,16,11.4199,1.70021e-05,129.625,0.496807,163.5,0.5,3
train_LensGAN128_5fa35_00003,PENDING,,0.0001,64,8,3.68447,0.86837,33.4062,0.49387,30.1875,0.46693,3


[2m[36m(pid=4788)[0m 2021-09-08 01:11:47,604	INFO trainable.py:383 -- Restored on 172.28.0.2 from checkpoint: /content/drive/MyDrive/Logs/F/LensGAN128/pbt_tanh/train_LensGAN128_5fa35_00001_1_n_fmaps=16_2021-09-08_00-04-46/checkpoint_tmpf34756/./
[2m[36m(pid=4788)[0m 2021-09-08 01:11:47,604	INFO trainable.py:390 -- Current state after restoring: {'_iteration': 3, '_timesteps_total': None, '_time_total': 1235.0648090839386, '_episodes_total': None}


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_LensGAN128_5fa35_00000,PAUSED,,0.00012,32,8,12.3994,4.88757e-06,89.1875,0.509566,109.688,0.497898,4
train_LensGAN128_5fa35_00002,PAUSED,,0.00109864,64,16,11.4199,1.70021e-05,129.625,0.496807,163.5,0.5,3
train_LensGAN128_5fa35_00001,PAUSED,,0.00012,64,8,10.8184,3.02044e-05,198.75,0.496637,389.75,0.495665,3
train_LensGAN128_5fa35_00003,PENDING,,0.0001,64,8,3.68447,0.86837,33.4062,0.49387,30.1875,0.46693,3


[2m[36m(pid=4788)[0m Epoch 2: 100%|██████████| 1030/1030 [02:37<00:00,  6.54it/s, loss=5.39, v_num=.]


[2m[36m(pid=4788)[0m 2021-09-08 01:11:57,598	INFO trainable.py:76 -- Checkpoint size is 377014546 bytes
2021-09-08 01:12:02,361	INFO trainable.py:76 -- Checkpoint size is 377014546 bytes


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_LensGAN128_5fa35_00003,RUNNING,,0.0001,64,8,3.68447,0.86837,33.4062,0.49387,30.1875,0.46693,3
train_LensGAN128_5fa35_00000,PAUSED,,0.00012,32,8,12.3994,4.88757e-06,89.1875,0.509566,109.688,0.497898,4
train_LensGAN128_5fa35_00002,PAUSED,,0.00109864,64,16,11.4199,1.70021e-05,129.625,0.496807,163.5,0.5,3
train_LensGAN128_5fa35_00001,PAUSED,,0.00012,64,8,10.8184,3.02044e-05,198.75,0.496637,389.75,0.495665,3


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_LensGAN128_5fa35_00003,RUNNING,,0.0001,64,8,3.68447,0.86837,33.4062,0.49387,30.1875,0.46693,3
train_LensGAN128_5fa35_00000,PAUSED,,0.00012,32,8,12.3994,4.88757e-06,89.1875,0.509566,109.688,0.497898,4
train_LensGAN128_5fa35_00001,PAUSED,,0.00012,64,8,10.8184,3.02044e-05,198.75,0.496637,389.75,0.495665,3
train_LensGAN128_5fa35_00002,PENDING,,0.00109864,64,16,11.4199,1.70021e-05,129.625,0.496807,163.5,0.5,3


[2m[36m(pid=4862)[0m 2021-09-08 01:12:09,404	INFO trainable.py:383 -- Restored on 172.28.0.2 from checkpoint: /content/drive/MyDrive/Logs/F/LensGAN128/pbt_tanh/train_LensGAN128_5fa35_00003_3_n_fmaps=64_2021-09-08_00-09-33/checkpoint_tmpc44cfa/./
[2m[36m(pid=4862)[0m 2021-09-08 01:12:09,405	INFO trainable.py:390 -- Current state after restoring: {'_iteration': 3, '_timesteps_total': None, '_time_total': 1235.0648090839386, '_episodes_total': None}


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_LensGAN128_5fa35_00003,RUNNING,,0.0001,64,8,3.68447,0.86837,33.4062,0.49387,30.1875,0.46693,3
train_LensGAN128_5fa35_00000,PAUSED,,0.00012,32,8,12.3994,4.88757e-06,89.1875,0.509566,109.688,0.497898,4
train_LensGAN128_5fa35_00001,PAUSED,,0.00012,64,8,10.8184,3.02044e-05,198.75,0.496637,389.75,0.495665,3
train_LensGAN128_5fa35_00002,PENDING,,0.00109864,64,16,11.4199,1.70021e-05,129.625,0.496807,163.5,0.5,3


[2m[36m(pid=4862)[0m Using native 16bit precision.
[2m[36m(pid=4862)[0m GPU available: True, used: True
[2m[36m(pid=4862)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=4862)[0m IPU available: False, using: 0 IPUs
[2m[36m(pid=4862)[0m Restoring states from the checkpoint file at /content/drive/MyDrive/Logs/F/LensGAN128/pbt_tanh/train_LensGAN128_5fa35_00003_3_n_fmaps=64_2021-09-08_00-09-33/checkpoint_tmpc44cfa/./checkpoint
[2m[36m(pid=4862)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2m[36m(pid=4862)[0m Restored all states from the checkpoint file at /content/drive/MyDrive/Logs/F/LensGAN128/pbt_tanh/train_LensGAN128_5fa35_00003_3_n_fmaps=64_2021-09-08_00-09-33/checkpoint_tmpc44cfa/./checkpoint
[2m[36m(pid=4862)[0m 
[2m[36m(pid=4862)[0m   | Name          | Type               | Params
[2m[36m(pid=4862)[0m -----------------------------------------------------
[2m[36m(pid=4862)[0m 0 | generator     | DCGANGenerator     | 12.8 M
[2m[36m(pid=4862)

[2m[36m(pid=4862)[0m Validation sanity check: 0it [00:00, ?it/s]Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(pid=4862)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
[2m[36m(pid=4862)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Epoch 3:   0%|          | 0/1030 [00:00<00:00, 3865.72it/s]  
Epoch 3:  10%|█         | 103/1030 [00:43<06:29,  2.38it/s, loss=1.54, v_num=.]
Epoch 3:  20%|██        | 206/1030 [01:26<05:44,  2.39it/s, loss=1.76, v_num=.]
Epoch 3:  30%|███       | 309/1030 [02:09<05:01,  2.39it/s, loss=1.64, v_num=.]
Epoch 3:  40%|████      | 412/1030 [02:52<04:18,  2.39it/s, loss=1.43, v_num=.]
Epoch 3:  50%|█████     | 515/1030 [03:35<03:35,  2.39it/s, loss=1.58, v_num=.]
Epoch 3:  60%|██████    | 618/1030 [04:18<02:52,  2.39it/s, loss=1.44, v_num=.]




Epoch 3:  70%|███████   | 721/1030 [05:01<02:09,  2.39it/s, loss=1.89, v_num=.]
Epoch 3:  80%|████████  | 824/1030 [05:45<01:26,  2.39it/s, loss=1.48, v_num=.]
Epoch 3:  90%|█████████ | 927/1030 [06:28<00:43,  2.39it/s, loss=1.92, v_num=.]
Epoch 3: 100%|██████████| 1030/1030 [06:29<00:00,  2.64it/s, loss=1.92, v_num=.]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/93 [00:00<?, ?it/s][A
[2m[36m(pid=4862)[0m 
Validating: 100%|██████████| 93/93 [00:09<00:00,  9.46it/s][A


[2m[36m(pid=4862)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_LensGAN128_5fa35_00003:
  FID: 31.15625
  FID_cross: 50.875
  auroc: 0.49936342239379883
  auroc_cross: 0.513091504573822
  date: 2021-09-08_01-19-12
  done: false
  experiment_id: d70fa34cadc242c5a19ba2deaa3a4f7c
  hostname: e89b1a816e20
  iterations_since_restore: 1
  loss_D: 0.8290479779243469
  loss_G: 2.579897403717041
  node_ip: 172.28.0.2
  pid: 4862
  should_checkpoint: true
  time_since_restore: 422.9766969680786
  time_this_iter_s: 422.9766969680786
  time_total_s: 1658.0415060520172
  timestamp: 1631063952
  timesteps_since_restore: 0
  training_iteration: 4
  trial_id: 5fa35_00003
  


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_LensGAN128_5fa35_00003,RUNNING,172.28.0.2:4862,0.0001,64,8,2.5799,0.829048,31.1562,0.499363,50.875,0.513092,4
train_LensGAN128_5fa35_00000,PAUSED,,0.00012,32,8,12.3994,4.88757e-06,89.1875,0.509566,109.688,0.497898,4
train_LensGAN128_5fa35_00001,PAUSED,,0.00012,64,8,10.8184,3.02044e-05,198.75,0.496637,389.75,0.495665,3
train_LensGAN128_5fa35_00002,PENDING,,0.00109864,64,16,11.4199,1.70021e-05,129.625,0.496807,163.5,0.5,3


Trial name,status,loc,lr,n_fmaps,bs,loss_G,loss_D,FID,auroc,FID_cross,auroc_cross,training_iteration
train_LensGAN128_5fa35_00003,RUNNING,172.28.0.2:4862,0.0001,64,8,2.5799,0.829048,31.1562,0.499363,50.875,0.513092,4
train_LensGAN128_5fa35_00000,PAUSED,,0.00012,32,8,12.3994,4.88757e-06,89.1875,0.509566,109.688,0.497898,4
train_LensGAN128_5fa35_00001,PAUSED,,0.00012,64,8,10.8184,3.02044e-05,198.75,0.496637,389.75,0.495665,3
train_LensGAN128_5fa35_00002,PENDING,,0.00109864,64,16,11.4199,1.70021e-05,129.625,0.496807,163.5,0.5,3


2021-09-08 01:19:13,124	ERROR tune.py:557 -- Trials did not complete: [train_LensGAN128_5fa35_00000, train_LensGAN128_5fa35_00003, train_LensGAN128_5fa35_00002, train_LensGAN128_5fa35_00001]
2021-09-08 01:19:13,137	INFO tune.py:561 -- Total run time: 1535.32 seconds (1532.01 seconds for the tuning loop).


Best checkpoint path found is:  /content/drive/MyDrive/Logs/F/LensGAN128/pbt_tanh/train_LensGAN128_5fa35_00003_3_n_fmaps=64_2021-09-08_00-09-33/checkpoint_epoch=1-step=1873/


In [44]:
# Flush and unmount the mounted drive now that the tuning run above has finished writing
# its logs and checkpoints under /content/drive/MyDrive/Logs.
# NOTE(review): `drive` is presumably `google.colab.drive`, imported in an earlier cell — confirm.
drive.flush_and_unmount()

## Stage 2
Here we tune hyperparameters as we train our modified DCGAN.

In [None]:
# __tune_train_checkpoint_begin
def train_Stage2(config, checkpoint_dir=None, num_epochs=10, num_gpus=torch.cuda.device_count()):
    """Ray Tune trainable: fit a ``Stage2`` model with a PyTorch Lightning trainer.

    Args:
        config: Hyperparameter dict for the trial. ``config['batch_size']`` is
            read here; the whole dict is also handed to ``npyImageData`` and
            ``Stage2``.
        checkpoint_dir: Directory containing a file named ``checkpoint`` to
            resume from. ``None`` means no resume path is given to the trainer.
        num_epochs: Passed to the trainer as ``max_epochs``.
        num_gpus: GPU count for the trainer; rounded up with ``math.ceil`` so a
            fractional value becomes a whole number. The default is evaluated
            once, when this function is defined.
    """
    # Refresh the progress bar once every 8250 // batch_size steps.
    # NOTE(review): 8250 looks like a dataset-size constant — confirm.
    refresh_every = math.ceil(8250//config['batch_size'])
    # If fractional GPUs passed in, convert to int.
    gpu_count = math.ceil(num_gpus)
    # Write TensorBoard logs straight into the trial directory (no name/version sub-folders).
    tb_logger = TensorBoardLogger(save_dir=tune.get_trial_dir(), name='', version='.')
    # Keys are the metric names seen by Tune; values are the metric names
    # presumably logged by the Stage2 module (verify against the model).
    report_callback = TuneReportCheckpointCallback(
        {
            'loss_G': 'Stage2/G/train/loss',
            'loss_D': 'Stage2/D/train/loss',
            # Switch up the auroc values when training on a different dataset.
            'auroc': 'Stage2/ResNet(F)/val/auroc',
            'auroc_cross': 'Stage2/ResNet(J)/val/auroc',
        },
    )

    # Currently disabled trainer options: limit_train_batches, limit_val_batches,
    # stochastic_weight_avg (works with only one optimizer), benchmark.
    trainer_options = dict(
        progress_bar_refresh_rate=refresh_every,
        max_epochs=num_epochs,
        prepare_data_per_node=False,
        gpus=gpu_count,
        logger=tb_logger,
        callbacks=[report_callback],
    )

    # Specify image width here (via the data module).
    datamodule = npyImageData(config)
    if checkpoint_dir is not None:
        # Lightning restores the full training state from this file.
        trainer_options['resume_from_checkpoint'] = os.path.join(checkpoint_dir, 'checkpoint')

    # The model is always built fresh from the config; on resume its state
    # comes from `resume_from_checkpoint` rather than `load_from_checkpoint`.
    net = Stage2(config)
    pl.Trainer(**trainer_options).fit(net, datamodule)
# __tune_train_checkpoint_end__


# # # __tune_asha_begin__
# def tune_Stage2_asha(num_samples=10, num_epochs=10, gpus_per_trial=torch.cuda.device_count()):
#     # print(os.cpu_count(), torch.cuda.device_count())
#     analysis = tune.run(
#         tune.with_parameters(
#             train_Stage2,
#             num_epochs=num_epochs,
#             num_gpus=gpus_per_trial
#         ),
#         # Change the folder name when changing dataset--------------------------------------------------------------------------
#         name='Stage2/pbt/J',
#         metric='auroc',
#         mode='max',
#         config={'learning_rate': 1e-4,
#                 'n_fmaps': tune.grid_search([8, 16, 32, 64, 128]),
#                 'batch_size': 8,
#                 },
#         # config={'learning_rate': 0.01,
#         #         'n_fmaps': 32,
#         #         'batch_size': 32,
#         #         },
#         # stop=TrialPlateauStopper('loss_G'),
#         resources_per_trial={'cpu': os.cpu_count(),
#                              'gpu': gpus_per_trial,
#                             },
#         local_dir='./drive/MyDrive/Logs',
#         scheduler = ASHAScheduler(max_t=num_epochs, grace_period=2,  reduction_factor=2),
#         progress_reporter=JupyterNotebookReporter(
#             overwrite=True,
#             parameter_columns=['learning_rate', 'n_fmaps', 'batch_size'],
#             metric_columns=['loss_G', 'loss_D', 'auroc', 'auroc_cross', 'training_iteration'],
#             sort_by_metric=True,
#         ),
#         fail_fast = True,
#         # reuse_actors=True,
#         # num_samples=num_samples,
#         resume='PROMPT',
# #         restore='/content/drive/MyDrive/Logs/delete/train_Stage2_e42ac_00025_25_batch_size=8,learning_rate=0.01,n_fmaps=8_2021-07-28_21-16-18/checkpoint_epoch=4-step=2339',
#     )

# #     print('Best hyperparameters found were: ', analysis.best_config)

# # # __tune_asha_end__


# __tune_pbt_begin__
def tune_Stage2_pbt(num_samples=10, num_epochs=10, gpus_per_trial=torch.cuda.device_count()):
    """Tune ``train_Stage2`` with Population Based Training and print the best config.

    Args:
        num_samples: Accepted for interface compatibility; it is currently not
            forwarded to ``tune.run``.
        num_epochs: Forwarded to ``train_Stage2`` as ``num_epochs``.
        gpus_per_trial: GPUs requested per trial; also forwarded to
            ``train_Stage2`` as ``num_gpus``. The default is evaluated once,
            when this function is defined.
    """
    trainable = tune.with_parameters(
        train_Stage2,
        num_epochs=num_epochs,
        num_gpus=gpus_per_trial,
    )

    # Initial search space: one trial per `n_fmaps` grid value, with
    # `res_depth` drawn from the listed choices.
    search_space = {
        'learning_rate': 1e-4,
        'n_fmaps': tune.grid_search([8, 16, 32, 64, 128]),
        'res_depth': tune.choice([1, 2, 3, 4]),
        'batch_size': 8,
    }

    # Every trial asks for all CPUs reported by the OS plus the requested GPUs.
    trial_resources = {
        'cpu': os.cpu_count(),
        'gpu': gpus_per_trial,
    }

    # Only these two hyperparameters are mutated by PBT.
    mutations = {
        'learning_rate': tune.loguniform(1e-7, 1e-1),
        'batch_size': [8, 16, 32, 64, 128],
    }
    pbt_scheduler = PopulationBasedTraining(
        time_attr='training_iteration',
        quantile_fraction=0.5,
        resample_probability=0.8,
        perturbation_interval=1,
        hyperparam_mutations=mutations,
    )

    notebook_reporter = JupyterNotebookReporter(
        overwrite=False,
        parameter_columns=['learning_rate', 'n_fmaps', 'res_depth', 'batch_size'],
        metric_columns=['loss_G', 'loss_D', 'auroc', 'auroc_cross', 'training_iteration'],
        sort_by_metric=True,
    )

    analysis = tune.run(
        trainable,
        # Change the folder name when changing dataset.
        name='Stage2/pbt/F',
        metric='auroc',
        mode='max',
        config=search_space,
        resources_per_trial=trial_resources,
        local_dir='./drive/MyDrive/Logs',
        scheduler=pbt_scheduler,
        progress_reporter=notebook_reporter,
        fail_fast=True,
        resume='PROMPT',
    )

    print('Best hyperparameters found were: ', analysis.best_config)

# __tune_pbt_end__


if __name__ == '__main__':
    import argparse

    # Optional `--smoke-test` switch for a shortened run. `parse_known_args`
    # is used so unrelated arguments on the command line (e.g. the ones a
    # notebook kernel is launched with) do not abort the cell.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--smoke-test', action='store_true', help='Finish quickly for testing')
    args, _ = parser.parse_known_args()

    if args.smoke_test:
        # ASHA scheduler: `tune_Stage2_asha` is commented out in this cell, so
        # calling it here raised NameError before PBT could run. Re-enable this
        # call together with the function definition.
        # tune_Stage2_asha(num_samples=1, num_epochs=6, gpus_per_trial=torch.cuda.device_count())
        # Population based training
        tune_Stage2_pbt(num_samples=1, num_epochs=6, gpus_per_trial=torch.cuda.device_count())
    else:
        # ASHA scheduler
        # tune_Stage2_asha(num_samples=1, num_epochs=10, gpus_per_trial=torch.cuda.device_count())
        # Population based training
        tune_Stage2_pbt(num_samples=1, num_epochs=30, gpus_per_trial=torch.cuda.device_count())

In [None]:
# Flush and unmount the mounted drive after the Stage 2 tuning cell, which writes
# its results under ./drive/MyDrive/Logs.
# NOTE(review): `drive` is presumably `google.colab.drive`, imported in an earlier cell — confirm.
drive.flush_and_unmount()

In [None]:
# Print the error.txt stored in one Stage2 PBT trial directory.
# The path is tied to a specific trial/run; edit it to inspect a different trial.
!cat /content/drive/MyDrive/Logs/Stage2/pbt/F/train_Stage2_6f508_00000_0_n_fmaps=8_2021-08-13_14-04-54/error.txt

## StackGAN:
Here we tune hyperparameters for generating images that resemble the input images.

In [None]:
# __tune_train_checkpoint_begin
def train_StackGAN_tune_checkpoint(config,
                                   checkpoint_dir=None,
                                   num_epochs=10,
                                   num_gpus=torch.cuda.device_count()):
    """Ray Tune trainable: fit a ``StackGAN`` model, optionally from a checkpoint.

    Args:
        config: Hyperparameter dict for the trial; passed unchanged to
            ``npyImageData`` and ``StackGAN``.
        checkpoint_dir: Directory containing a file named ``checkpoint``. When
            falsy, a new model is built from ``config``.
        num_epochs: Passed to the trainer as ``max_epochs``.
        num_gpus: GPU count for the trainer; rounded up with ``math.ceil`` so a
            fractional value becomes a whole number. The default is evaluated
            once, when this function is defined.
    """
    data_dir = os.path.expanduser('/content/images/')

    # If fractional GPUs passed in, convert to int.
    gpu_count = math.ceil(num_gpus)
    # Write TensorBoard logs straight into the trial directory (no name/version sub-folders).
    tb_logger = TensorBoardLogger(save_dir=tune.get_trial_dir(), name='', version='.')
    # Keys are the metric names seen by Tune; values are the metric names
    # presumably logged by the StackGAN module (verify against the model).
    report_callback = TuneReportCheckpointCallback(
        metrics={
            'loss_G1': 'G1/train/loss/full',
            'loss_G2': 'G2/train/loss/full',
            'loss_D1': 'D1/train/loss',
            'loss_D2': 'D2/train/loss',
            'lossR': 'R/train/loss',
            'auroc': 'Pre/val/auroc',
        },
        filename='checkpoint',
    )

    # Currently disabled trainer options: accumulate_grad_batches,
    # limit_train_batches, limit_val_batches, tpu_cores, progress_bar_refresh_rate,
    # stochastic_weight_avg (works with only one optimizer).
    trainer = pl.Trainer(
        num_sanity_val_steps=-1,
        max_epochs=num_epochs,
        prepare_data_per_node=False,
        gpus=gpu_count,
        logger=tb_logger,
        callbacks=[report_callback],
    )
    datamodule = npyImageData(config, data_dir)

    if not checkpoint_dir:
        model = StackGAN(config)
    else:
        # `StackGAN.load_from_checkpoint` currently leads to errors here, so the
        # checkpoint is loaded by hand and the trainer's epoch counter is set
        # from it.
        ckpt_file = os.path.join(checkpoint_dir, 'checkpoint')
        # map_location hands every storage back unchanged.
        ckpt = pl_load(ckpt_file, map_location=lambda storage, loc: storage)
        model = StackGAN._load_model_state(ckpt, config=config)
        trainer.current_epoch = ckpt['epoch']

    trainer.fit(model, datamodule)
# __tune_train_checkpoint_end__


# __tune_asha_begin__
def tune_StackGAN_asha(num_samples=10, num_epochs=10, gpus_per_trial=torch.cuda.device_count()):
    """Tune ``train_StackGAN_tune_checkpoint`` with the ASHA scheduler and print the best config.

    Args:
        num_samples: Passed to ``tune.run`` as ``num_samples``.
        num_epochs: Used as the scheduler's ``max_t`` and forwarded to the
            trainable as ``num_epochs``.
        gpus_per_trial: GPUs requested per trial; also forwarded to the
            trainable as ``num_gpus``. The default is evaluated once, when this
            function is defined.
    """
    # Only `batch_size` actually varies; the other two choices have one option.
    search_space = {
        'learning_rate': tune.choice([1e-4]),
        'feature_maps': tune.choice([64]),
        'batch_size': tune.choice([128, 64]),
    }

    asha = ASHAScheduler(max_t=num_epochs, grace_period=1, reduction_factor=2)

    cli_reporter = CLIReporter(
        parameter_columns=['learning_rate', 'feature_maps', 'batch_size'],
        metric_columns=['loss_G1', 'loss_G2', 'loss_D1', 'loss_D2', 'lossR', 'auroc', 'training_iteration'],
    )

    trainable = tune.with_parameters(
        train_StackGAN_tune_checkpoint,
        num_epochs=num_epochs,
        num_gpus=gpus_per_trial,
    )
    # Every trial asks for all CPUs reported by the OS plus the requested GPUs.
    trial_resources = {
        'cpu': os.cpu_count(),
        'gpu': gpus_per_trial,
    }

    analysis = tune.run(
        trainable,
        name='tune_StackGAN_asha_model_j',
        metric='auroc',
        mode='max',
        config=search_space,
        resources_per_trial=trial_resources,
        num_samples=num_samples,
        local_dir='./drive/MyDrive/Logs',
        scheduler=asha,
        progress_reporter=cli_reporter,
        fail_fast=True,
        resume='PROMPT',
    )

    print('Best hyperparameters found were: ', analysis.best_config)

# __tune_asha_end__


# __tune_pbt_begin__
def tune_StackGAN_pbt(num_samples=10, num_epochs=10, gpus_per_trial=torch.cuda.device_count()):
    """Tune ``train_StackGAN_tune_checkpoint`` with Population Based Training and print the best config.

    Args:
        num_samples: Passed to ``tune.run`` as ``num_samples``.
        num_epochs: Forwarded to the trainable as ``num_epochs``.
        gpus_per_trial: GPUs requested per trial; also forwarded to the
            trainable as ``num_gpus``. The default is evaluated once, when this
            function is defined.
    """
    # Every trial starts from the same fixed configuration.
    initial_config = {
        'learning_rate': 1e-4,
        'feature_maps': 64,
        'batch_size': 64,
    }

    # NOTE(review): no `time_attr` is given, so `perturbation_interval=4` is
    # measured in the scheduler's default time attribute — confirm that is intended.
    # NOTE(review): 'feature_maps' is mutated here and is also part of the config
    # handed to the model constructor — confirm checkpoints stay loadable across trials.
    pbt_scheduler = PopulationBasedTraining(
        perturbation_interval=4,
        hyperparam_mutations={
            'learning_rate': [1e-4, 1e-3],
            'feature_maps': [64, 128],
            'batch_size': [32, 64, 128],
        },
    )

    cli_reporter = CLIReporter(
        parameter_columns=['learning_rate', 'feature_maps', 'batch_size'],
        metric_columns=['loss_G1', 'loss_G2', 'loss_D1', 'loss_D2', 'lossR', 'auroc', 'training_iteration'],
    )

    trainable = tune.with_parameters(
        train_StackGAN_tune_checkpoint,
        num_epochs=num_epochs,
        num_gpus=gpus_per_trial,
    )
    # Every trial asks for all CPUs reported by the OS plus the requested GPUs.
    trial_resources = {
        'cpu': os.cpu_count(),
        'gpu': gpus_per_trial,
    }

    # Unlike the ASHA variant, `resume` is not passed here.
    analysis = tune.run(
        trainable,
        name='tune_StackGAN_pbt_model_j',
        metric='auroc',
        mode='max',
        resources_per_trial=trial_resources,
        config=initial_config,
        num_samples=num_samples,
        scheduler=pbt_scheduler,
        progress_reporter=cli_reporter,
        local_dir='./drive/MyDrive/Logs',
        fail_fast=True,
    )

    print('Best hyperparameters found were: ', analysis.best_config)

# __tune_pbt_end__


if __name__ == '__main__':
    import argparse

    # Optional `--smoke-test` switch selects a shortened run of both schedulers.
    # Unrecognised command-line arguments are tolerated and discarded.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--smoke-test', action='store_true', help='Finish quickly for testing')
    args, _ = parser.parse_known_args()

    if not args.smoke_test:
        # Regular run: ASHA scheduler only.
        tune_StackGAN_asha(num_samples=2, num_epochs=1, gpus_per_trial=torch.cuda.device_count())
        # Population based training is currently disabled for the regular run:
        # tune_StackGAN_pbt(num_samples=8, num_epochs=5, gpus_per_trial=torch.cuda.device_count())
    else:
        # Smoke test: one sample, six epochs, for each scheduler in turn.
        tune_StackGAN_asha(num_samples=1, num_epochs=6, gpus_per_trial=torch.cuda.device_count())
        tune_StackGAN_pbt(num_samples=1, num_epochs=6, gpus_per_trial=torch.cuda.device_count())