<a href="https://colab.research.google.com/github/souravraha/galaxy/blob/main/Lightning_Tune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install packages and import

In [1]:
# Colab setup: install the packages this notebook depends on. The commands
# below are active (not commented out); run this cell once per fresh runtime.
# NOTE(review): none of the packages are version-pinned, so a later re-run may
# pull newer, incompatible releases — consider pinning.

print("Setting up colab environment")
# pyarrow is removed before Ray is installed — presumably to avoid a version
# conflict; TODO confirm this is still required.
!pip uninstall -y -q pyarrow
!pip install -q ray[debug] lightning-bolts
!pip install -U -q ray[tune]
# Optional TPU support (disabled):
# !pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.8-cp37-cp37m-linux_x86_64.whl

# Disabled: force a runtime restart so freshly installed packages are picked up.
# # A hack to force the runtime to restart, needed to include the above dependencies.
# print("Done installing! Restarting via forced crash (this is not an issue).")
# import os
# os._exit(0)

Setting up colab environment
[K     |████████████████████████████████| 51.6MB 58kB/s 
[K     |████████████████████████████████| 256kB 49.2MB/s 
[K     |████████████████████████████████| 10.1MB 46.6MB/s 
[K     |████████████████████████████████| 81kB 11.4MB/s 
[K     |████████████████████████████████| 133kB 52.2MB/s 
[K     |████████████████████████████████| 81kB 10.7MB/s 
[K     |████████████████████████████████| 71kB 10.2MB/s 
[K     |████████████████████████████████| 3.1MB 48.7MB/s 
[K     |████████████████████████████████| 1.3MB 41.0MB/s 
[K     |████████████████████████████████| 235kB 34.2MB/s 
[K     |████████████████████████████████| 819kB 34.0MB/s 
[K     |████████████████████████████████| 92kB 11.4MB/s 
[K     |████████████████████████████████| 143kB 49.3MB/s 
[K     |████████████████████████████████| 296kB 52.5MB/s 
[K     |████████████████████████████████| 122kB 49.0MB/s 
[K     |████████████████████████████████| 829kB 50.8MB/s 
[K     |██████████████████████

In [1]:
# On Google Colab, select TensorFlow 2.x for this runtime.
# NOTE(review): no visible cell imports TensorFlow directly — confirm this
# cell is still needed.

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  # Best effort: ignore the failure when the magic is unavailable.
  pass

In [2]:
# __import_lightning_begin__
import math
import gdown, tarfile           #
from zipfile import ZipFile
import shutil
import numpy as np              #
from matplotlib import pyplot as plt
from itertools import cycle
import torch
from torch import nn
import pytorch_lightning as pl
from filelock import FileLock
from torch.utils.data import DataLoader, random_split
from torch.nn import functional as F
import torchvision.datasets as datasets
from torchvision.models import resnet18
from pl_bolts.models.self_supervised.resnets import BasicBlock                  # problem with resnet18
from pl_bolts.models.gans import DCGAN
from pl_bolts.models.gans.dcgan.components import DCGANDiscriminator, DCGANGenerator
import torchmetrics as tm
from torchvision import transforms
import os
from os.path import basename
# __import_lightning_end__

# __import_tune_begin__
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.utilities.cloud_io import load as pl_load
from ray import tune
from ray.tune import CLIReporter, JupyterNotebookReporter
from ray.tune.schedulers import ASHAScheduler, PopulationBasedTraining
from ray.tune.integration.pytorch_lightning import TuneReportCallback, \
    TuneReportCheckpointCallback
# __import_tune_end__



# Download and extract data

In [3]:
# Mount Google Drive at /content/drive. Later cells read the normalisation
# statistics from, and write logs/checkpoints to, paths under MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Google Drive file ids of the dataset archives, keyed by model letter.
# No id is recorded for 'd' and 'k'.
# 'a': 1Cjcw2EWorhdhJSGoWOdxsEUDxvl943dt, 'b': 15yXXC4h5VsytP3Ak1jfUSjQhdgP2s23K, 'c': 1vuQ-pLzoKT4Hd_V7949r9eND9E2fB_u_,
# 'd': , 'e': 1wFuasvb7PthxXtMUlsD13uzYHWlWt06H, 'f': 17l6H61tLAu26zGuei38r_T5ssjbYUeaJ, 
# 'g': 1SxQVosWeEjY3Pyn8LRXA11rLnZ9HK_7B, 'h': 1Atau0RH4oyLAiYReW-G9a8l9pUNltglF, 'i': 15lEgsR1p00KSHieaT9a1nkbJ86pDxwgp, 
# 'j': 1m0EQUbqZZeyl76XsQIKWU5Qd7jGmmWhB, 'k': , 'l': 1meTDi4aeWfdChOiXeLtUOGhjVDVu000e

# Download archive 'j' and unpack it into the current directory.
# Presumably the archive expands to ./images (the DataModule's default
# data_dir) — TODO confirm. Uncomment the `rm` line to discard a previous
# extraction before switching datasets.
# !rm -rf images
!gdown --id 1m0EQUbqZZeyl76XsQIKWU5Qd7jGmmWhB
!tar zxf ./model_j.tgz

# Disabled pure-Python alternative to the shell commands above (it targets
# archive 'f'):
# def prepare_data(data_dir: str = '/content'):
#     gdown.download('https://drive.google.com/uc?id=17l6H61tLAu26zGuei38r_T5ssjbYUeaJ', data_dir+'/model_f.tgz', quiet=True)
    
#     temp = tarfile.open(data_dir+'/model_f.tgz', 'r|gz')
#     temp.extractall()
#     temp.close()

Downloading...
From: https://drive.google.com/uc?id=1m0EQUbqZZeyl76XsQIKWU5Qd7jGmmWhB
To: /content/model_j.tgz
2.16GB [00:22, 97.3MB/s]


# DataModule
The DataModule below builds the training, validation and test dataloaders that the Lightning `Trainer` feeds to our model.

In [22]:
class NpyDataModule(pl.LightningDataModule):
    """DataModule serving ``.npy`` images stored in one sub-folder per class.

    The training split is read from ``<data_dir>/train``. The validation and the
    test split are both read from ``<data_dir>/val``.

    Args:
        config: Mapping that must contain ``'batch_size'``.
        data_dir: Root folder holding the ``train`` and ``val`` sub-folders.
        img_width: Target size handed to ``transforms.Resize``.
        stats_path: ``.npz`` file whose ``'VALS'`` entry holds the global mean
            at index 0 and the global standard deviation at index 1.
    """

    def __init__(self, config, data_dir: str = '/content/images/', img_width: int = 150,
                 stats_path: str = '/content/drive/MyDrive/git_repos/forging_new_worlds/GLOBAL_VALS_J.npz'):
        super().__init__()
        # This method is not implemented
        # self.save_hyperparameters()
        self.batch_size = config['batch_size']
        self.data_dir = os.path.expanduser(data_dir)

        # Read the statistics inside a context manager so the archive is closed.
        with np.load(stats_path) as stats:
            vals = stats['VALS']
            mean, std = vals[0], vals[1]

        # transforms.ConvertImageDtype(torch.float32) can't be used: it divides
        # the values by dtype.max. The loader calls .float() instead.
        # Normalize shift-scales the pixel values, N(mu, sigma) -> N(0, 1).
        normalise = transforms.Normalize(mean=(mean,), std=(std,))
        resize = transforms.Resize(img_width, transforms.InterpolationMode.NEAREST)

        # Training pipeline: random flips for augmentation, then normalise/resize.
        self.transform = transforms.Compose([
            transforms.RandomHorizontalFlip(),
            transforms.RandomVerticalFlip(),
            normalise,
            resize,
        ])
        # Evaluation pipeline: no random augmentation, so validation/test
        # metrics are reproducible from run to run.
        self.eval_transform = transforms.Compose([normalise, resize])

    @staticmethod
    def npy_loader(path):
        """Load one ``.npy`` file as a float32 tensor with a new leading axis.

        The array is converted to a tensor first and only then cast with
        ``.float()``; otherwise the dtype could stay float64, which raises
        errors in the conv layers.
        """
        return torch.from_numpy(np.load(path)).unsqueeze(0).float()

    def _folder(self, split: str, transform):
        """Build the ``DatasetFolder`` for ``<data_dir>/<split>``."""
        return datasets.DatasetFolder(
            os.path.join(self.data_dir, split),
            self.npy_loader,
            ('.npy',),  # a real one-element tuple; ('.npy') is just a string
            transform,
        )

    def setup(self, stage: str = None):
        """Create the datasets needed for ``stage`` (``None`` builds all)."""
        if stage in ('fit', 'validate', None):
            self.train_set = self._folder('train', self.transform)
            # self.train_set, self.val_set = random_split(self.full_set, [60000, 15000])
            self.val_set = self._folder('val', self.eval_transform)
            self.dims = tuple(self.train_set[0][0].shape)

        if stage in ('test', None):
            # NOTE(review): the test split reuses the 'val' folder — presumably
            # because no separate test folder exists; confirm.
            self.test_set = self._folder('val', self.eval_transform)
            # Only load a sample when no usable dims were recorded above.
            if not getattr(self, 'dims', None):
                self.dims = tuple(self.test_set[0][0].shape)

    def train_dataloader(self):
        """Shuffled loader over the training set."""
        return DataLoader(self.train_set, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        """Loader over the validation set, in fixed order."""
        return DataLoader(self.val_set, batch_size=self.batch_size, shuffle=False)

    def test_dataloader(self):
        """Loader over the test set, in fixed order."""
        return DataLoader(self.test_set, batch_size=self.batch_size, shuffle=False)


# ResNet:
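We adapt torchvision's ResNet-18 for our purpose: the first convolution accepts single-channel images, the classifier head predicts our three classes, and dropout is inserted before the final fully connected layer.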

In [23]:
class LensResnet(pl.LightningModule):
    """ResNet-18 classifier with a configurable number of input channels.

    Args:
        config: Mapping that must contain ``'learning_rate'``.
        data_dir: Accepted for signature compatibility; not used by the model.
        image_channels: Number of channels of the input images.
        num_classes: Number of target classes.
        **kwargs: Extra arguments, recorded as hyperparameters only.
    """

    def __init__(self, config, data_dir: str = '/content/images/', image_channels: int = 1, 
                 num_classes: int = 3, **kwargs):
        super().__init__()
        # The original call passed `ignore=config`, but `ignore` expects argument
        # names and none of the dict's keys is one, so nothing was ignored.
        # Saving everything explicitly keeps the recorded hyperparameters
        # (including `config`) unchanged. Use ignore='config' to drop it.
        self.save_hyperparameters()
        self.learning_rate = config['learning_rate']

        # Randomly initialised ResNet-18 (no pretrained weights are loaded).
        self.backbone = resnet18(num_classes = self.hparams.num_classes)
        # Replace the whole first conv: merely changing in_channels is not
        # enough, since the weights have to change shape as well.
        self.backbone.conv1 = nn.Conv2d(self.hparams.image_channels, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
        # Dropout in front of the final fully connected layer.
        self.backbone.fc = nn.Sequential(
            nn.Dropout(0.5),
            self.backbone.fc
        )

    def configure_optimizers(self):
        """Adam over the backbone parameters with the configured learning rate."""
        return torch.optim.Adam(self.backbone.parameters(), self.learning_rate)

    def forward(self, x):
        """Return class probabilities (softmax over dim 1 of the logits)."""
        return F.softmax(self.backbone(x), 1)

    def training_step(self, batch, batch_idx):
        """Compute the cross-entropy loss and log the loss and weighted AUROC."""
        imgs, labels = batch
        # One forward pass serves both the loss and the metric. Calling the
        # backbone twice doubled the compute and, in train mode, updated the
        # BatchNorm statistics twice with different dropout masks.
        logits = self.backbone(imgs)
        probs = F.softmax(logits.detach(), 1)
        self.log('ResNet/train/auroc', tm.functional.auroc(probs, labels, average='weighted', num_classes=self.hparams.num_classes))
        loss = F.cross_entropy(logits, labels)
        self.log('ResNet/train/loss', loss)
        #  keep only scalars here, for no errors
        return loss

    def validation_step(self, batch, batch_idx):
        """Log the validation loss and return predictions for the epoch-end hook."""
        imgs, labels = batch
        logits = self.backbone(imgs)
        self.log('ResNet/val/loss', F.cross_entropy(logits, labels))
        #  keep only scalars here, for no errors
        return {'pred': F.softmax(logits, 1), 'target': labels}

    def validation_epoch_end(self, Listofdicts):
        """Log the worst per-class AUROC and save/log the multi-class ROC plot.

        Args:
            Listofdicts: The dicts returned by ``validation_step``, each with
                keys ``'pred'`` and ``'target'``.
        """
        prediction = torch.cat([x["pred"] for x in Listofdicts])
        target = torch.cat([x["target"] for x in Listofdicts])
        num_classes = self.hparams.num_classes

        aurocTensor = tm.functional.auroc(prediction, target, num_classes=num_classes, average=None)
        # The reported score is the minimum over classes (the weakest class).
        self.log('ResNet/val/auroc', aurocTensor.min())
        fprList, tprList, _ = tm.functional.roc(prediction, target, num_classes=num_classes)

        fig, ax = plt.subplots()
        colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
        for i, color in zip(range(num_classes), colors):
            ax.plot(fprList[i].cpu(), tprList[i].cpu(), color=color,
                    label='ROC curve of class {0} (area = {1:0.2f})'
                    ''.format(i, aurocTensor[i].item()))
        ax.plot([0, 1], [0, 1], 'k--')
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title('Multi-class ROC')
        ax.legend(loc="lower right")

        # Join the path properly so the PDF lands inside the trial directory.
        # Fall back to the working directory when not running under Ray Tune.
        out_dir = tune.get_trial_dir() or os.getcwd()
        # Save before logging: add_figure closes the figure by default.
        fig.savefig(os.path.join(out_dir, 'ROC_epoch_' + str(self.current_epoch) + '.pdf'))
        self.logger.experiment.add_figure('ResNet/val/ROC', fig, global_step=self.current_epoch)
        plt.close(fig)

# Trying out Auto Tuning of learning rate

In [None]:
# Experiment: Lightning's automatic learning-rate finder.
# Can't work with multiple optimizers
# NOTE(review): the recorded run of this cell ends in an AttributeError.
# NOTE(review): `StackGAN` is defined further down the notebook, so on a fresh
# kernel this cell fails unless that definition has been run first.
config = {
    'learning_rate': 1e-4, 'batch_size': 128, 'feature_maps': 64,
}
dm = NpyDataModule(config)
generator = StackGAN(config)

trainer = pl.Trainer(
    # logger=,
    # checkpoint_callback=,
    default_root_dir='./drive/MyDrive/Logs/', 
    gpus=1,
    auto_select_gpus=True, 
    # tpu_cores=
    progress_bar_refresh_rate=1,
    # fast_dev_run=,
    max_epochs=5,
    # max_time=,
    # limit_train_batches=,
    # flush_logs_every_n_steps=,
    # log_every_n_steps=,
    # resume_from_checkpoint='./drive/MyDrive/Logs/lr_find_temp_model.ckpt',
    auto_lr_find = True,
    # auto_scale_batch_size=True,
    # prepare_data_per_node=,
    )

# Run learning rate finder
# NOTE(review): `dm` is passed positionally; confirm that the second positional
# parameter of `lr_find` is the datamodule in the installed Lightning version
# (otherwise pass it by keyword).
lr_finder = trainer.tuner.lr_find(generator, dm)

# # Results can be found in
# # lr_finder.results

# Plot with
fig = lr_finder.plot(suggest=True)
fig.show()

# Pick point based on plot, or get suggestion
# (`new_lr` is not applied to the model anywhere in this cell.)
new_lr = lr_finder.suggestion()

# # update hparams of the model
# model.hparams.lr = new_lr

# # Fit model
# trainer.fit(model)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


AttributeError: ignored

# Tune ResNet hyperparameters:
Here we tune hyperparameters as we train our modified ResNet.

In [None]:
# __tune_train_checkpoint_begin
def train_LensResnet_tune_checkpoint(config, checkpoint_dir=None, num_epochs=10, num_gpus=1):
    """Ray Tune trainable: fit a ``LensResnet`` and report/checkpoint each epoch.

    Args:
        config: Hyperparameters of the trial; forwarded to the data module and
            the model.
        checkpoint_dir: Directory holding a file named ``checkpoint`` to
            restore from, or a falsy value to start from scratch.
        num_epochs: Value passed as the trainer's ``max_epochs``.
        num_gpus: GPUs for the trial; fractional values are rounded up.
    """
    image_root = os.path.expanduser("/content/images/")

    # If fractional GPUs are passed in, convert to int.
    gpu_count = math.ceil(num_gpus)
    # TensorBoard events are written directly into the trial directory.
    trial_logger = TensorBoardLogger(
        save_dir=tune.get_trial_dir(), name="", version=".")
    # Hands the validation metrics to Tune under short names and stores a
    # checkpoint file called "checkpoint".
    report_and_checkpoint = TuneReportCheckpointCallback(
        metrics={
            "loss": "ResNet/val/loss",
            "auroc": "ResNet/val/auroc",
        },
        filename="checkpoint",
    )

    trainer_options = dict(
        max_epochs=num_epochs,
        prepare_data_per_node=False,
        num_sanity_val_steps=0,
        gpus=gpu_count,
        logger=trial_logger,
        callbacks=[report_and_checkpoint],
        stochastic_weight_avg=True,
    )
    trainer = pl.Trainer(**trainer_options)

    datamodule = NpyDataModule(config, image_root)

    if not checkpoint_dir:
        model = LensResnet(config)
    else:
        # LensResnet.load_from_checkpoint currently leads to errors here, so
        # the state is loaded by hand as a workaround.
        checkpoint = pl_load(
            os.path.join(checkpoint_dir, "checkpoint"),
            map_location=lambda storage, loc: storage)
        model = LensResnet._load_model_state(checkpoint, config=config)
        # Continue the epoch count from where the checkpoint stopped.
        trainer.current_epoch = checkpoint["epoch"]

    trainer.fit(model, datamodule)

# __tune_train_checkpoint_end__


# __tune_asha_begin__
def tune_LensResnet_asha(num_samples=10, num_epochs=10, gpus_per_trial=1):
    """Run the LensResnet trainable under Ray Tune with an ASHA scheduler.

    The run uses one fixed configuration and restores from a hard-coded
    checkpoint. ``num_samples`` is accepted but not forwarded to ``tune.run``.

    Args:
        num_samples: Currently unused.
        num_epochs: Epoch budget for the trainable and ``max_t`` of the scheduler.
        gpus_per_trial: GPUs requested per trial.
    """
    # Search space used earlier (disabled):
    #   "learning_rate": tune.choice([1e-5, 1e-4, 1e-3, 1e-2]),
    #   "batch_size": tune.choice([128, 64, 32]),
    fixed_config = dict(batch_size=128, learning_rate=0.0001)

    asha = ASHAScheduler(max_t=num_epochs, grace_period=1, reduction_factor=2)

    progress = CLIReporter(
        parameter_columns=["learning_rate", "batch_size"],
        metric_columns=["loss", "auroc", "training_iteration"],
    )

    trainable = tune.with_parameters(
        train_LensResnet_tune_checkpoint,
        num_epochs=num_epochs,
        num_gpus=gpus_per_trial,
    )

    run_options = {
        # Change the experiment name when the dataset changes.
        "name": "LensResNet_J",
        "metric": "auroc",
        "mode": "max",
        "config": fixed_config,
        "resources_per_trial": {"cpu": 2, "gpu": gpus_per_trial},
        "local_dir": './drive/MyDrive/Logs',
        "scheduler": asha,
        "progress_reporter": progress,
        "fail_fast": True,
        # Checkpoint the trial is restored from. An earlier alternative was
        # '/content/drive/MyDrive/Logs/tune_LensResnet_asha_model_f/train_LensResnet_tune_checkpoint_e32ba_00000_0_batch_size=64,learning_rate=0.0001_2021-07-06_03-33-10/checkpoint_epoch=14-step=4689'
        "restore": '/content/drive/MyDrive/Logs/LensResNet_J/train_LensResnet_tune_checkpoint_c74d9_00003_3_batch_size=128,learning_rate=0.0001_2021-07-08_01-03-43/checkpoint_epoch=2-step=1406',
    }
    analysis = tune.run(trainable, **run_options)

    print("Best hyperparameters found were: ", analysis.best_config)
# __tune_asha_end__


# __tune_pbt_begin__
def tune_LensResnet_pbt(num_samples=10, num_epochs=10, gpus_per_trial=1):
    """Run the LensResnet trainable under Ray Tune with Population Based Training.

    Args:
        num_samples: Number of trials handed to ``tune.run``.
        num_epochs: Epoch budget forwarded to the trainable.
        gpus_per_trial: GPUs requested per trial.
    """
    # Starting point of every trial; the scheduler mutates it during the run.
    initial_config = dict(learning_rate=1e-3, batch_size=64)

    # Every 4 iterations a hyperparameter may be replaced by one of these values.
    mutations = {
        "learning_rate": [1e-5, 1e-4, 1e-3, 1e-2],
        "batch_size": [32, 64, 128],
    }
    pbt = PopulationBasedTraining(
        perturbation_interval=4,
        hyperparam_mutations=mutations)

    progress = CLIReporter(
        parameter_columns=["learning_rate", "batch_size"],
        metric_columns=["loss", "auroc", "training_iteration"],
    )

    trainable = tune.with_parameters(
        train_LensResnet_tune_checkpoint,
        num_epochs=num_epochs,
        num_gpus=gpus_per_trial,
    )

    run_options = {
        "metric": "auroc",
        "mode": "max",
        "resources_per_trial": {"cpu": 2, "gpu": gpus_per_trial},
        "fail_fast": True,
        "config": initial_config,
        "num_samples": num_samples,
        "scheduler": pbt,
        "progress_reporter": progress,
        "local_dir": './drive/MyDrive/Logs',
        "name": "tune_LensResnet_pbt",
    }
    analysis = tune.run(trainable, **run_options)

    print("Best hyperparameters found were: ", analysis.best_config)

# __tune_pbt_end__


if __name__ == "__main__":
    import argparse

    # parse_known_args tolerates the extra arguments a notebook kernel passes.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--smoke-test", action="store_true", help="Finish quickly for testing")
    known_args, _unknown = cli.parse_known_args()

    if not known_args.smoke_test:
        # Full run: ASHA scheduler only. Population based training is disabled:
        # tune_LensResnet_pbt(num_samples=10, num_epochs=10, gpus_per_trial=1)
        tune_LensResnet_asha(num_samples=12, num_epochs=20, gpus_per_trial=1)
    else:
        # Smoke test: a short pass through both schedulers.
        tune_LensResnet_asha(num_samples=1, num_epochs=6, gpus_per_trial=1)
        tune_LensResnet_pbt(num_samples=1, num_epochs=6, gpus_per_trial=1)

== Status ==
Memory usage on this node: 1.6/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 16.000: None | Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/7.43 GiB heap, 0.0/3.72 GiB objects (0.0/1.0 accelerator_type:T4)
Result logdir: /content/drive/MyDrive/Logs/LensResNet_J
Number of trials: 1/1 (1 PENDING)
+----------------------------------------------+----------+-------+-----------------+--------------+
| Trial name                                   | status   | loc   |   learning_rate |   batch_size |
|----------------------------------------------+----------+-------+-----------------+--------------|
| train_LensResnet_tune_checkpoint_fcb1c_00000 | PENDING  |       |          0.0001 |          128 |
+----------------------------------------------+----------+-------+-----------------+--------------+




2021-07-09 00:45:47,634	INFO trainable.py:76 -- Checkpoint size is 134179966 bytes


== Status ==
Memory usage on this node: 1.7/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 16.000: None | Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.43 GiB heap, 0.0/3.72 GiB objects (0.0/1.0 accelerator_type:T4)
Result logdir: /content/drive/MyDrive/Logs/LensResNet_J
Number of trials: 1/1 (1 RUNNING)
+----------------------------------------------+----------+-------+-----------------+--------------+
| Trial name                                   | status   | loc   |   learning_rate |   batch_size |
|----------------------------------------------+----------+-------+-----------------+--------------|
| train_LensResnet_tune_checkpoint_fcb1c_00000 | RUNNING  |       |          0.0001 |          128 |
+----------------------------------------------+----------+-------+-----------------+--------------+




[2m[36m(pid=1409)[0m 2021-07-09 00:45:52,653	INFO trainable.py:378 -- Restored on 172.28.0.2 from checkpoint: /content/drive/MyDrive/Logs/LensResNet_J/train_LensResnet_tune_checkpoint_fcb1c_00000_0_2021-07-09_00-45-41/checkpoint_tmp4ff55c/./
[2m[36m(pid=1409)[0m 2021-07-09 00:45:52,653	INFO trainable.py:385 -- Current state after restoring: {'_iteration': 3, '_timesteps_total': None, '_time_total': 1592.1304562091827, '_episodes_total': None}


== Status ==
Memory usage on this node: 1.9/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 16.000: None | Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.43 GiB heap, 0.0/3.72 GiB objects (0.0/1.0 accelerator_type:T4)
Result logdir: /content/drive/MyDrive/Logs/LensResNet_J
Number of trials: 1/1 (1 RUNNING)
+----------------------------------------------+----------+-------+-----------------+--------------+
| Trial name                                   | status   | loc   |   learning_rate |   batch_size |
|----------------------------------------------+----------+-------+-----------------+--------------|
| train_LensResnet_tune_checkpoint_fcb1c_00000 | RUNNING  |       |          0.0001 |          128 |
+----------------------------------------------+----------+-------+-----------------+--------------+




[2m[36m(pid=1409)[0m GPU available: True, used: True
[2m[36m(pid=1409)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=1409)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2m[36m(pid=1409)[0m 2021-07-09 00:45:59.792079: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
[2m[36m(pid=1409)[0m 
[2m[36m(pid=1409)[0m   | Name     | Type   | Params
[2m[36m(pid=1409)[0m ------------------------------------
[2m[36m(pid=1409)[0m 0 | backbone | ResNet | 11.2 M
[2m[36m(pid=1409)[0m ------------------------------------
[2m[36m(pid=1409)[0m 11.2 M    Trainable params
[2m[36m(pid=1409)[0m 0         Non-trainable params
[2m[36m(pid=1409)[0m 11.2 M    Total params
[2m[36m(pid=1409)[0m 44.687    Total estimated model params size (MB)
[2m[36m(pid=1409)[0m   f'Your {mode}_dataloader has `shuffle=True`, it is best practice to turn'


[2m[36m(pid=1409)[0m Training: 0it [00:00, ?it/s]Training:   0%|          | 0/645 [00:00<?, ?it/s]Epoch 3:   0%|          | 0/645 [00:00<?, ?it/s] 


[2m[36m(pid=1409)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


Epoch 3:   3%|▎         | 20/645 [00:17<08:58,  1.16it/s, loss=0.998, v_num=.]
Epoch 3:   6%|▌         | 40/645 [00:34<08:38,  1.17it/s, loss=0.99, v_num=.] 
Epoch 3:   9%|▉         | 60/645 [00:51<08:26,  1.15it/s, loss=0.975, v_num=.]
Epoch 3:  12%|█▏        | 80/645 [01:09<08:13,  1.14it/s, loss=0.96, v_num=.] 
Epoch 3:  16%|█▌        | 100/645 [01:27<07:58,  1.14it/s, loss=0.974, v_num=.]
Epoch 3:  19%|█▊        | 120/645 [01:45<07:42,  1.14it/s, loss=0.953, v_num=.]
Epoch 3:  22%|██▏       | 140/645 [02:03<07:26,  1.13it/s, loss=0.978, v_num=.]
Epoch 3:  25%|██▍       | 160/645 [02:21<07:09,  1.13it/s, loss=0.962, v_num=.]
Epoch 3:  28%|██▊       | 180/645 [02:39<06:52,  1.13it/s, loss=0.96, v_num=.] 
Epoch 3:  31%|███       | 200/645 [02:57<06:35,  1.13it/s, loss=0.956, v_num=.]
Epoch 3:  34%|███▍      | 220/645 [03:15<06:18,  1.12it/s, loss=0.945, v_num=.]
Epoch 3:  37%|███▋      | 240/645 [03:33<06:00,  1.12it/s, loss=0.938, v_num=.]
Epoch 3:  40%|████      | 260/645 [03:51<05:

[2m[36m(pid=1409)[0m   "The signature of `Callback.on_train_epoch_end` has changed in v1.3."


[2m[36m(pid=1409)[0m Epoch 3:  93%|█████████▎| 600/645 [08:45<00:39,  1.14it/s, loss=0.936, v_num=.]
[2m[36m(pid=1409)[0m Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/59 [00:00<?, ?it/s][A
[2m[36m(pid=1409)[0m 
Epoch 3:  96%|█████████▌| 620/645 [08:58<00:21,  1.15it/s, loss=0.936, v_num=.]
[2m[36m(pid=1409)[0m 
Epoch 3:  99%|█████████▉| 640/645 [09:10<00:04,  1.16it/s, loss=0.936, v_num=.]
[2m[36m(pid=1409)[0m 
Epoch 3: 100%|██████████| 645/645 [09:22<00:00,  1.15it/s, loss=0.936, v_num=.]


[2m[36m(pid=1409)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_LensResnet_tune_checkpoint_fcb1c_00000:
  auroc: 0.637090802192688
  date: 2021-07-09_00-55-28
  done: false
  experiment_id: 4f2c598ef4624d8c81f7a686e1d1a060
  hostname: 78a21a78acd2
  iterations_since_restore: 1
  loss: 0.8968604803085327
  node_ip: 172.28.0.2
  pid: 1409
  should_checkpoint: true
  time_since_restore: 575.7721858024597
  time_this_iter_s: 575.7721858024597
  time_total_s: 2167.9026420116425
  timestamp: 1625792128
  timesteps_since_restore: 0
  training_iteration: 4
  trial_id: fcb1c_00000
  
== Status ==
Memory usage on this node: 4.5/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 16.000: None | Iter 8.000: None | Iter 4.000: 0.637090802192688 | Iter 2.000: None | Iter 1.000: None
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.43 GiB heap, 0.0/3.72 GiB objects (0.0/1.0 GPU_group_0_dcb3bbbff5cd51a591059579b5ca40bb, 0.0/1.0 accelerator_type:T4, 0.0/2.0 CPU_group_dcb3bbbff5cd51a591059579b5ca40bb, 0.0/1.0 GPU_group_dcb3bbbff5cd51a591059

[2m[36m(pid=1409)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_LensResnet_tune_checkpoint_fcb1c_00000:
  auroc: 0.572493314743042
  date: 2021-07-09_01-04-57
  done: false
  experiment_id: 4f2c598ef4624d8c81f7a686e1d1a060
  hostname: 78a21a78acd2
  iterations_since_restore: 2
  loss: 1.5953080654144287
  node_ip: 172.28.0.2
  pid: 1409
  should_checkpoint: true
  time_since_restore: 1145.2105238437653
  time_this_iter_s: 569.4383380413055
  time_total_s: 2737.340980052948
  timestamp: 1625792697
  timesteps_since_restore: 0
  training_iteration: 5
  trial_id: fcb1c_00000
  
== Status ==
Memory usage on this node: 4.5/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 16.000: None | Iter 8.000: None | Iter 4.000: 0.637090802192688 | Iter 2.000: 0.572493314743042 | Iter 1.000: None
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.43 GiB heap, 0.0/3.72 GiB objects (0.0/1.0 GPU_group_dcb3bbbff5cd51a591059579b5ca40bb, 0.0/2.0 CPU_group_dcb3bbbff5cd51a591059579b5ca40bb, 0.0/1.0 accelerator_type:T4, 0.0/1.0 GPU_group_0_dcb3bbbf

[2m[36m(pid=1409)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_LensResnet_tune_checkpoint_fcb1c_00000:
  auroc: 0.654522180557251
  date: 2021-07-09_01-14-26
  done: false
  experiment_id: 4f2c598ef4624d8c81f7a686e1d1a060
  hostname: 78a21a78acd2
  iterations_since_restore: 3
  loss: 1.0105997323989868
  node_ip: 172.28.0.2
  pid: 1409
  should_checkpoint: true
  time_since_restore: 1714.112450838089
  time_this_iter_s: 568.9019269943237
  time_total_s: 3306.2429070472717
  timestamp: 1625793266
  timesteps_since_restore: 0
  training_iteration: 6
  trial_id: fcb1c_00000
  
== Status ==
Memory usage on this node: 4.5/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 16.000: None | Iter 8.000: None | Iter 4.000: 0.637090802192688 | Iter 2.000: 0.572493314743042 | Iter 1.000: 0.654522180557251
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.43 GiB heap, 0.0/3.72 GiB objects (0.0/2.0 CPU_group_dcb3bbbff5cd51a591059579b5ca40bb, 0.0/1.0 GPU_group_0_dcb3bbbff5cd51a591059579b5ca40bb, 0.0/2.0 CPU_group_0_dcb3bbbff5cd51a5910595

[2m[36m(pid=1409)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


Result for train_LensResnet_tune_checkpoint_fcb1c_00000:
  auroc: 0.7034956812858582
  date: 2021-07-09_01-23-55
  done: false
  experiment_id: 4f2c598ef4624d8c81f7a686e1d1a060
  hostname: 78a21a78acd2
  iterations_since_restore: 4
  loss: 1.1493151187896729
  node_ip: 172.28.0.2
  pid: 1409
  should_checkpoint: true
  time_since_restore: 2282.5348331928253
  time_this_iter_s: 568.4223823547363
  time_total_s: 3874.665289402008
  timestamp: 1625793835
  timesteps_since_restore: 0
  training_iteration: 7
  trial_id: fcb1c_00000
  
== Status ==
Memory usage on this node: 4.5/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 16.000: None | Iter 8.000: None | Iter 4.000: 0.637090802192688 | Iter 2.000: 0.572493314743042 | Iter 1.000: 0.654522180557251
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.43 GiB heap, 0.0/3.72 GiB objects (0.0/1.0 accelerator_type:T4, 0.0/1.0 GPU_group_0_dcb3bbbff5cd51a591059579b5ca40bb, 0.0/2.0 CPU_group_0_dcb3bbbff5cd51a591059579b5ca40bb, 0.0/2.0 CP

In [5]:
# Load the results of the LensResNet_J experiment from Drive, using 'auroc'
# (mode 'max') as the default metric for later queries.
answer = tune.Analysis('drive/MyDrive/Logs/LensResNet_J/', 'auroc', 'max')

In [16]:
answer.get_best_config()

{'batch_size': 128, 'learning_rate': 0.0001}

In [None]:
# Disabled manual approach: walk the log folder and add each file to a zip
# under its base name only.
# # create a ZipFile object
# with ZipFile('/content/sampleDir.zip', 'w') as zipObj:
#    # Iterate over all the files in directory
#    for folderName, subfolders, filenames in os.walk('drive/MyDrive/Logs/LensResNet_J'):
#        for filename in filenames:
#            #create complete filepath of file in directory
#            filePath = os.path.join(folderName, filename)
#            # Add file to zip
#            zipObj.write(filePath, basename(filePath))
# Zip the whole experiment folder into /content/folder.zip.
shutil.make_archive('/content/folder', 'zip', 'drive/MyDrive/Logs/LensResNet_J')

'/content/folder.zip'

In [None]:
# Delete a previously uploaded TensorBoard.dev experiment by id.
# NOTE(review): the recorded run was refused because the experiment is owned
# by a different user.
!tensorboard dev delete --experiment_id vbghQrP0RWWfyBY3KRip0g
# Disabled: arguments of the matching `tensorboard dev upload` command.
# upload --logdir ./drive/MyDrive/Logs/ \
# --name "My experiments" \
# # --description "Lightning_StackGAN"

2021-07-07 22:48:05.538117: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
Data for the "text" plugin is now uploaded to TensorBoard.dev! Note that uploaded data is public. If you do not want to upload data for this plugin, use the "--plugins" command line argument.
Cannot delete experiment vbghQrP0RWWfyBY3KRip0g because it is owned by a different user.


# Experimental

In [None]:
class Generator1(DCGANGenerator):
    """DCGAN generator conditioned on a class label.

    The noise vector is multiplied element-wise by a learned embedding of the
    label before it is passed through the inherited ``gen`` network.

    Args:
        latent_dim: Size of the noise vector and of each label embedding.
        feature_maps: Forwarded to ``DCGANGenerator``.
        image_channels: Forwarded to ``DCGANGenerator``.
        num_classes: Number of distinct labels.
    """

    def __init__(self, latent_dim: int, feature_maps: int, image_channels: int, num_classes: int):
        super().__init__(latent_dim, feature_maps, image_channels)

        # Replace the layer at position 1 of the first block with a no-op
        # (presumably a BatchNorm2d, as in the other blocks — confirm).
        self.gen[0][1] = nn.Identity()
        self.emb = nn.Embedding(num_classes, latent_dim)

    def forward(self, noise, labels = None):
        """Generate images from ``noise``, optionally for given ``labels``.

        Args:
            noise: Tensor whose last dimension is the latent dimension.
            labels: Integer class indices shaped like ``noise`` without its last
                dimension. When ``None``, labels are drawn uniformly at random.
        """
        if labels is None:
            # torch.randint takes (high, size). The original unpacked the batch
            # shape into the `high` slot, which raised a TypeError.
            labels = torch.randint(self.emb.num_embeddings, noise.shape[:-1],
                                   device=noise.device)
        inp = noise.mul(self.emb(labels))
        # Flatten the leading dimensions and add 1x1 spatial dimensions.
        return self.gen(inp.view(-1, inp.shape[-1], 1, 1))

In [None]:
class Stage1(DCGAN):
    """Placeholder subclass of pl_bolts' ``DCGAN`` (presumably the first GAN stage).

    TODO: not implemented yet — it currently adds nothing to ``DCGAN``. The
    original stub had no colon and no body, which is a SyntaxError.
    """

In [None]:
# Experiment: plug the conditional generator into a stock DCGAN and call it.
# NOTE(review): the recorded run of this cell raises a TypeError — presumably
# because DCGAN's forward does not accept a second (labels) argument; confirm.
p=DCGAN()
p.generator = Generator1(100, 64, 1, 3)
# NOTE(review): the labels are drawn with shape (32, 100); one label per noise
# vector would be shape (32,) — confirm which is intended.
p(torch.randn(32, 100), torch.randint(3, (32, 100)))

TypeError: ignored

In [None]:
dummy=Generator1(100, 64, 1, 3)

In [None]:
dummy

Generator1(
  (gen): Sequential(
    (0): Sequential(
      (0): ConvTranspose2d(100, 512, kernel_size=(4, 4), stride=(1, 1), bias=False)
      (1): Identity()
      (2): ReLU(inplace=True)
    )
    (1): Sequential(
      (0): ConvTranspose2d(512, 256, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
    )
    (2): Sequential(
      (0): ConvTranspose2d(256, 128, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
    )
    (3): Sequential(
      (0): ConvTranspose2d(128, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
    )
    (4): Sequential(
      (0): ConvTranspose2d(64, 1, kern

# StackGAN:
Here we define the GAN module that we shall use to generate representative images.

In [None]:
class Generator2(nn.Module):
    """Second-stage generator that refines and enlarges an image.

    Pipeline (see ``forward``): a strided convolution, six ``BasicBlock``
    residual units followed by a conv/BatchNorm/ReLU, a long skip connection
    that adds the pre-processed features back in, and finally two
    nearest-neighbour upsampling + convolution stages ending in ``Tanh``.

    Args:
        image_channels: channels of both the input and the output image.
        ngf: number of feature maps used throughout the residual trunk.
        ker: kernel size of the down-sampling convolution.
        strd: stride of the down-sampling convolution.
        res_ker: kernel size of every other convolution.
        res_strd: stride of every other convolution.
        res_pad: padding of every other convolution.
        image_width: side length of the generated image. The first upsampling
            stage targets ``image_width // 2``, the second ``image_width``.
            Defaults to 150, the value that used to be hard-coded.
    """

    def __init__(self, image_channels: int = 1, ngf: int = 128,
                 ker: int = 4, strd: int = 2,
                 res_ker: int = 3, res_strd: int = 1, res_pad: int = 1,
                 image_width: int = 150):
        super().__init__()

        pad = int((ker - 2)/2)
        # Down-sampling stage (e.g. 64 -> 32 with the default ker/strd).
        self.preprocessing = nn.Sequential(
            nn.Conv2d(image_channels, ngf, ker, strd, pad, bias=False),
            nn.ReLU(True)
        )
        # Residual trunk: six blocks that keep the channel count at ngf.
        # BasicBlock is defined outside this cell (presumably torchvision's
        # ResNet block -- confirm).
        self.residual = nn.Sequential(*[BasicBlock(ngf, ngf) for _ in range(6)])
        self.ending_residual = nn.Sequential(
            nn.Conv2d(ngf, ngf, res_ker, res_strd, res_pad, bias=False),
            nn.BatchNorm2d(ngf),
            nn.ReLU(True)
        )

        # The long skip connection from the pre-processing output is added in
        # forward(), between `ending_residual` and `main`.

        mode = 'nearest' # upscaling method is nearest-neighbour
        self.main = nn.Sequential(
            # -> image_width // 2
            nn.Upsample(image_width//2, mode=mode),
            nn.Conv2d(ngf, ngf*4, res_ker, res_strd, res_pad, bias=False),
            nn.BatchNorm2d(ngf*4),
            nn.ReLU(True),
            # -> image_width
            nn.Upsample(image_width, mode=mode),
            nn.Conv2d(ngf*4, image_channels, res_ker, res_strd, res_pad, bias=False),
            nn.Tanh()
        )

    def forward(self, in_x):
        """Refine ``in_x`` and return the enlarged image produced by ``main``."""
        x_p = self.preprocessing(in_x)
        x_r = self.residual(x_p)
        x_r = self.ending_residual(x_r)
        # large residual connection around the whole residual trunk
        x_f = x_r + x_p
        return self.main(x_f)

In [None]:
class StackGAN(pl.LightningModule):
    """Two-stage conditional GAN trained together with an auxiliary classifier.

    Sub-modules:
        G1 / D1: first-stage DCGAN generator (label-conditioned) and discriminator.
        G2 / D2: second-stage generator refining G1's output, and a DCGAN
            discriminator with one extra block inserted.
        R: ``LensResnet`` with ``num_classes = 4`` outputs; the extra class
            index (``num_classes``) is used as the target for generated images.
        pretrained: ``LensResnet`` restored from a checkpoint and used only in
            validation to score the generated images.

    Args:
        config: dict providing ``'feature_maps'`` and ``'learning_rate'``
            (also forwarded to ``LensResnet``).
        noise_size: length of the noise vector and of the label embedding.
        image_width: side length real images are resized to before D1.
        num_classes: number of real classes.
        image_channels: number of image channels.
        b1: first Adam beta used by every optimizer.
    """

    def __init__(self, config, noise_size: int = 100, image_width = 64,
                    num_classes: int = 3, image_channels: int = 1, b1: float = 0.5, **kwargs):
        super().__init__()
        # `ignore` takes argument *names*; the dict object itself was passed
        # before. `config` must therefore be supplied again when a checkpoint
        # is restored.
        self.save_hyperparameters(ignore='config')
        self.feature_maps = config['feature_maps']
        self.lr = config['learning_rate']
        # -------------------------------------
        # G1: stock DCGAN generator with the layer at index 1 of its first
        # block removed and a label embedding attached. The conditioning
        # itself is applied in self.forward.
        self.G1 = DCGANGenerator(self.hparams.noise_size, self.feature_maps, self.hparams.image_channels).apply(self._weights_init)
        l = list(self.G1.gen[0])
        del l[1]
        self.G1.gen[0] = nn.Sequential(*l)
        self.G1.add_module('label_emb', nn.Embedding(self.hparams.num_classes, self.hparams.noise_size))
        # ------------------------------------
        self.D1 = DCGANDiscriminator(self.feature_maps, self.hparams.image_channels).apply(self._weights_init)
        # -------------------------------------
        self.G2 = Generator2(self.hparams.image_channels, self.feature_maps).apply(self._weights_init)
        # -------------------------------------
        self.D2 = DCGANDiscriminator(self.feature_maps, self.hparams.image_channels)
        #  steps to mutate the instance, not the class definition
        extra = self.D2._make_disc_block(self.feature_maps * 2, self.feature_maps * 2)
        l = list(self.D2.disc)
        l.insert(2, extra)
        self.D2.disc = nn.Sequential(*l)
        self.D2.apply(self._weights_init)
        # No need for subclassing as the forward method need not be modified.
        # -------------------------------------
        self.R = LensResnet(config, num_classes = 4).apply(self._weights_init)
        # -------------------------------------
        self.pretrained = LensResnet(config)
        ckpt = pl_load(os.path.join(
            '/content/drive/MyDrive/Logs/tune_LensResnet_asha_model_j/train_LensResnet_tune_checkpoint_e38cb_00000_0_batch_size=128,learning_rate=0.001_2021-07-06_17-52-11/checkpoint_epoch=17-step=1406',
            # '/content/drive/MyDrive/Logs/tune_LensResnet_asha_model_f/train_LensResnet_tune_checkpoint_e32ba_00000_0_batch_size=64,learning_rate=0.0001_2021-07-06_03-33-10/checkpoint_epoch=14-step=4689',
            "checkpoint"),
            map_location=lambda storage, loc: storage)
        # `_load_model_state` builds and returns a *new* model, so calling it
        # and dropping the result left `self.pretrained` with its initial
        # weights. Load the checkpoint weights into this instance instead.
        self.pretrained.load_state_dict(ckpt['state_dict'])
        # -------------------------------------
        self.criterion1 = nn.BCELoss()
        self.criterion2 = nn.CrossEntropyLoss()

    @staticmethod
    def _weights_init(m):
        """Initialise Conv* weights ~ N(0, 0.02) and BatchNorm* weights ~ N(1, 0.02) with zero bias."""
        classname = m.__class__.__name__
        if classname.find("Conv") != -1:
            torch.nn.init.normal_(m.weight, 0.0, 0.02)
        elif classname.find("BatchNorm") != -1:
            torch.nn.init.normal_(m.weight, 1.0, 0.02)
            torch.nn.init.zeros_(m.bias)

    def forward(self, noise, labels = None):
        """Run both generator stages.

        Args:
            noise: tensor whose last dimension is the noise size.
            labels: integer class indices with shape ``noise.shape[:-1]``;
                drawn uniformly from the real classes when omitted.

        Returns:
            ``(out2, out1)``: the G2 output and the G1 output. G2 receives a
            detached copy of ``out1``, so ``out2`` carries no gradient to G1.
        """
        if labels is None:
            # torch.randint needs an upper bound and a size tuple; the previous
            # call unpacked the batch shape into the bound and passed no size.
            labels = torch.randint(self.hparams.num_classes, noise.shape[:-1], device=noise.device)
        inp = torch.mul(noise, self.G1.label_emb(labels))
        out1 = self.G1(inp.view(-1, inp.shape[-1], 1, 1))
        out2 = self.G2(out1.detach())
        return out2, out1

    def training_step(self, batch, batch_idx, optimizer_idx):
        """Compute the loss for the optimizer selected by ``optimizer_idx``.

        Index order follows ``configure_optimizers``:
        0 = G1, 1 = D1, 2 = G2, 3 = D2, 4 = R.
        """
        imgs, labels = batch
        temp2, temp1 = self(torch.randn(labels.shape[0], self.hparams.noise_size).type_as(imgs), labels)

        if optimizer_idx == 0:
            loss = self.criterion1(self.D1(temp1), torch.ones_like(labels, dtype=torch.float32))
            self.log('G1/train/loss/disc', loss)
            # Out-of-place add: an in-place `add_` would also modify the
            # tensor that was just handed to `self.log`.
            loss = loss + self.criterion2(self.R.backbone(self.G2(temp1)), labels)
            self.log('G1/train/loss/full', loss)

        elif optimizer_idx == 1:
            real, fake = self.D1(F.interpolate(imgs, self.hparams.image_width, mode='nearest')), self.D1(temp1.detach())
            prediction, target = torch.cat((real, fake)), torch.cat((torch.ones_like(real),torch.zeros_like(fake)))
            loss = self.criterion1(prediction, target)
            self.log('D1/train/loss', loss)

        elif optimizer_idx == 2:
            loss = self.criterion1(self.D2(temp2), torch.ones_like(labels, dtype=torch.float32))
            self.log('G2/train/loss/disc', loss)
            # Out-of-place for the same reason as in the G1 branch.
            loss = loss + self.criterion2(self.R.backbone(temp2), labels)
            self.log('G2/train/loss/full', loss)

        elif optimizer_idx == 3:
            real, fake = self.D2(imgs), self.D2(temp2.detach())
            prediction, target = torch.cat((real, fake)), torch.cat((torch.ones_like(real),torch.zeros_like(fake)))
            loss = self.criterion1(prediction, target)
            self.log('D2/train/loss', loss)

        elif optimizer_idx == 4:
            real, fake = self.R.backbone(imgs), self.R.backbone(temp2.detach())
            # Generated images are assigned the extra class index `num_classes`.
            prediction, target = torch.cat((real, fake)), torch.cat((labels, self.hparams.num_classes * torch.ones_like(labels)))
            loss = self.criterion2(prediction, target)
            self.log('R/train/loss', loss)
        
        return loss

    def configure_optimizers(self):
        """Return one Adam optimizer per trainable sub-module, in the order G1, D1, G2, D2, R."""
        opt_g1 = torch.optim.Adam(self.G1.parameters(), self.lr, (self.hparams.b1, 0.999))
        opt_d1 = torch.optim.Adam(self.D1.parameters(), self.lr, (self.hparams.b1, 0.999))
        opt_g2 = torch.optim.Adam(self.G2.parameters(), self.lr, (self.hparams.b1, 0.999))
        opt_d2 = torch.optim.Adam(self.D2.parameters(), self.lr, (self.hparams.b1, 0.999))
        opt_r = torch.optim.Adam(self.R.parameters(), self.lr, (self.hparams.b1, 0.999))
        return opt_g1, opt_d1, opt_g2, opt_d2, opt_r

    def validation_step(self, batch, batch_idx):
        """Generate images for the batch labels and score them with the pretrained classifier."""
        imgs, labels = batch
        temp2, _ = self(torch.randn(labels.shape[0], self.hparams.noise_size).type_as(imgs), labels)
        return {'pred': self.pretrained(temp2.detach()), 'target': labels}

    def validation_epoch_end(self, listofDicts):
        """Log the minimum per-class AUROC and a multi-class ROC figure for the epoch."""
        prediction, target = torch.cat([x["pred"] for x in listofDicts]), torch.cat([x["target"] for x in listofDicts])
        aurocTensor = tm.functional.auroc(prediction, target, num_classes=self.hparams.num_classes, average=None)
        self.log('Pre/val/auroc', aurocTensor.min())
        fprList, tprList, _ = tm.functional.roc(prediction, target, num_classes=self.hparams.num_classes)
        
        f = plt.figure()
        colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
        for i, color in zip(range(self.hparams.num_classes), colors):
            plt.plot(fprList[i].cpu(), tprList[i].cpu(), color=color,
                    label='ROC curve of class {0} (area = {1:0.2f})'
                    ''.format(i, aurocTensor[i].cpu()))
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Multi-class ROC')
        plt.legend(loc="lower right")

        # Save before logging: add_figure closes the figure by default.
        # os.path.join keeps the path valid whether or not the trial
        # directory ends with a separator.
        f.savefig(os.path.join(str(tune.get_trial_dir()), 'ROC_epoch_'+str(self.current_epoch)+'.pdf'))
        # Tag each figure with its epoch so TensorBoard keeps one per epoch.
        self.logger.experiment.add_figure('StackGAN/val/ROC', f, global_step=self.current_epoch)

# Tune StackGAN:
Here we tune the hyperparameters so that the generated images resemble the input images.

In [None]:
# __tune_train_checkpoint_begin
def train_StackGAN_tune_checkpoint(config,
                                   checkpoint_dir=None,
                                   num_epochs=10,
                                   num_gpus=1):
    """Ray Tune trainable: fit one StackGAN trial, optionally resuming.

    Args:
        config: hyperparameter dict handed to ``NpyDataModule`` and ``StackGAN``.
        checkpoint_dir: directory holding a file named ``checkpoint`` to
            resume from; a fresh model is built when it is falsy.
        num_epochs: value passed to the trainer as ``max_epochs``.
        num_gpus: GPUs for the trial; rounded up to an integer.
    """
    data_dir = os.path.expanduser("/content/images/")

    # Reports the logged metrics to Tune under short names and writes a
    # checkpoint file named "checkpoint".
    report_and_checkpoint = TuneReportCheckpointCallback(
        metrics={
            "lossG1": "G1/train/loss/full",
            "lossG2": "G2/train/loss/full",
            "lossD1": "D1/train/loss",
            "lossD2": "D2/train/loss",
            "lossR": "R/train/loss",
            "auroc": "Pre/val/auroc",
        },
        filename="checkpoint",
    )
    trial_logger = TensorBoardLogger(save_dir=tune.get_trial_dir(), name="", version=".")

    trainer = pl.Trainer(
        num_sanity_val_steps=-1,
        max_epochs=num_epochs,
        prepare_data_per_node=False,
        # If fractional GPUs passed in, convert to int.
        gpus=math.ceil(num_gpus),
        logger=trial_logger,
        callbacks=[report_and_checkpoint],
    )
    dm = NpyDataModule(config, data_dir)

    if not checkpoint_dir:
        model = StackGAN(config)
    else:
        # StackGAN.load_from_checkpoint led to errors here, so the checkpoint
        # is read manually and the model is rebuilt from it.
        ckpt_path = os.path.join(checkpoint_dir, "checkpoint")
        ckpt = pl_load(ckpt_path, map_location=lambda storage, loc: storage)
        model = StackGAN._load_model_state(ckpt, config=config)
        # Continue counting epochs from where the checkpoint stopped.
        trainer.current_epoch = ckpt["epoch"]

    trainer.fit(model, dm)
# __tune_train_checkpoint_end__


# __tune_asha_begin__
def tune_StackGAN_asha(num_samples=10, num_epochs=10, gpus_per_trial=1):
    """Search StackGAN hyperparameters with the ASHA scheduler.

    Args:
        num_samples: number of trials sampled from the search space.
        num_epochs: epochs per trial; also the scheduler's ``max_t``.
        gpus_per_trial: GPUs reserved for each trial.

    Prints the best configuration (maximum ``auroc``) when the run finishes.
    """
    search_space = {
        "learning_rate": tune.choice([1e-4]),
        "feature_maps": tune.choice([64]),
        "batch_size": tune.choice([128, 64]),
    }

    asha = ASHAScheduler(max_t=num_epochs, grace_period=1, reduction_factor=2)

    loss_columns = ["lossG1", "lossG2", "lossD1", "lossD2", "lossR"]
    cli_reporter = CLIReporter(
        parameter_columns=list(search_space),
        metric_columns=loss_columns + ["auroc", "training_iteration"],
    )

    # Bind the fixed arguments so Tune only has to supply `config`.
    trainable = tune.with_parameters(
        train_StackGAN_tune_checkpoint,
        num_epochs=num_epochs,
        num_gpus=gpus_per_trial,
    )
    trial_resources = {"cpu": 2, "gpu": gpus_per_trial}

    analysis = tune.run(
        trainable,
        name="tune_StackGAN_asha_model_j",
        metric="auroc",
        mode="max",
        config=search_space,
        resources_per_trial=trial_resources,
        num_samples=num_samples,
        local_dir='./drive/MyDrive/Logs',
        scheduler=asha,
        progress_reporter=cli_reporter,
        fail_fast=True,
        # Asks interactively whether to resume an existing experiment.
        resume='PROMPT',
    )

    print("Best hyperparameters found were: ", analysis.best_config)

# __tune_asha_end__


# __tune_pbt_begin__
def tune_StackGAN_pbt(num_samples=10, num_epochs=10, gpus_per_trial=1):
    """Tune StackGAN hyperparameters with Population Based Training.

    Args:
        num_samples: size of the trial population.
        num_epochs: epochs each trial is trained for.
        gpus_per_trial: GPUs reserved for each trial.

    Prints the best configuration (maximum ``auroc``) when the run finishes.
    """
    initial_config = {
        "learning_rate": 1e-4,
        "feature_maps": 64,
        "batch_size": 64,
    }

    # NOTE(review): mutating "feature_maps" changes layer shapes; confirm that
    # checkpoints exchanged between trials still load after such a mutation.
    mutations = {
        "learning_rate": [1e-4, 1e-3],
        "feature_maps": [64, 128],
        "batch_size": [32, 64, 128],
    }
    pbt = PopulationBasedTraining(
        perturbation_interval=4,
        hyperparam_mutations=mutations,
    )

    loss_columns = ["lossG1", "lossG2", "lossD1", "lossD2", "lossR"]
    cli_reporter = CLIReporter(
        parameter_columns=list(initial_config),
        metric_columns=loss_columns + ["auroc", "training_iteration"],
    )

    # Bind the fixed arguments so Tune only has to supply `config`.
    trainable = tune.with_parameters(
        train_StackGAN_tune_checkpoint,
        num_epochs=num_epochs,
        num_gpus=gpus_per_trial,
    )
    trial_resources = {"cpu": 2, "gpu": gpus_per_trial}

    analysis = tune.run(
        trainable,
        name="tune_StackGAN_pbt_model_j",
        metric="auroc",
        mode="max",
        resources_per_trial=trial_resources,
        config=initial_config,
        num_samples=num_samples,
        scheduler=pbt,
        progress_reporter=cli_reporter,
        local_dir='./drive/MyDrive/Logs',
        fail_fast=True,
    )

    print("Best hyperparameters found were: ", analysis.best_config)

# __tune_pbt_end__


if __name__ == "__main__":
    import argparse

    # Optional --smoke-test flag selects a short run through both schedulers.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--smoke-test", action="store_true", help="Finish quickly for testing")
    # parse_known_args returns (namespace, unrecognised arguments); the
    # unrecognised ones are discarded instead of raising an error.
    args, _ = parser.parse_known_args()

    if args.smoke_test:
        # One sample per scheduler, six epochs each.
        tune_StackGAN_asha(num_samples=1, num_epochs=6, gpus_per_trial=1)
        tune_StackGAN_pbt(num_samples=1, num_epochs=6, gpus_per_trial=1)
    else:
        # ASHA scheduler
        tune_StackGAN_asha(num_samples=2, num_epochs=1, gpus_per_trial=1)
        # Population based training
        # tune_StackGAN_pbt(num_samples=8, num_epochs=5, gpus_per_trial=1)

Resume from local directory? [y/N]: y


2021-07-07 15:06:21,237	INFO tune.py:467 -- TrialRunner resumed, ignoring new add_experiment.
[2m[36m(pid=2877)[0m 2021-07-07 15:06:21,249	ERROR worker.py:418 -- SystemExit was raised from the worker
[2m[36m(pid=2877)[0m Traceback (most recent call last):
[2m[36m(pid=2877)[0m   File "python/ray/_raylet.pyx", line 595, in ray._raylet.task_execution_handler
[2m[36m(pid=2877)[0m   File "python/ray/_raylet.pyx", line 453, in ray._raylet.execute_task
[2m[36m(pid=2877)[0m   File "python/ray/_raylet.pyx", line 490, in ray._raylet.execute_task
[2m[36m(pid=2877)[0m   File "python/ray/_raylet.pyx", line 497, in ray._raylet.execute_task
[2m[36m(pid=2877)[0m   File "python/ray/_raylet.pyx", line 501, in ray._raylet.execute_task
[2m[36m(pid=2877)[0m   File "python/ray/_raylet.pyx", line 451, in ray._raylet.execute_task.function_executor
[2m[36m(pid=2877)[0m   File "/usr/local/lib/python3.7/dist-packages/ray/_private/function_manager.py", line 563, in actor_method_executor

== Status ==
Memory usage on this node: 3.2/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 1.000: None
Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/7.54 GiB heap, 0.0/3.77 GiB objects (0.0/1.0 GPU_group_989fc9f1de0f0c7a947b8c51120efc30, 0.0/2.0 CPU_group_989fc9f1de0f0c7a947b8c51120efc30, 0.0/2.0 CPU_group_0_989fc9f1de0f0c7a947b8c51120efc30, 0.0/1.0 accelerator_type:T4, 0.0/1.0 GPU_group_0_989fc9f1de0f0c7a947b8c51120efc30)
Result logdir: /content/drive/MyDrive/Logs/tune_StackGAN_asha_model_j
Number of trials: 2/2 (2 PENDING)
+--------------------------------------------+----------+-------+-----------------+----------------+--------------+
| Trial name                                 | status   | loc   |   learning_rate |   feature_maps |   batch_size |
|--------------------------------------------+----------+-------+-----------------+----------------+--------------|
| train_StackGAN_tune_checkpoint_1f244_00000 | PENDING  |       |          0.0001 |             64 |          

[2m[36m(pid=2878)[0m GPU available: True, used: True
[2m[36m(pid=2878)[0m TPU available: False, using: 0 TPU cores


== Status ==
Memory usage on this node: 1.9/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 1.000: None
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.54 GiB heap, 0.0/3.77 GiB objects (0.0/1.0 GPU_group_989fc9f1de0f0c7a947b8c51120efc30, 0.0/2.0 CPU_group_989fc9f1de0f0c7a947b8c51120efc30, 0.0/2.0 CPU_group_0_989fc9f1de0f0c7a947b8c51120efc30, 0.0/1.0 accelerator_type:T4, 0.0/1.0 GPU_group_0_989fc9f1de0f0c7a947b8c51120efc30)
Result logdir: /content/drive/MyDrive/Logs/tune_StackGAN_asha_model_j
Number of trials: 2/2 (1 PENDING, 1 RUNNING)
+--------------------------------------------+----------+-------+-----------------+----------------+--------------+
| Trial name                                 | status   | loc   |   learning_rate |   feature_maps |   batch_size |
|--------------------------------------------+----------+-------+-----------------+----------------+--------------|
| train_StackGAN_tune_checkpoint_1f244_00000 | RUNNING  |       |          0.0001 |            

[2m[36m(pid=2878)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


== Status ==
Memory usage on this node: 3.1/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 1.000: None
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.54 GiB heap, 0.0/3.77 GiB objects (0.0/1.0 GPU_group_11925c938e061af52a49e839fa9f7d2e, 0.0/1.0 accelerator_type:T4, 0.0/2.0 CPU_group_0_11925c938e061af52a49e839fa9f7d2e, 0.0/2.0 CPU_group_11925c938e061af52a49e839fa9f7d2e, 0.0/1.0 GPU_group_0_11925c938e061af52a49e839fa9f7d2e)
Result logdir: /content/drive/MyDrive/Logs/tune_StackGAN_asha_model_j
Number of trials: 2/2 (1 PENDING, 1 RUNNING)
+--------------------------------------------+----------+-------+-----------------+----------------+--------------+
| Trial name                                 | status   | loc   |   learning_rate |   feature_maps |   batch_size |
|--------------------------------------------+----------+-------+-----------------+----------------+--------------|
| train_StackGAN_tune_checkpoint_1f244_00000 | RUNNING  |       |          0.0001 |            

[2m[36m(pid=2878)[0m 2021-07-07 15:06:32.107077: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
[2m[36m(pid=2878)[0m 
[2m[36m(pid=2878)[0m   | Name       | Type               | Params
[2m[36m(pid=2878)[0m --------------------------------------------------
[2m[36m(pid=2878)[0m 0 | G1         | DCGANGenerator     | 3.6 M 
[2m[36m(pid=2878)[0m 1 | D1         | DCGANDiscriminator | 2.8 M 
[2m[36m(pid=2878)[0m 2 | G2         | Generator2         | 632 K 
[2m[36m(pid=2878)[0m 3 | D2         | DCGANDiscriminator | 3.0 M 
[2m[36m(pid=2878)[0m 4 | R          | LensResnet         | 11.2 M
[2m[36m(pid=2878)[0m 5 | pretrained | LensResnet         | 11.2 M
[2m[36m(pid=2878)[0m 6 | criterion1 | BCELoss            | 0     
[2m[36m(pid=2878)[0m 7 | criterion2 | CrossEntropyLoss   | 0     
[2m[36m(pid=2878)[0m --------------------------------------------------
[2m[36m(pid=2878)[0m 32.3 M    

[2m[36m(pid=2878)[0m Validation sanity check: 0it [00:00, ?it/s]Validation sanity check:   0%|          | 0/118 [00:00<?, ?it/s]


[2m[36m(pid=2878)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


Validation sanity check:  17%|█▋        | 20/118 [00:11<00:54,  1.80it/s]
Validation sanity check:  34%|███▍      | 40/118 [00:22<00:43,  1.80it/s]
Validation sanity check:  51%|█████     | 60/118 [00:33<00:32,  1.78it/s]
Validation sanity check:  68%|██████▊   | 80/118 [00:45<00:21,  1.75it/s]
Validation sanity check:  85%|████████▍ | 100/118 [01:02<00:11,  1.54it/s]
Validation sanity check: 100%|██████████| 118/118 [01:17<00:00,  1.42it/s]


[2m[36m(pid=2878)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."


[2m[36m(pid=2878)[0m                                                                           Training: 0it [00:00, ?it/s]Training:   0%|          | 0/587 [00:00<?, ?it/s]Epoch 0:   0%|          | 0/587 [00:00<?, ?it/s] 




KeyboardInterrupt: ignored