In [2]:
from torch import load
from pprint import pprint
from numpy import where, array, asarray, nonzero
import os
import yaml
import wandb
import argparse
#import numpy as np
from pathlib import Path
from models import *
from experiment import VAEXperiment
#import torch.backends.cudnn as cudnn
from pytorch_lightning import Trainer
#from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.utilities.seed import seed_everything
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint, EarlyStopping
from dataset import VAEDataset

# Set KMP_DUPLICATE_LIB_OK for this process before any training code runs.
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime"
# initialisation error on this machine — confirm it is still required, since
# it only silences the error rather than removing the duplicate library.
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

In [15]:
# Load a saved Lightning checkpoint and show its top-level keys.
# The path is assembled with pathlib rather than written as a
# backslash-separated literal: the old literal relied on '\c' and '\V' being
# unrecognised escape sequences (deprecated in Python) and only resolved
# correctly on Windows.
checkpoint_path = Path('logs') / 'checkpoints' / 'VanillaVAEepoch=0-val_loss=0.82.ckpt'
checkpoint = load(checkpoint_path)
keys = checkpoint.keys()  # e.g. 'epoch', 'global_step', 'state_dict', 'loops', 'callbacks', ...
keys

dict_keys(['epoch', 'global_step', 'pytorch-lightning_version', 'state_dict', 'loops', 'callbacks', 'optimizer_states', 'lr_schedulers'])

In [3]:
# Command-line style configuration: a single optional --config/-c argument
# pointing at the YAML file that drives the whole experiment.
parser = argparse.ArgumentParser(description='Generic runner for VAE models')
parser.add_argument('--config',  '-c',
                dest="filename",
                metavar='FILE',
                help =  'path to the config file',
                default='configs/debug.yaml')

# An explicit empty argv keeps argparse from reading the kernel's own
# command line when running on Colab/Jupyter Notebook.
args = parser.parse_args([])

with open(args.filename, 'r') as file:
    try:
        config = yaml.safe_load(file)
    except yaml.YAMLError as exc:
        # Report the parse error, then stop: swallowing it would leave
        # `config` undefined and surface later as an unrelated NameError.
        print(exc)
        raise

# An empty YAML document parses to None; every later cell indexes `config`
# as a mapping, so fail here with a clear message instead.
if not isinstance(config, dict):
    raise ValueError(f"Config file {args.filename!r} did not contain a YAML mapping")

print(config)

{'model_params': {'name': 'VanillaVAE', 'in_channels': 3, 'latent_dim': 128}, 'data_params': {'data_path': 'D:/LaureaMagistrale/PrimoAnno/Secondo_Semestre/DeepLearning/datasets/CelebA', 'train_batch_size': 64, 'val_batch_size': 64, 'test_batch_size': 16, 'patch_size': 64, 'num_workers': 0}, 'exp_params': {'LR': 0.005, 'weight_decay': 0.003, 'scheduler_gamma': 0.95, 'kld_weight': 0.00025, 'manual_seed': 42}, 'trainer_params': {'accelerator': 'gpu', 'devices': 1, 'max_epochs': 5, 'max_time': '00:00:10:00'}, 'training_params': {'tune_lr': True, 'tune_batch_size': True, 'lr_min': 0.009, 'lr_max': 1, 'resume_train': False, 'ckpt_path': None, 'patience': 5, 'every_n_epochs': 1}, 'logging_params': {'enable_wandb': True, 'save_dir': 'logs/', 'manual_seed': 42, 'name': 'VanillaVAE'}}


In [4]:
# wandb setting
# The API key is taken from the WANDB_API_KEY environment variable instead of
# being hard-coded in the notebook. When the variable is unset, KEY is None
# and wandb.login() falls back to its own stored credentials / prompt.
# NOTE(review): the key that used to be committed on this line must be
# considered leaked and should be revoked in the wandb account settings.
KEY = os.environ.get("WANDB_API_KEY")
user = 'mattiacapparella'
project = "DLAI AA 2022 - Disentangling VAE"

# Ckpt formatting
model_ckpt_name = config['logging_params']['name']


# For reproducibility (second argument also seeds dataloader workers)
seed_everything(config['exp_params']['manual_seed'], workers=True)

# Build the model selected by name in the config, and the matching datamodule.
# pin_memory is enabled whenever the configured device count is non-zero.
model = vae_models[config['model_params']['name']](**config['model_params'])
data = VAEDataset(**config['data_params'], pin_memory=config['trainer_params']['devices'] != 0)
#data.setup() # CALL IT MANUALLY JUST FOR DEBUGGING. THEN IT CAN BE REMOVED

experiment = VAEXperiment(model, config['exp_params'])

if config['logging_params']['enable_wandb']:
    wandb.login(key = KEY)
    wandb.init(entity=user, project = project)
    wandb_logger = WandbLogger(
        project = project,
        save_dir = config['logging_params']['save_dir'],
        log_model = True
    )


# The tuning trainer is needed by both the LR-finder cell and the batch-size
# cell, so create it when either tuning step is enabled (previously it was
# only created for tune_lr, leaving `trainer` undefined for batch-size tuning).
training_params = config['training_params']
if training_params['tune_lr'] or training_params['tune_batch_size']:
    trainer = Trainer(**config['trainer_params'])

Global seed set to 42
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Currently logged in as: mattiacapparella (use `wandb login --relogin` to force relogin)
wandb: Appending key for api.wandb.ai to your netrc file: C:\Users\User/.netrc


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


## Tuning Initial LR 

In [None]:
# Sweep the learning rate between the bounds given in the config,
# using the tuner attached to the trainer.
tuning_cfg = config['training_params']
lr_finder = trainer.tuner.lr_find(
    experiment,
    datamodule=data,
    min_lr=tuning_cfg['lr_min'],
    max_lr=tuning_cfg['lr_max'],
)

# Dump the raw sweep results
print(lr_finder.results)

# Draw the sweep with the suggested learning rate marked
fig = lr_finder.plot(suggest=True)

In [23]:
import plotly.express as px
import plotly.graph_objects as go

In [47]:
# Learning rate suggested by the finder; suggestion() yields None when it
# cannot compute one (e.g. too few sweep points), so stop with a clear error
# instead of failing on the comparison below.
best_lr = lr_finder.suggestion()
if best_lr is None:
    raise RuntimeError("lr_finder.suggestion() returned no learning rate; rerun the LR sweep")

# Find the sweep point whose learning rate is closest to the suggestion.
# A nearest-value lookup gives the same index as an exact match when one
# exists, but does not break on duplicated or slightly-off float values the
# way `where(... == best_lr)[0].item()` does.
lr_values = lr_finder.results['lr']
best_lr_index = min(range(len(lr_values)), key=lambda i: abs(lr_values[i] - best_lr))
best_loss = lr_finder.results['loss'][best_lr_index]
best_lr, best_loss

(0.009956435938778324, 0.1353527790889947)

In [59]:
# Interactive version of the LR sweep plot: the loss curve as a line and the
# suggested learning rate as a single marker.
sweep = lr_finder.results

loss_curve = go.Scatter(
    x=sweep['lr'],
    y=sweep['loss'],
    name="lrVSloss",
    line_shape='linear',
    mode='lines',
)
suggested_point = go.Scatter(
    x=[best_lr],
    y=[best_loss],
    name="best_lr",
    mode='markers',
)

fig = go.Figure()
fig.add_trace(loss_curve)
fig.add_trace(suggested_point)

# Restrict the x axis to [0, 0.2]; update_layout returns the figure, so
# leaving it as the last expression renders the plot.
fig.update_layout(
    title='Learning Rate Finder',
    xaxis_title='LR',
    yaxis_title='Loss',
    xaxis={'range': [0, 0.2]},
)

## Tuning Batch Size

In [9]:
# Run the batch size scaler: binary-search mode starting from 64, with
# 50 steps per trial and at most 5 trials, adjusting the datamodule's
# `train_batch_size` attribute.
batch_size_search = dict(
    datamodule=data,
    mode='binsearch',
    init_val=64,
    steps_per_trial=50,
    max_trials=5,
    batch_arg_name='train_batch_size',
)
batch_size_finder = trainer.tuner.scale_batch_size(experiment, **batch_size_search)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(
  rank_zero_warn(
`Trainer.fit` stopped: `max_steps=50` reached.
Batch size 64 succeeded, trying batch size 128
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
`Trainer.fit` stopped: `max_steps=50` reached.
Batch size 128 succeeded, trying batch size 256
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
`Trainer.fit` stopped: `max_steps=50` reached.
Batch size 256 succeeded, trying batch size 512
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
`Trainer.fit` stopped: `max_steps=50` reached.
Batch size 512 succeeded, trying batch size 1024
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Batch size 1024 failed, trying batch size 768
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Batch size 768 failed, trying batch size 640
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


KeyboardInterrupt: 

In [6]:
# Show the installed PyTorch Lightning version; the tuner calls in this
# notebook (`trainer.tuner.lr_find`, `trainer.tuner.scale_batch_size`) were
# run against the version displayed here.
import pytorch_lightning as pl
pl.__version__ 

'1.7.7'