From 0c7f78f0336a1334aa3221f272a10df20842188d Mon Sep 17 00:00:00 2001
From: vfdev-5
Date: Sat, 6 Feb 2021 00:44:57 +0000
Subject: [PATCH 1/7] Updates for cifar10 example

---
 examples/contrib/cifar10/main.py | 33 +++++++++++++++++---------------
 1 file changed, 18 insertions(+), 15 deletions(-)

diff --git a/examples/contrib/cifar10/main.py b/examples/contrib/cifar10/main.py
index 1a5773e71e7b..07b2d2232bc3 100644
--- a/examples/contrib/cifar10/main.py
+++ b/examples/contrib/cifar10/main.py
@@ -5,6 +5,7 @@
 import torch
 import torch.nn as nn
 import torch.optim as optim
+from torch.cuda.amp import GradScaler, autocast
 import utils
 
 import ignite
@@ -152,6 +153,7 @@ def run(
     nproc_per_node=None,
     stop_iteration=None,
     with_clearml=False,
+    with_amp=False,
     **spawn_kwargs,
 ):
     """Main entry to train an model on CIFAR10 dataset.
@@ -179,6 +181,7 @@
             It can be 0 to disable it. Default, 15.
         stop_iteration (int, optional): iteration to stop the training. Can be used to check resume from checkpoint.
         with_clearml (bool): if True, experiment ClearML logger is setup. Default, False.
+        with_amp (bool): if True, enables native automatic mixed precision. Default, False.
         **spawn_kwargs: Other kwargs to spawn run in child processes: master_addr, master_port, node_rank, nnodes
 
     """
@@ -252,6 +255,10 @@ def log_basic_info(logger, config):
     logger.info(f"Train {config['model']} on CIFAR10")
     logger.info(f"- PyTorch version: {torch.__version__}")
     logger.info(f"- Ignite version: {ignite.__version__}")
+    if torch.cuda.is_available():
+        logger.info(f"- GPU Device: {torch.cuda.get_device_name(idist.get_local_rank())}")
+        logger.info(f"- CUDA version: {torch.version.cuda}")
+        logger.info(f"- CUDNN version: {torch.backends.cudnn.version()}")
 
     logger.info("\n")
     logger.info("Configuration:")
@@ -279,6 +286,9 @@ def create_trainer(model, optimizer, criterion, lr_scheduler, train_sampler, con
     # - RunningAverage` on `train_step` output
     # - Two progress bars on epochs and optionally on iterations
 
+    with_amp = config["with_amp"]
+    scaler = GradScaler(enabled=with_amp)
+
     def train_step(engine, batch):
 
         x, y = batch[0], batch[1]
@@ -288,28 +298,21 @@ def train_step(engine, batch):
         y = y.to(device, non_blocking=True)
 
         model.train()
-        # Supervised part
-        y_pred = model(x)
-        loss = criterion(y_pred, y)
+
+        with autocast(enabled=with_amp):
+            y_pred = model(x)
+            loss = criterion(y_pred, y)
 
         optimizer.zero_grad()
-        loss.backward()
-        optimizer.step()
-
-        # This can be helpful for XLA to avoid performance slow down if fetch loss.item() every iteration
-        if config["log_every_iters"] > 0 and (engine.state.iteration - 1) % config["log_every_iters"] == 0:
-            batch_loss = loss.item()
-            engine.state.saved_batch_loss = batch_loss
-        else:
-            batch_loss = engine.state.saved_batch_loss
+        scaler.scale(loss).backward()
+        scaler.step(optimizer)
+        scaler.update()
 
         return {
-            "batch loss": batch_loss,
+            "batch loss": loss.item(),
         }
 
     trainer = Engine(train_step)
-    trainer.state.saved_batch_loss = -1.0
-    trainer.state_dict_user_keys.append("saved_batch_loss")
     trainer.logger = logger
 
     to_save = {"trainer": trainer, "model": model, "optimizer": optimizer, "lr_scheduler": lr_scheduler}

From 1bb5fb126a7c1cd754939e21c95a5f961e901236 Mon Sep 17 00:00:00 2001
From: vfdev-5
Date: Sat, 6 Feb 2021 00:44:57 +0000
Subject: [PATCH 2/7] Updates for cifar10 example

---
 examples/contrib/cifar10/main.py | 39 +++++++++++++++++---------------
 1 file changed, 21 insertions(+), 18 deletions(-)

diff --git a/examples/contrib/cifar10/main.py b/examples/contrib/cifar10/main.py
index 1a5773e71e7b..226a0a4162c9 100644
--- a/examples/contrib/cifar10/main.py
+++ b/examples/contrib/cifar10/main.py
@@ -5,6 +5,7 @@
 import torch
 import torch.nn as nn
 import torch.optim as optim
+from torch.cuda.amp import GradScaler, autocast
 import utils
 
 import ignite
@@ -102,13 +103,13 @@ def run_validation(engine):
     evaluators = {"training": train_evaluator, "test": evaluator}
     tb_logger = common.setup_tb_logging(output_path, trainer, optimizer, evaluators=evaluators)
 
-    # Store 3 best models by validation accuracy:
+    # Store 3 best models by validation accuracy starting from num_epochs / 2:
     common.gen_save_best_models_by_val_score(
         save_handler=get_save_handler(config),
         evaluator=evaluator,
         models={"model": model},
         metric_name="accuracy",
-        n_saved=3,
+        n_saved=2,
         trainer=trainer,
         tag="test",
     )
@@ -145,13 +146,14 @@ def run(
     learning_rate=0.4,
     num_warmup_epochs=4,
     validate_every=3,
-    checkpoint_every=200,
+    checkpoint_every=1000,
     backend=None,
     resume_from=None,
     log_every_iters=15,
     nproc_per_node=None,
     stop_iteration=None,
     with_clearml=False,
+    with_amp=False,
     **spawn_kwargs,
 ):
     """Main entry to train an model on CIFAR10 dataset.
@@ -179,6 +181,7 @@
             It can be 0 to disable it. Default, 15.
         stop_iteration (int, optional): iteration to stop the training. Can be used to check resume from checkpoint.
         with_clearml (bool): if True, experiment ClearML logger is setup. Default, False.
+        with_amp (bool): if True, enables native automatic mixed precision. Default, False.
         **spawn_kwargs: Other kwargs to spawn run in child processes: master_addr, master_port, node_rank, nnodes
 
     """
@@ -252,6 +255,10 @@ def log_basic_info(logger, config):
     logger.info(f"Train {config['model']} on CIFAR10")
     logger.info(f"- PyTorch version: {torch.__version__}")
     logger.info(f"- Ignite version: {ignite.__version__}")
+    if torch.cuda.is_available():
+        logger.info(f"- GPU Device: {torch.cuda.get_device_name(idist.get_local_rank())}")
+        logger.info(f"- CUDA version: {torch.version.cuda}")
+        logger.info(f"- CUDNN version: {torch.backends.cudnn.version()}")
 
     logger.info("\n")
     logger.info("Configuration:")
@@ -279,6 +286,9 @@ def create_trainer(model, optimizer, criterion, lr_scheduler, train_sampler, con
     # - RunningAverage` on `train_step` output
     # - Two progress bars on epochs and optionally on iterations
 
+    with_amp = config["with_amp"]
+    scaler = GradScaler(enabled=with_amp)
+
     def train_step(engine, batch):
 
         x, y = batch[0], batch[1]
@@ -288,28 +298,21 @@ def train_step(engine, batch):
         y = y.to(device, non_blocking=True)
 
         model.train()
-        # Supervised part
-        y_pred = model(x)
-        loss = criterion(y_pred, y)
+
+        with autocast(enabled=with_amp):
+            y_pred = model(x)
+            loss = criterion(y_pred, y)
 
         optimizer.zero_grad()
-        loss.backward()
-        optimizer.step()
-
-        # This can be helpful for XLA to avoid performance slow down if fetch loss.item() every iteration
-        if config["log_every_iters"] > 0 and (engine.state.iteration - 1) % config["log_every_iters"] == 0:
-            batch_loss = loss.item()
-            engine.state.saved_batch_loss = batch_loss
-        else:
-            batch_loss = engine.state.saved_batch_loss
+        scaler.scale(loss).backward()
+        scaler.step(optimizer)
+        scaler.update()
 
         return {
-            "batch loss": batch_loss,
+            "batch loss": loss.item(),
         }
 
     trainer = Engine(train_step)
-    trainer.state.saved_batch_loss = -1.0
-    trainer.state_dict_user_keys.append("saved_batch_loss")
     trainer.logger = logger
 
     to_save = {"trainer": trainer, "model": model, "optimizer": optimizer, "lr_scheduler": lr_scheduler}

From 8c928897e106e6cc43258bafbf30e46867832a55 Mon Sep 17 00:00:00 2001
From: vfdev-5
Date: Sun, 7 Feb 2021 00:50:02 +0000
Subject: [PATCH 3/7] More updates

---
 examples/contrib/cifar10/main.py | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/examples/contrib/cifar10/main.py b/examples/contrib/cifar10/main.py
index 226a0a4162c9..a8e90b515972 100644
--- a/examples/contrib/cifar10/main.py
+++ b/examples/contrib/cifar10/main.py
@@ -13,7 +13,7 @@
 from ignite.contrib.engines import common
 from ignite.contrib.handlers import PiecewiseLinear
 from ignite.engine import Engine, Events, create_supervised_evaluator
-from ignite.handlers import Checkpoint, DiskSaver
+from ignite.handlers import Checkpoint, DiskSaver, global_step_from_engine
 from ignite.metrics import Accuracy, Loss
 from ignite.utils import manual_seed, setup_logger
 
@@ -103,16 +103,17 @@ def run_validation(engine):
     evaluators = {"training": train_evaluator, "test": evaluator}
     tb_logger = common.setup_tb_logging(output_path, trainer, optimizer, evaluators=evaluators)
 
-    # Store 3 best models by validation accuracy starting from num_epochs / 2:
-    common.gen_save_best_models_by_val_score(
-        save_handler=get_save_handler(config),
-        evaluator=evaluator,
-        models={"model": model},
-        metric_name="accuracy",
+    # Store 2 best models by validation accuracy starting from num_epochs / 2:
+    best_model_handler = Checkpoint(
+        {"model": model},
+        get_save_handler(config),
+        filename_prefix="best",
         n_saved=2,
-        trainer=trainer,
-        tag="test",
+        global_step_transform=global_step_from_engine(trainer),
+        score_name="test_accuracy",
+        score_function=common.get_default_score_fn("accuracy")
     )
+    evaluator.add_event_handler(Events.COMPLETED(lambda *_: trainer.state.epoch > config["num_epochs"] // 2), best_model_handler)
 
     # In order to check training resuming we can stop training on a given iteration
     if config["stop_iteration"] is not None:
@@ -125,9 +126,8 @@ def _():
     try:
         trainer.run(train_loader, max_epochs=config["num_epochs"])
     except Exception as e:
-        import traceback
-
-        print(traceback.format_exc())
+        logger.exception("")
+        raise e
 
     if rank == 0:
         tb_logger.close()
@@ -248,7 +248,7 @@ def initialize(config):
 
 def log_metrics(logger, epoch, elapsed, tag, metrics):
     metrics_output = "\n".join([f"\t{k}: {v}" for k, v in metrics.items()])
-    logger.info(f"\nEpoch {epoch} - Evaluation time (seconds): {int(elapsed)} - {tag} metrics:\n {metrics_output}")
+    logger.info(f"\nEpoch {epoch} - Evaluation time (seconds): {elapsed:.2f} - {tag} metrics:\n {metrics_output}")
 
 
 def log_basic_info(logger, config):

From c790a5b269fd46b65c1d63be6818200308287a5b Mon Sep 17 00:00:00 2001
From: vfdev-5
Date: Thu, 11 Feb 2021 20:46:33 +0000
Subject: [PATCH 4/7] Updated code

---
 examples/contrib/cifar10/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/contrib/cifar10/main.py b/examples/contrib/cifar10/main.py
index a8e90b515972..bc27c82e31ae 100644
--- a/examples/contrib/cifar10/main.py
+++ b/examples/contrib/cifar10/main.py
@@ -111,7 +111,7 @@ def run_validation(engine):
         n_saved=2,
         global_step_transform=global_step_from_engine(trainer),
         score_name="test_accuracy",
-        score_function=common.get_default_score_fn("accuracy")
+        score_function=Checkpoint.get_default_score_fn("accuracy")
     )
     evaluator.add_event_handler(Events.COMPLETED(lambda *_: trainer.state.epoch > config["num_epochs"] // 2), best_model_handler)
 

From 863b35a05de73d4bf834b3f108a15e93545c8d0f Mon Sep 17 00:00:00 2001
From: vfdev-5
Date: Thu, 11 Feb 2021 21:18:25 +0000
Subject: [PATCH 5/7] Fixed code-formatting

---
 examples/contrib/cifar10/main.py     |  14 ++--
 examples/contrib/cifar10_qat/main.py | 111 +++++++++++++++++--------
 2 files changed, 87 insertions(+), 38 deletions(-)

diff --git a/examples/contrib/cifar10/main.py b/examples/contrib/cifar10/main.py
index bc27c82e31ae..4838f2e096be 100644
--- a/examples/contrib/cifar10/main.py
+++ b/examples/contrib/cifar10/main.py
@@ -5,8 +5,8 @@
 import torch
 import torch.nn as nn
 import torch.optim as optim
-from torch.cuda.amp import GradScaler, autocast
 import utils
+from torch.cuda.amp import GradScaler, autocast
 
 import ignite
 import ignite.distributed as idist
@@ -77,8 +77,8 @@ def training(local_rank, config):
 
     # Let's now setup evaluator engine to perform model's validation and compute metrics
     metrics = {
-        "accuracy": Accuracy(),
-        "loss": Loss(criterion),
+        "Accuracy": Accuracy(),
+        "Loss": Loss(criterion),
     }
 
     # We define two evaluators as they wont have exactly similar roles:
@@ -111,9 +111,11 @@ def run_validation(engine):
         n_saved=2,
         global_step_transform=global_step_from_engine(trainer),
         score_name="test_accuracy",
-        score_function=Checkpoint.get_default_score_fn("accuracy")
+        score_function=Checkpoint.get_default_score_fn("accuracy"),
+    )
+    evaluator.add_event_handler(
+        Events.COMPLETED(lambda *_: trainer.state.epoch > config["num_epochs"] // 2), best_model_handler
     )
-    evaluator.add_event_handler(Events.COMPLETED(lambda *_: trainer.state.epoch > config["num_epochs"] // 2), best_model_handler)
 
     # In order to check training resuming we can stop training on a given iteration
     if config["stop_iteration"] is not None:
@@ -298,7 +300,7 @@ def train_step(engine, batch):
         y = y.to(device, non_blocking=True)
 
         model.train()
-        
+
         with autocast(enabled=with_amp):
             y_pred = model(x)
             loss = criterion(y_pred, y)
diff --git a/examples/contrib/cifar10_qat/main.py b/examples/contrib/cifar10_qat/main.py
index 5e612bfd1184..bac448f846e6 100644
--- a/examples/contrib/cifar10_qat/main.py
+++ b/examples/contrib/cifar10_qat/main.py
@@ -6,13 +6,14 @@
 import torch.nn as nn
 import torch.optim as optim
 import utils
+from torch.cuda.amp import GradScaler, autocast
 
 import ignite
 import ignite.distributed as idist
 from ignite.contrib.engines import common
 from ignite.contrib.handlers import PiecewiseLinear
 from ignite.engine import Engine, Events, create_supervised_evaluator
-from ignite.handlers import Checkpoint, DiskSaver
+from ignite.handlers import Checkpoint, DiskSaver, global_step_from_engine
 from ignite.metrics import Accuracy, Loss
 from ignite.utils import manual_seed, setup_logger
 
@@ -31,16 +32,37 @@ def training(local_rank, config):
 
     if rank == 0:
         now = datetime.now().strftime("%Y%m%d-%H%M%S")
 
-        folder_name = "{}_backend-{}-{}_{}".format(config["model"], idist.backend(), idist.get_world_size(), now)
+        folder_name = f"{config['model']}_backend-{idist.backend()}-{idist.get_world_size()}_{now}"
         output_path = Path(output_path) / folder_name
         if not output_path.exists():
             output_path.mkdir(parents=True)
         config["output_path"] = output_path.as_posix()
-        logger.info("Output path: {}".format(config["output_path"]))
+        logger.info(f"Output path: {config['output_path']}")
 
         if "cuda" in device.type:
             config["cuda device name"] = torch.cuda.get_device_name(local_rank)
 
+        if config["with_clearml"]:
+            try:
+                from clearml import Task
+            except ImportError:
+                # Backwards-compatibility for legacy Trains SDK
+                from trains import Task
+
+            task = Task.init("CIFAR10-Training", task_name=output_path.stem)
+            task.connect_configuration(config)
+            # Log hyper parameters
+            hyper_params = [
+                "model",
+                "batch_size",
+                "momentum",
+                "weight_decay",
+                "num_epochs",
+                "learning_rate",
+                "num_warmup_epochs",
+            ]
+            task.connect({k: config[k] for k in hyper_params})
+
     # Setup dataflow, model, optimizer, criterion
     train_loader, test_loader = get_dataflow(config)
@@ -78,15 +100,18 @@ def run_validation(engine):
     evaluators = {"training": train_evaluator, "test": evaluator}
     tb_logger = common.setup_tb_logging(output_path, trainer, optimizer, evaluators=evaluators)
 
-    # Store 3 best models by validation accuracy:
-    common.save_best_model_by_val_score(
-        output_path=config["output_path"],
-        evaluator=evaluator,
-        model=model,
-        metric_name="Accuracy",
-        n_saved=1,
-        trainer=trainer,
-        tag="test",
+    # Store 2 best models by validation accuracy starting from num_epochs / 2:
+    best_model_handler = Checkpoint(
+        {"model": model},
+        get_save_handler(config),
+        filename_prefix="best",
+        n_saved=2,
+        global_step_transform=global_step_from_engine(trainer),
+        score_name="test_accuracy",
+        score_function=Checkpoint.get_default_score_fn("accuracy"),
+    )
+    evaluator.add_event_handler(
+        Events.COMPLETED(lambda *_: trainer.state.epoch > config["num_epochs"] // 2), best_model_handler
     )
 
     trainer.run(train_loader, max_epochs=config["num_epochs"])
@@ -108,11 +133,13 @@ def run(
     learning_rate=0.4,
     num_warmup_epochs=4,
     validate_every=3,
-    checkpoint_every=200,
+    checkpoint_every=1000,
     backend=None,
     resume_from=None,
     log_every_iters=15,
     nproc_per_node=None,
+    with_clearml=False,
+    with_amp=False,
     **spawn_kwargs,
 ):
     """Main entry to train an model on CIFAR10 dataset.
@@ -138,6 +165,8 @@ def run(
         resume_from (str, optional): path to checkpoint to use to resume the training from. Default, None.
         log_every_iters (int): argument to log batch loss every ``log_every_iters`` iterations.
             It can be 0 to disable it. Default, 15.
+        with_clearml (bool): if True, experiment ClearML logger is setup. Default, False.
+        with_amp (bool): if True, enables native automatic mixed precision. Default, False.
         **spawn_kwargs: Other kwargs to spawn run in child processes: master_addr, master_port, node_rank, nnodes
 
     """
@@ -149,10 +178,8 @@ def run(
     spawn_kwargs["nproc_per_node"] = nproc_per_node
 
     with idist.Parallel(backend=backend, **spawn_kwargs) as parallel:
-        try:
-            parallel.run(training, config)
-        except Exception as e:
-            raise e
+
+        parallel.run(training, config)
 
 
 def get_dataflow(config):
@@ -167,7 +194,7 @@ def get_dataflow(config):
     # Ensure that only rank 0 download the dataset
     idist.barrier()
 
-    # Setup data loader also adapted to distributed config
+    # Setup data loader also adapted to distributed config: nccl, gloo, xla-tpu
     train_loader = idist.auto_dataloader(
         train_dataset, batch_size=config["batch_size"], num_workers=config["num_workers"], shuffle=True, drop_last=True,
     )
@@ -180,6 +207,7 @@ def get_dataflow(config):
 
 def initialize(config):
     model = utils.get_model(config["model"])
+    # Adapt model for distributed settings if configured
     model = idist.auto_model(model, find_unused_parameters=True)
 
     optimizer = optim.SGD(
@@ -205,24 +233,28 @@ def initialize(config):
 
 def log_metrics(logger, epoch, elapsed, tag, metrics):
     metrics_output = "\n".join([f"\t{k}: {v}" for k, v in metrics.items()])
-    logger.info(f"\nEpoch {epoch} - Time taken (seconds) : {elapsed:.02f} - {tag} metrics:\n {metrics_output}")
+    logger.info(f"\nEpoch {epoch} - Evaluation time (seconds): {elapsed:.2f} - {tag} metrics:\n {metrics_output}")
 
 
 def log_basic_info(logger, config):
-    logger.info("Quantization Aware Training {} on CIFAR10".format(config["model"]))
-    logger.info("- PyTorch version: {}".format(torch.__version__))
-    logger.info("- Ignite version: {}".format(ignite.__version__))
+    logger.info(f"Quantization Aware Training {config['model']} on CIFAR10")
+    logger.info(f"- PyTorch version: {torch.__version__}")
+    logger.info(f"- Ignite version: {ignite.__version__}")
+    if torch.cuda.is_available():
+        logger.info(f"- GPU Device: {torch.cuda.get_device_name(idist.get_local_rank())}")
+        logger.info(f"- CUDA version: {torch.version.cuda}")
+        logger.info(f"- CUDNN version: {torch.backends.cudnn.version()}")
 
     logger.info("\n")
     logger.info("Configuration:")
 
     for key, value in config.items():
-        logger.info("\t{}: {}".format(key, value))
+        logger.info(f"\t{key}: {value}")
 
     logger.info("\n")
 
     if idist.get_world_size() > 1:
         logger.info("\nDistributed setting:")
-        logger.info("\tbackend: {}".format(idist.backend()))
-        logger.info("\tworld size: {}".format(idist.get_world_size()))
+        logger.info(f"\tbackend: {idist.backend()}")
+        logger.info(f"\tworld size: {idist.get_world_size()}")
         logger.info("\n")
@@ -239,6 +271,9 @@ def create_trainer(model, optimizer, criterion, lr_scheduler, train_sampler, con
     # - RunningAverage` on `train_step` output
     # - Two progress bars on epochs and optionally on iterations
 
+    with_amp = config["with_amp"]
+    scaler = GradScaler(enabled=with_amp)
+
     def train_step(engine, batch):
 
         x, y = batch[0], batch[1]
@@ -248,12 +283,15 @@ def train_step(engine, batch):
         y = y.to(device, non_blocking=True)
         model.train()
-        y_pred = model(x)
-        loss = criterion(y_pred, y)
+
+        with autocast(enabled=with_amp):
+            y_pred = model(x)
+            loss = criterion(y_pred, y)
 
         optimizer.zero_grad()
-        loss.backward()
-        optimizer.step()
+        scaler.scale(loss).backward()
+        scaler.step(optimizer)
+        scaler.update()
 
         return {
             "batch loss": loss.item(),
         }
@@ -272,7 +310,7 @@ def train_step(engine, batch):
         train_sampler=train_sampler,
         to_save=to_save,
         save_every_iters=config["checkpoint_every"],
-        output_path=config["output_path"],
+        save_handler=get_save_handler(config),
         lr_scheduler=lr_scheduler,
         output_names=metric_names if config["log_every_iters"] > 0 else None,
         with_pbars=False,
@@ -282,13 +320,22 @@ def train_step(engine, batch):
     resume_from = config["resume_from"]
     if resume_from is not None:
         checkpoint_fp = Path(resume_from)
-        assert checkpoint_fp.exists(), "Checkpoint '{}' is not found".format(checkpoint_fp.as_posix())
-        logger.info("Resume from a checkpoint: {}".format(checkpoint_fp.as_posix()))
+        assert checkpoint_fp.exists(), f"Checkpoint '{checkpoint_fp.as_posix()}' is not found"
+        logger.info(f"Resume from a checkpoint: {checkpoint_fp.as_posix()}")
         checkpoint = torch.load(checkpoint_fp.as_posix(), map_location="cpu")
         Checkpoint.load_objects(to_load=to_save, checkpoint=checkpoint)
 
     return trainer
 
 
+def get_save_handler(config):
+    if config["with_clearml"]:
+        from ignite.contrib.handlers.clearml_logger import ClearMLSaver
+
+        return ClearMLSaver(dirname=config["output_path"])
+
+    return DiskSaver(config["output_path"], require_empty=False)
+
+
 if __name__ == "__main__":
     fire.Fire({"run": run})

From f226cf0a1500214f746e45bde2790d01f7b299bc Mon Sep 17 00:00:00 2001
From: vfdev-5
Date: Thu, 11 Feb 2021 22:34:15 +0000
Subject: [PATCH 6/7] Fixed typo and failing CI

---
 .circleci/config.yml                 | 10 +++++-----
 examples/contrib/cifar10/main.py     |  4 ++--
 examples/contrib/cifar10_qat/main.py |  2 +-
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index b62b52c3dd78..6a9e1af11565 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -256,7 +256,7 @@ jobs:
             export example_path="examples/contrib/cifar10"
             # initial run
             export stop_cmd="--stop_iteration=500"
-            export test_cmd="CI=1 python ${example_path}/main.py run"
+            export test_cmd="CI=1 python ${example_path}/main.py run --checkpoint_every=200"
             docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}"
             # resume
             export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-None-1_stop-on-500/training_checkpoint_400.pt"
@@ -268,7 +268,7 @@ jobs:
             export example_path="examples/contrib/cifar10"
             # initial run
             export stop_cmd="--stop_iteration=500"
-            export test_cmd="CI=1 python -u -m torch.distributed.launch --nproc_per_node=2 --use_env ${example_path}/main.py run --backend=nccl"
+            export test_cmd="CI=1 python -u -m torch.distributed.launch --nproc_per_node=2 --use_env ${example_path}/main.py run --backend=nccl --checkpoint_every=200"
             docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}"
             # resume
             export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt"
@@ -280,7 +280,7 @@ jobs:
             export example_path="examples/contrib/cifar10"
             # initial run
             export stop_cmd="--stop_iteration=500"
-            export test_cmd="CI=1 python -u ${example_path}/main.py run --backend=nccl --nproc_per_node=2"
+            export test_cmd="CI=1 python -u ${example_path}/main.py run --backend=nccl --nproc_per_node=2 --checkpoint_every=200"
             docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}"
             # resume
             export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt"
@@ -334,7 +334,7 @@ jobs:
             export example_path="examples/contrib/cifar10"
             # initial run
             export stop_cmd="--stop_iteration=500"
-            export test_cmd="cd ${example_path} && CI=1 horovodrun -np 2 python -u main.py run --backend=horovod"
+            export test_cmd="cd ${example_path} && CI=1 horovodrun -np 2 python -u main.py run --backend=horovod --checkpoint_every=200"
             docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}"
             # resume
             export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-horovod-2_stop-on-500/training_checkpoint_400.pt"
@@ -346,7 +346,7 @@ jobs:
             export example_path="examples/contrib/cifar10"
             # initial run
             export stop_cmd="--stop_iteration=500"
-            export test_cmd="cd ${example_path} && CI=1 python -u main.py run --backend=horovod --nproc_per_node=2"
+            export test_cmd="cd ${example_path} && CI=1 python -u main.py run --backend=horovod --nproc_per_node=2 --checkpoint_every=200"
             docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}"
             # resume
             export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-horovod-2_stop-on-500/training_checkpoint_400.pt"
diff --git a/examples/contrib/cifar10/main.py b/examples/contrib/cifar10/main.py
index 4838f2e096be..ff426109889d 100644
--- a/examples/contrib/cifar10/main.py
+++ b/examples/contrib/cifar10/main.py
@@ -111,7 +111,7 @@ def run_validation(engine):
         n_saved=2,
         global_step_transform=global_step_from_engine(trainer),
         score_name="test_accuracy",
-        score_function=Checkpoint.get_default_score_fn("accuracy"),
+        score_function=Checkpoint.get_default_score_fn("Accuracy"),
     )
     evaluator.add_event_handler(
         Events.COMPLETED(lambda *_: trainer.state.epoch > config["num_epochs"] // 2), best_model_handler
@@ -173,7 +173,7 @@ def run(
         learning_rate (float): peak of piecewise linear learning rate scheduler. Default, 0.4.
         num_warmup_epochs (int): number of warm-up epochs before learning rate decay. Default, 4.
         validate_every (int): run model's validation every ``validate_every`` epochs. Default, 3.
-        checkpoint_every (int): store training checkpoint every ``checkpoint_every`` iterations. Default, 200.
+        checkpoint_every (int): store training checkpoint every ``checkpoint_every`` iterations. Default, 1000.
         backend (str, optional): backend to use for distributed configuration. Possible values: None, "nccl", "xla-tpu",
             "gloo" etc. Default, None.
         nproc_per_node (int, optional): optional argument to setup number of processes per node. It is useful,
diff --git a/examples/contrib/cifar10_qat/main.py b/examples/contrib/cifar10_qat/main.py
index bac448f846e6..ec585e8e85d8 100644
--- a/examples/contrib/cifar10_qat/main.py
+++ b/examples/contrib/cifar10_qat/main.py
@@ -108,7 +108,7 @@ def run_validation(engine):
         n_saved=2,
         global_step_transform=global_step_from_engine(trainer),
         score_name="test_accuracy",
-        score_function=Checkpoint.get_default_score_fn("accuracy"),
+        score_function=Checkpoint.get_default_score_fn("Accuracy"),
     )
     evaluator.add_event_handler(
         Events.COMPLETED(lambda *_: trainer.state.epoch > config["num_epochs"] // 2), best_model_handler

From 4379f7823b667c801533270e3faacd6e0e5da034 Mon Sep 17 00:00:00 2001
From: vfdev-5
Date: Fri, 12 Feb 2021 00:36:00 +0000
Subject: [PATCH 7/7] Fixed hvd spawn fail and better synced qat code

---
 examples/contrib/cifar10/main.py     |  6 +++++-
 examples/contrib/cifar10_qat/main.py | 12 ++++++++++--
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/examples/contrib/cifar10/main.py b/examples/contrib/cifar10/main.py
index ff426109889d..c27c80906444 100644
--- a/examples/contrib/cifar10/main.py
+++ b/examples/contrib/cifar10/main.py
@@ -258,9 +258,13 @@ def log_basic_info(logger, config):
     logger.info(f"- PyTorch version: {torch.__version__}")
     logger.info(f"- Ignite version: {ignite.__version__}")
     if torch.cuda.is_available():
+        # explicitly import cudnn as
+        # torch.backends.cudnn can not be pickled with hvd spawning procs
+        from torch.backends import cudnn
+
         logger.info(f"- GPU Device: {torch.cuda.get_device_name(idist.get_local_rank())}")
         logger.info(f"- CUDA version: {torch.version.cuda}")
-        logger.info(f"- CUDNN version: {torch.backends.cudnn.version()}")
+        logger.info(f"- CUDNN version: {cudnn.version()}")
 
     logger.info("\n")
     logger.info("Configuration:")
diff --git a/examples/contrib/cifar10_qat/main.py b/examples/contrib/cifar10_qat/main.py
index ec585e8e85d8..364e4d4cdbeb 100644
--- a/examples/contrib/cifar10_qat/main.py
+++ b/examples/contrib/cifar10_qat/main.py
@@ -114,7 +114,11 @@ def run_validation(engine):
         Events.COMPLETED(lambda *_: trainer.state.epoch > config["num_epochs"] // 2), best_model_handler
     )
 
-    trainer.run(train_loader, max_epochs=config["num_epochs"])
+    try:
+        trainer.run(train_loader, max_epochs=config["num_epochs"])
+    except Exception as e:
+        logger.exception("")
+        raise e
 
     if rank == 0:
         tb_logger.close()
@@ -245,9 +245,13 @@ def log_basic_info(logger, config):
     logger.info(f"- PyTorch version: {torch.__version__}")
     logger.info(f"- Ignite version: {ignite.__version__}")
     if torch.cuda.is_available():
+        # explicitly import cudnn as
+        # torch.backends.cudnn can not be pickled with hvd spawning procs
+        from torch.backends import cudnn
+
         logger.info(f"- GPU Device: {torch.cuda.get_device_name(idist.get_local_rank())}")
         logger.info(f"- CUDA version: {torch.version.cuda}")
-        logger.info(f"- CUDNN version: {torch.backends.cudnn.version()}")
+        logger.info(f"- CUDNN version: {cudnn.version()}")
 
     logger.info("\n")
     logger.info("Configuration:")