diff --git a/.circleci/config.yml b/.circleci/config.yml
index b62b52c3dd78..6a9e1af11565 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -256,7 +256,7 @@ jobs:
             export example_path="examples/contrib/cifar10"
             # initial run
             export stop_cmd="--stop_iteration=500"
-            export test_cmd="CI=1 python ${example_path}/main.py run"
+            export test_cmd="CI=1 python ${example_path}/main.py run --checkpoint_every=200"
             docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}"
             # resume
             export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-None-1_stop-on-500/training_checkpoint_400.pt"
@@ -268,7 +268,7 @@ jobs:
             export example_path="examples/contrib/cifar10"
             # initial run
             export stop_cmd="--stop_iteration=500"
-            export test_cmd="CI=1 python -u -m torch.distributed.launch --nproc_per_node=2 --use_env ${example_path}/main.py run --backend=nccl"
+            export test_cmd="CI=1 python -u -m torch.distributed.launch --nproc_per_node=2 --use_env ${example_path}/main.py run --backend=nccl --checkpoint_every=200"
             docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}"
             # resume
             export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt"
@@ -280,7 +280,7 @@ jobs:
             export example_path="examples/contrib/cifar10"
             # initial run
             export stop_cmd="--stop_iteration=500"
-            export test_cmd="CI=1 python -u ${example_path}/main.py run --backend=nccl --nproc_per_node=2"
+            export test_cmd="CI=1 python -u ${example_path}/main.py run --backend=nccl --nproc_per_node=2 --checkpoint_every=200"
             docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}"
             # resume
             export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt"
@@ -334,7 +334,7 @@ jobs:
             export example_path="examples/contrib/cifar10"
             # initial run
             export stop_cmd="--stop_iteration=500"
-            export test_cmd="cd ${example_path} && CI=1 horovodrun -np 2 python -u main.py run --backend=horovod"
+            export test_cmd="cd ${example_path} && CI=1 horovodrun -np 2 python -u main.py run --backend=horovod --checkpoint_every=200"
             docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}"
             # resume
             export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-horovod-2_stop-on-500/training_checkpoint_400.pt"
@@ -346,7 +346,7 @@ jobs:
             export example_path="examples/contrib/cifar10"
             # initial run
             export stop_cmd="--stop_iteration=500"
-            export test_cmd="cd ${example_path} && CI=1 python -u main.py run --backend=horovod --nproc_per_node=2"
+            export test_cmd="cd ${example_path} && CI=1 python -u main.py run --backend=horovod --nproc_per_node=2 --checkpoint_every=200"
             docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}"
             # resume
             export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-horovod-2_stop-on-500/training_checkpoint_400.pt"
diff --git a/examples/contrib/cifar10/main.py b/examples/contrib/cifar10/main.py
index 4838f2e096be..c27c80906444 100644
--- a/examples/contrib/cifar10/main.py
+++ b/examples/contrib/cifar10/main.py
@@ -111,7 +111,7 @@ def run_validation(engine):
         n_saved=2,
         global_step_transform=global_step_from_engine(trainer),
         score_name="test_accuracy",
-        score_function=Checkpoint.get_default_score_fn("accuracy"),
+        score_function=Checkpoint.get_default_score_fn("Accuracy"),
     )
     evaluator.add_event_handler(
         Events.COMPLETED(lambda *_: trainer.state.epoch > config["num_epochs"] // 2), best_model_handler
@@ -173,7 +173,7 @@ def run(
        learning_rate (float): peak of piecewise linear learning rate scheduler. Default, 0.4.
        num_warmup_epochs (int): number of warm-up epochs before learning rate decay. Default, 4.
        validate_every (int): run model's validation every ``validate_every`` epochs. Default, 3.
-        checkpoint_every (int): store training checkpoint every ``checkpoint_every`` iterations. Default, 200.
+        checkpoint_every (int): store training checkpoint every ``checkpoint_every`` iterations. Default, 1000.
        backend (str, optional): backend to use for distributed configuration. Possible values: None,
            "nccl", "xla-tpu", "gloo" etc. Default, None.
        nproc_per_node (int, optional): optional argument to setup number of processes per node. It is useful,
@@ -258,9 +258,13 @@ def log_basic_info(logger, config):
     logger.info(f"- PyTorch version: {torch.__version__}")
     logger.info(f"- Ignite version: {ignite.__version__}")
     if torch.cuda.is_available():
+        # explicitly import cudnn as
+        # torch.backends.cudnn can not be pickled with hvd spawning procs
+        from torch.backends import cudnn
+
         logger.info(f"- GPU Device: {torch.cuda.get_device_name(idist.get_local_rank())}")
         logger.info(f"- CUDA version: {torch.version.cuda}")
-        logger.info(f"- CUDNN version: {torch.backends.cudnn.version()}")
+        logger.info(f"- CUDNN version: {cudnn.version()}")

     logger.info("\n")
     logger.info("Configuration:")
diff --git a/examples/contrib/cifar10_qat/main.py b/examples/contrib/cifar10_qat/main.py
index bac448f846e6..364e4d4cdbeb 100644
--- a/examples/contrib/cifar10_qat/main.py
+++ b/examples/contrib/cifar10_qat/main.py
@@ -108,13 +108,17 @@ def run_validation(engine):
         n_saved=2,
         global_step_transform=global_step_from_engine(trainer),
         score_name="test_accuracy",
-        score_function=Checkpoint.get_default_score_fn("accuracy"),
+        score_function=Checkpoint.get_default_score_fn("Accuracy"),
     )
     evaluator.add_event_handler(
         Events.COMPLETED(lambda *_: trainer.state.epoch > config["num_epochs"] // 2), best_model_handler
     )

-    trainer.run(train_loader, max_epochs=config["num_epochs"])
+    try:
+        trainer.run(train_loader, max_epochs=config["num_epochs"])
+    except Exception as e:
+        logger.exception("")
+        raise e

     if rank == 0:
         tb_logger.close()
@@ -241,9 +245,13 @@ def log_basic_info(logger, config):
     logger.info(f"- PyTorch version: {torch.__version__}")
     logger.info(f"- Ignite version: {ignite.__version__}")
     if torch.cuda.is_available():
+        # explicitly import cudnn as
+        # torch.backends.cudnn can not be pickled with hvd spawning procs
+        from torch.backends import cudnn
+
         logger.info(f"- GPU Device: {torch.cuda.get_device_name(idist.get_local_rank())}")
         logger.info(f"- CUDA version: {torch.version.cuda}")
-        logger.info(f"- CUDNN version: {torch.backends.cudnn.version()}")
+        logger.info(f"- CUDNN version: {cudnn.version()}")

     logger.info("\n")
     logger.info("Configuration:")