[FR] Parallel helper tools #1014

Merged
merged 22 commits into from
Jun 9, 2020

Changes from all commits
56 changes: 54 additions & 2 deletions .circleci/config.yml
@@ -80,7 +80,7 @@ jobs:

# pytest on cuda
export test_cmd='sh tests/run_gpu_tests.sh'
docker exec -it pthd /bin/bash -c "$test_cmd"
docker exec -it pthd /bin/bash -c "${test_cmd}"

# MNIST tests

@@ -135,7 +135,58 @@ jobs:
name: Run 1 Node 2 GPUs Unit Tests
command: |
export test_cmd='sh tests/run_gpu_tests.sh 2'
docker exec -it pthd /bin/bash -c "$test_cmd"
docker exec -it pthd /bin/bash -c "${test_cmd}"

two_gpus_check_dist_cifar10_example:
<<: *two_gpus

working_directory: << pipeline.parameters.workingdir >>

steps:
- checkout
- <<: *pull_pytorch_stable_image
- <<: *run_pytorch_container
- <<: *install_dependencies
- run:
name: "Install additional example dependencies"
command: |
docker exec -it pthd pip install fire
- run:
name: "Run without backend"
command: |
export example_path="examples/contrib/new-cifar10"
# initial run
export stop_cmd="--stop_iteration=1000"
export test_cmd="CI=1 python ${example_path}/main.py run"
docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}"
# resume
export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-None-1_stop-on-1000/training_checkpoint_1000.pt"
docker exec -it pthd /bin/bash -c "${test_cmd} ${resume_opt}"

- run:
name: "Run with NCCL backend using torch dist launch"
command: |
export example_path="examples/contrib/new-cifar10"
# initial run
export stop_cmd="--stop_iteration=1000"
export test_cmd="CI=1 python -u -m torch.distributed.launch --nproc_per_node=2 --use_env ${example_path}/main.py run --backend=nccl"
docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}"
# resume
export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-1000/training_checkpoint_1000.pt"
docker exec -it pthd /bin/bash -c "${test_cmd} ${resume_opt}"

- run:
name: "Run with NCCL backend using spawn"
command: |
export example_path="examples/contrib/new-cifar10"
# initial run
export stop_cmd="--stop_iteration=1000"
export test_cmd="CI=1 python -u ${example_path}/main.py run --backend=nccl --num_procs_per_node=2"
docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}"
# resume
export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-1000/training_checkpoint_1000.pt"
docker exec -it pthd /bin/bash -c "${test_cmd} ${resume_opt}"


# -------------------------------------------------------------------------------------
# Workflows
@@ -146,3 +197,4 @@ workflows:
jobs:
- one_gpu_tests
- two_gpus_tests
- two_gpus_check_dist_cifar10_example
80 changes: 73 additions & 7 deletions docs/source/distributed.rst
@@ -7,20 +7,86 @@ Helper module to use distributed settings for multiple backends:

- XLA on TPUs via `pytorch/xla <https://github.com/pytorch/xla>`_

This module wraps common methods to fetch information about distributed configuration, initialize/finalize process
group or spawn multiple processes.
Distributed launcher and `auto` helpers
---------------------------------------

We provide a context manager to simplify the setup of the distributed configuration for all of the supported backends above.
In addition, methods like :meth:`~ignite.distributed.auto.auto_model`, :meth:`~ignite.distributed.auto.auto_optim` and
:meth:`~ignite.distributed.auto.auto_dataloader` transparently adapt the provided model, optimizer and data
loaders to the existing configuration:

.. code-block:: python

# main.py

import ignite.distributed as idist

def training(local_rank, config, **kwargs):

print(idist.get_rank(), ": run with config:", config, "- backend=", idist.backend())

train_loader = idist.auto_dataloader(dataset, batch_size=32, num_workers=12, shuffle=True, **kwargs)
# batch size, num_workers and sampler are automatically adapted to existing configuration
# ...
model = resnet50()
model = idist.auto_model(model)
# model is DDP or DP or just itself according to existing configuration
# ...
optimizer = optim.SGD(model.parameters(), lr=0.01)
optimizer = idist.auto_optim(optimizer)
# optimizer is returned unchanged, except for the XLA configuration where its `step()` method is overridden.
# User can safely call `optimizer.step()` (behind the scenes `xm.optimizer_step(optimizer)` is performed)


backend = "nccl" # torch native distributed configuration on multiple GPUs
# backend = "xla-tpu" # XLA TPUs distributed configuration
# backend = None # no distributed configuration
with idist.Parallel(backend=backend, **dist_configs) as parallel:
parallel.run(training, config, a=1, b=2)

The above code may be executed with the `torch.distributed.launch`_ tool or with plain Python, specifying the distributed
configuration in the code. For more details, please see :class:`~ignite.distributed.launcher.Parallel`,
:meth:`~ignite.distributed.auto.auto_model`, :meth:`~ignite.distributed.auto.auto_optim` and
:meth:`~ignite.distributed.auto.auto_dataloader`.
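
For instance, to spawn the processes from inside the code instead of using the launcher tool, the distributed configuration
can be passed directly to :class:`~ignite.distributed.launcher.Parallel`. The snippet below is a minimal sketch; the
``nproc_per_node`` keyword is an assumption based on this PR and may differ from the final API.

.. code-block:: python

    # minimal sketch of the in-code (spawn) variant
    # assumption: Parallel accepts an `nproc_per_node` keyword for spawning
    import ignite.distributed as idist

    def training(local_rank, config, **kwargs):
        print(idist.get_rank(), ": run with config:", config)

    config = {"lr": 0.01}

    # spawns 2 processes per node, each running `training` with the NCCL backend
    with idist.Parallel(backend="nccl", nproc_per_node=2) as parallel:
        parallel.run(training, config, a=1, b=2)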

A complete example of training on CIFAR10 can be found at: LINK


.. _torch.distributed.launch: https://pytorch.org/docs/stable/distributed.html#launch-utility

Examples:

- Example to spawn `nprocs` processes that run `fn` with `args`: :meth:`~ignite.distributed.spawn`
ignite.distributed.auto
-----------------------

.. currentmodule:: ignite.distributed.auto

.. automodule:: ignite.distributed.auto
:members:


ignite.distributed.launcher
---------------------------

.. currentmodule:: ignite.distributed.launcher

.. automodule:: ignite.distributed.launcher
:members:

.. currentmodule:: ignite.distributed

.. automodule:: ignite.distributed
ignite.distributed.utils
------------------------

This module wraps common methods to fetch information about the distributed configuration, initialize/finalize the process
group, or spawn multiple processes.
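
For example, the current configuration can be inspected with a few of the helpers documented below (a minimal sketch, using
only functions listed in this module):

.. code-block:: python

    import ignite.distributed as idist

    print("backend    :", idist.backend())         # e.g. "nccl", "xla-tpu" or None
    print("world size :", idist.get_world_size())
    print("rank       :", idist.get_rank())
    print("local rank :", idist.get_local_rank())
    print("device     :", idist.device())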

.. currentmodule:: ignite.distributed.utils

.. automodule:: ignite.distributed.utils
:members:
:imported-members:

.. attribute:: has_native_dist_support

True if `torch.distributed` is available

.. attribute:: has_xla_support

140 changes: 49 additions & 91 deletions examples/contrib/cifar10/README.md
@@ -1,49 +1,71 @@
# CIFAR10 Example with Ignite

In this example, we show how to use *Ignite* to train a neural network on 1 or more GPUs, save the best model weights,
log learning rate, training/validation metrics.
In this example, we show how to use *Ignite* to:
- train a neural network on 1 or more GPUs or TPUs
- compute training/validation metrics
- log the learning rate, metrics, etc.
- save the best model weights

Configurations:

* [x] single GPU
* [x] multi GPUs on a single node
* [x] multi GPUs on multiple nodes
* [x] TPUs on Colab

## Requirements:

- [torchvision](https://github.com/pytorch/vision/): `pip install torchvision`
- [tqdm](https://github.com/tqdm/tqdm/): `pip install tqdm`
- [tensorboardx](https://github.com/lanpa/tensorboard-pytorch): `pip install tensorboardX`
- [python-fire](https://github.com/google/python-fire): `pip install fire`
- [trains](https://github.com/allegroai/trains): `pip install trains`

## Usage:

Run the example on a single GPU (script will not run without a GPU):
Run the example on a single GPU:
```bash
python main.py
python main.py run
```

For details on accepted arguments:
```bash
python main.py run -- --help
```

If the user would like to provide an already downloaded dataset, its path can be set with
```bash
--params="data_path=/path/to/cifar10/"
--data_path="/path/to/cifar10/"
```

TODO: LINK on TRAINS SERVER

### Distributed training

#### Single node, multiple GPUs

Let's start training on a single node with 2 GPUs:
```bash
python -u -m torch.distributed.launch --nproc_per_node=2 main.py --params="dist_backend='nccl'"
# using torch.distributed.launch
python -u -m torch.distributed.launch --nproc_per_node=2 --use_env main.py run --backend="nccl"
```
or
```bash
# using function spawn inside the code
python -u main.py run --backend="nccl" --num_procs_per_node=2
```
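
For readers curious how the in-code spawn variant could be wired up, here is a hypothetical sketch of a fire-based entry
point using `idist.Parallel` (this is **not** the actual `main.py` of this example; the names and the `nproc_per_node`
keyword are illustrative assumptions):

```python
# hypothetical sketch: fire CLI dispatching to idist.Parallel
import fire
import ignite.distributed as idist


def training(local_rank, config):
    print(idist.get_rank(), ": training with", config)
    # dataloaders / model / optimizer would be built here with the idist.auto_* helpers


def run(backend=None, num_procs_per_node=None, **config):
    # assumption: Parallel accepts `nproc_per_node` for in-code spawning
    with idist.Parallel(backend=backend, nproc_per_node=num_procs_per_node) as parallel:
        parallel.run(training, config)


if __name__ == "__main__":
    fire.Fire({"run": run})
```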

If the user would like to provide an already downloaded dataset, its path can be set with
```bash
--params="data_path=/path/to/cifar10/;batch_size=512"
--data_path="/path/to/cifar10/"
```

![tb1](assets/tb_logger.png)
TODO: LINK on TRAINS SERVER


#### Colab, on 8 TPUs

TODO: LINK on Colab

#### Multiple nodes, multiple GPUs

@@ -55,8 +77,8 @@ python -u -m torch.distributed.launch \
--nnodes=2 \
--nproc_per_node=2 \
--node_rank=0 \
--master_addr=master --master_port=2222 \
main.py --params="dist_backend='nccl'"
--master_addr=master --master_port=2222 --use_env \
main.py run --backend="nccl"
```

2) Execute on worker node
@@ -66,108 +88,44 @@ python -u -m torch.distributed.launch \
--nproc_per_node=2 \
--node_rank=1 \
--master_addr=master --master_port=2222 \
main.py --params="dist_backend='nccl'"
main.py run --backend="nccl"
```

![tb2](assets/tb_logger_gcp.png)
TODO: LINK on TRAINS SERVER

## Reproduce trainings

- To reproduce trainings with [Polyaxon](https://polyaxon.com/), please see [plx_configs/README.md](plx_configs/README.md)
- To reproduce trainings on [GCP AI platform](https://cloud.google.com/ml-engine/docs/), please see [gcp_ai_platform](gcp_ai_platform/README.md).
### Check resume training

## Acknowledgements
#### Single GPU

In this repository we are using the code from
- [cifar10-fast repository](https://github.com/davidcpage/cifar10-fast)

Thanks to the authors for sharing their code!


## Check resume training

### Single GPU

Initial training with a crash at iteration 1000 (~10 epochs)
Initial training with a stop at iteration 1000 (~11 epochs)
```bash
python main.py --params="crash_iteration=1000"
# or same in deterministic mode
python main.py --params="crash_iteration=1000;deterministic=True;output_path=/tmp/output-cifar10/deterministic"
python main.py run --stop_iteration=1000
```

Resume from the latest checkpoint
```bash
python main.py --params="resume_from=/tmp/output-cifar10/XYZ-single-gpu/training_checkpoint_800.pt"
# or same in deterministic mode
python main.py --params="resume_from=/tmp/output-cifar10/deterministic/XYZ-single-gpu/training_checkpoint_800.pt;deterministic=True;output_path=/tmp/output-cifar10/deterministic"
```

Training without crashing
```bash
python main.py
# or same in deterministic mode
python main.py --params="deterministic=True;output_path=/tmp/output-cifar10/deterministic"
python main.py run --resume-from=/tmp/output-cifar10/resnet18_backend-None-1_stop-on-1000/training_checkpoint_1000.pt
```

Non-deterministic| Deterministic
---|---
![img11](assets/tb_logger_run_resume_ndet.png) | ![img12](assets/tb_logger_run_resume_det.png)

**Note 1:** We can observe a gap in the `train/batch_loss` curves between the initial training and the resumed training. This metric is
computed as a running average, and the cumulative part is not restored when the training is resumed.
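
To make the effect concrete, here is a toy sketch (not the actual Ignite metric code) of why a running average jumps when
its accumulated state is not restored:

```python
# toy running average: the smoothed value depends on accumulated history,
# so restarting without restoring `avg` makes the curve jump back to the raw value
def running_average(values, alpha=0.98, avg=None):
    out = []
    for v in values:
        avg = v if avg is None else alpha * avg + (1 - alpha) * v
        out.append(avg)
    return out

losses = [2.0 - 0.001 * i for i in range(1500)]
full = running_average(losses)            # uninterrupted training
resumed = running_average(losses[1000:])  # resumed at iteration 1000, state lost
print(full[1000], resumed[0])             # the resumed curve restarts at the raw loss
```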


Relative performance comparison
![img13](assets/tb_logger_det_vs_ndet.png)

**Note 2:** Please keep in mind that `torch.backends.cudnn.deterministic=True` and
`torch.backends.cudnn.benchmark=False`, used in [`DeterministicEngine`](https://pytorch.org/ignite/engine.html#ignite.engine.deterministic.DeterministicEngine),
have a negative impact on single-run performance (see [official torch notes](https://pytorch.org/docs/stable/notes/randomness.html#cudnn)).
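
For reference, the manual equivalent of these two flags is shown below; as the note says, `DeterministicEngine` applies such
settings for you, so this is only to make the trade-off explicit:

```python
# cuDNN flags named in the note above
import torch

torch.backends.cudnn.deterministic = True   # force deterministic cuDNN algorithms
torch.backends.cudnn.benchmark = False      # disable cuDNN algorithm auto-tuning
```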
TODO: LINK on TRAINS SERVER

### Distributed training

### Single Node, multiple GPUs
#### Single node, multiple GPUs

Initial training with a crash at iteration 1000 (~10 epochs)
Initial training on a single node with 2 GPUs, with a stop at iteration 1000 (~11 epochs):
```bash
python -u -m torch.distributed.launch --nproc_per_node=2 main.py --params="dist_backend='nccl';crash_iteration=1000"
# or same in deterministic mode
python -u -m torch.distributed.launch --nproc_per_node=2 main.py --params="dist_backend='nccl';crash_iteration=1000;deterministic=True;output_path=/tmp/output-cifar10/deterministic"
# using torch.distributed.launch
python -u -m torch.distributed.launch --nproc_per_node=2 --use_env main.py run --backend="nccl" --stop_iteration=1000
```

Resume from the latest checkpoint
```bash
python -u -m torch.distributed.launch --nproc_per_node=2 main.py --params="dist_backend='nccl';resume_from=/tmp/output-cifar10/XYZ-distributed-1nodes-2gpus/training_checkpoint_800.pt"
# or same in deterministic mode
python -u -m torch.distributed.launch --nproc_per_node=2 main.py --params="dist_backend='nccl';resume_from=/tmp/output-cifar10/deterministic/XYZ-distributed-1nodes-2gpus/training_checkpoint_800.pt;deterministic=True;output_path=/tmp/output-cifar10/deterministic"
python -u -m torch.distributed.launch --nproc_per_node=2 --use_env main.py run --backend="nccl" \
--resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-1000/training_checkpoint_1000.pt
```

Training without crashing
```bash
python -u -m torch.distributed.launch --nproc_per_node=2 main.py --params="dist_backend='nccl'"
# or same in deterministic mode
python -u -m torch.distributed.launch --nproc_per_node=2 main.py --params="dist_backend='nccl';deterministic=True;output_path=/tmp/output-cifar10/deterministic"
```

<!---
Non-deterministic| Deterministic
---|---
![img21](assets/tb_logger_2x_run_resume_ndet.png) | ![img22](assets/tb_logger_2x_run_resume_det.png)

Relative performance comparison
![img23](assets/tb_logger_2x_det_vs_ndet.png)

![tbresume](assets/tb_logger_resume1.png)

- Orange curves represent the training with a crash at the iteration 1000
- Blue curves show resumed training from the last checkpoint (iteration 800)
- Red curves display complete training without crashing


**Note 2:** As we are resuming the training from an iteration in the middle of an epoch, even though Ignite's engine handles the dataflow by
correctly providing data samples for the resumed iteration, random data augmentations are not synchronized. This causes a gap
in the curves (`train/loss`, `train/accuracy` etc.) at the beginning of the resumed training.

![tb-resume](assets/tb_logger_resume2.png)

-->
TODO: LINK on TRAINS SERVER

Similar commands can be adapted for other cases.
Binary file removed examples/contrib/cifar10/assets/tb_logger.png
Binary file removed examples/contrib/cifar10/assets/tb_logger_gcp.png