diff --git a/ignite/distributed/auto.py b/ignite/distributed/auto.py
index 1d6a585b0e63..e92bec43a764 100644
--- a/ignite/distributed/auto.py
+++ b/ignite/distributed/auto.py
@@ -188,23 +188,23 @@ def auto_model(model: nn.Module, sync_bn: bool = False, **kwargs: Any) -> nn.Mod
     # distributed data parallel model
     if idist.get_world_size() > 1:
         bnd = idist.backend()
-        if idist.has_native_dist_support and bnd == idist_native.NCCL:
+        if idist.has_native_dist_support and bnd in (idist_native.NCCL, idist_native.GLOO, idist_native.MPI):
             if sync_bn:
                 logger.info("Convert batch norm to sync batch norm")
                 model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
 
-            if "device_ids" in kwargs:
-                raise ValueError(f"Argument kwargs should not contain 'device_ids', but got {kwargs}")
+            if torch.cuda.is_available():
+                if "device_ids" in kwargs:
+                    raise ValueError(f"Argument kwargs should not contain 'device_ids', but got {kwargs}")
 
-            lrank = idist.get_local_rank()
-            logger.info(f"Apply torch DistributedDataParallel on model, device id: {lrank}")
-            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[lrank,], **kwargs)
-        elif idist.has_native_dist_support and bnd == idist_native.GLOO:
-            if sync_bn:
-                logger.info("Convert batch norm to sync batch norm")
-                model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
+                lrank = idist.get_local_rank()
+                logger.info(f"Apply torch DistributedDataParallel on model, device id: {lrank}")
+                kwargs["device_ids"] = [
+                    lrank,
+                ]
+            else:
+                logger.info("Apply torch DistributedDataParallel on model")
 
-            logger.info("Apply torch DistributedDataParallel on model")
             model = torch.nn.parallel.DistributedDataParallel(model, **kwargs)
     elif idist.has_hvd_support and bnd == idist_hvd.HOROVOD:
         import horovod.torch as hvd
diff --git a/ignite/distributed/comp_models/native.py b/ignite/distributed/comp_models/native.py
index c723f4184347..d5bde07329cf 100644
--- a/ignite/distributed/comp_models/native.py
+++ b/ignite/distributed/comp_models/native.py
@@ -31,7 +31,7 @@ class _NativeDistModel(ComputationModel):
     In this implementation we assume the following mapping between backend and devices:
 
     - NCCL <-> GPU
-    - GLOO <-> CPU
+    - GLOO <-> CPU or GPU
     - MPI <-> CPU
 
@@ -127,7 +127,7 @@ def _create_from_backend(
         # https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
         dist.barrier()
 
-        if backend == dist.Backend.NCCL:
+        if torch.cuda.is_available():
            torch.cuda.set_device(self._local_rank)
 
         self._setup_attrs()
@@ -140,7 +140,7 @@ def _init_from_context(self) -> None:
     def _compute_nproc_per_node(self) -> int:
         local_rank = self.get_local_rank()
         device = torch.device("cpu")
-        if self.backend() == dist.Backend.NCCL:
+        if torch.cuda.is_available():
             # we manually set cuda device to local rank in order to avoid a hang on all_reduce
             device = torch.device(f"cuda:{local_rank}")
         tensor = torch.tensor([self.get_local_rank() + 1]).to(device)
@@ -151,7 +151,7 @@ def _get_all_hostnames(self) -> List[Tuple[str, ...]]:
         import socket
 
         device = "cpu"
-        if self.backend() == dist.Backend.NCCL:
+        if torch.cuda.is_available():
             index = torch.cuda.current_device()
             device = f"cuda:{index}"
         hostname = socket.gethostname()
@@ -281,7 +281,7 @@ def get_node_rank(self) -> int:
         return cast(int, self._node)
 
     def device(self) -> torch.device:
-        if self.backend() == dist.Backend.NCCL:
+        if torch.cuda.is_available():
             index = torch.cuda.current_device()
             if index < self.get_local_rank():
                 warnings.warn(
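Note on the two hunks above: `auto_model` now treats NCCL, GLOO and MPI uniformly and chooses between a CUDA and a CPU `DistributedDataParallel` purely from `torch.cuda.is_available()`, and `_NativeDistModel` applies the same hardware-based test everywhere it previously checked for the NCCL backend. A minimal usage sketch of what this enables (gloo on GPU), assuming it runs inside an initialized process group, e.g. via `idist.Parallel`; the two-layer model is illustrative only:

```python
import torch.nn as nn
import ignite.distributed as idist

def training(local_rank, config):
    # Same call for gloo-on-CPU, gloo-on-GPU and nccl: auto_model injects
    # device_ids=[local_rank] into the DDP kwargs only when CUDA is available.
    model = idist.auto_model(nn.Sequential(nn.Linear(10, 10), nn.Linear(10, 2)))

with idist.Parallel(backend="gloo", nproc_per_node=2) as parallel:
    parallel.run(training, {})
```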
diff --git a/tests/ignite/conftest.py b/tests/ignite/conftest.py
index a909d1c695e6..ce52495c32c5 100644
--- a/tests/ignite/conftest.py
+++ b/tests/ignite/conftest.py
@@ -94,7 +94,7 @@ def _create_dist_context(dist_info, lrank):
     dist.init_process_group(**dist_info)
     dist.barrier()
 
-    if dist_info["backend"] == "nccl":
+    if torch.cuda.is_available():
         torch.cuda.set_device(lrank)
 
     return {"local_rank": lrank, "world_size": dist_info["world_size"], "rank": dist_info["rank"]}
@@ -150,8 +150,6 @@ def distributed_context_single_node_nccl(local_rank, world_size):
 
     free_port = _setup_free_port(local_rank)
 
-    print(local_rank, "Port:", free_port)
-
     dist_info = {
         "backend": "nccl",
         "world_size": world_size,
@@ -174,7 +172,6 @@ def distributed_context_single_node_gloo(local_rank, world_size):
         init_method = f'file:///{temp_file.name.replace(backslash, "/")}'
     else:
         free_port = _setup_free_port(local_rank)
-        print(local_rank, "Port:", free_port)
         init_method = f"tcp://localhost:{free_port}"
         temp_file = None
@@ -213,7 +210,7 @@ def _create_mnodes_dist_context(dist_info, mnodes_conf):
     dist.init_process_group(**dist_info)
     dist.barrier()
 
-    if dist_info["backend"] == "nccl":
+    if torch.cuda.is_available():
         torch.cuda.device(mnodes_conf["local_rank"])
 
     return mnodes_conf
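The fixture change above encodes the rule the rest of the patch relies on: pin each worker to its own GPU whenever CUDA is present, instead of keying on `backend == "nccl"`. A minimal standalone sketch of the same initialization order (hypothetical `init_worker` helper; assumes `MASTER_ADDR`/`MASTER_PORT` are set in the environment):

```python
import torch
import torch.distributed as dist

def init_worker(backend: str, local_rank: int, rank: int, world_size: int) -> None:
    # Works for "gloo" on CPU, "gloo" on GPU and "nccl" alike.
    dist.init_process_group(backend, world_size=world_size, rank=rank)
    dist.barrier()
    # Hardware-based, not backend-based: without this, every gloo-on-GPU
    # worker would default to allocating on cuda:0.
    if torch.cuda.is_available():
        torch.cuda.set_device(local_rank)
```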
distributed_context_multi_node_gloo["rank"] _test_setup_common_training_handlers(dirname, device, rank=rank) test_add_early_stopping_by_val_score() @@ -616,9 +619,10 @@ def test_multinode_distrib_cpu(dirname, distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(dirname, distributed_context_multi_node_nccl): +def test_multinode_distrib_nccl_gpu(dirname, distributed_context_multi_node_nccl): + local_rank = distributed_context_multi_node_nccl["local_rank"] rank = distributed_context_multi_node_nccl["rank"] - device = f"cuda:{local_rank}" + device = idist.device() _test_setup_common_training_handlers(dirname, device, rank=rank, local_rank=local_rank, distributed=True) test_add_early_stopping_by_val_score() diff --git a/tests/ignite/contrib/handlers/test_clearml_logger.py b/tests/ignite/contrib/handlers/test_clearml_logger.py index 0d2f9e7c8fcd..9ac30f7cda56 100644 --- a/tests/ignite/contrib/handlers/test_clearml_logger.py +++ b/tests/ignite/contrib/handlers/test_clearml_logger.py @@ -888,18 +888,21 @@ def update_fn(engine, batch): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): - _test_save_model_optimizer_lr_scheduler_with_state_dict("cpu") - _test_save_model_optimizer_lr_scheduler_with_state_dict("cpu", on_zero_rank=True) +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): + + device = idist.device() + _test_save_model_optimizer_lr_scheduler_with_state_dict(device) + _test_save_model_optimizer_lr_scheduler_with_state_dict(device, on_zero_rank=True) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(distributed_context_single_node_nccl): +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + device = idist.device() _test_save_model_optimizer_lr_scheduler_with_state_dict(device) - _test_save_model_optimizer_lr_scheduler_with_state_dict("cpu", on_zero_rank=True) + _test_save_model_optimizer_lr_scheduler_with_state_dict(device, on_zero_rank=True) @pytest.mark.tpu diff --git a/tests/ignite/contrib/handlers/test_lr_finder.py b/tests/ignite/contrib/handlers/test_lr_finder.py index 8ffd2536901d..c0ac998bb4af 100644 --- a/tests/ignite/contrib/handlers/test_lr_finder.py +++ b/tests/ignite/contrib/handlers/test_lr_finder.py @@ -539,8 +539,9 @@ def forward(self, x): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): - device = torch.device("cpu") +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): + + device = idist.device() _test_distrib_log_lr_and_loss(device) _test_distrib_integration_mnist(device) @@ -548,8 +549,9 @@ def test_distrib_cpu(distributed_context_single_node_gloo): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(distributed_context_single_node_nccl): - device = 
torch.device(f"cuda:{distributed_context_single_node_nccl['local_rank']}") +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + device = idist.device() _test_distrib_log_lr_and_loss(device) _test_distrib_integration_mnist(device) diff --git a/tests/ignite/contrib/handlers/test_neptune_logger.py b/tests/ignite/contrib/handlers/test_neptune_logger.py index b26e1e8546ca..c50ac64bcfda 100644 --- a/tests/ignite/contrib/handlers/test_neptune_logger.py +++ b/tests/ignite/contrib/handlers/test_neptune_logger.py @@ -516,13 +516,16 @@ def test_no_neptune_client(no_site_packages): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): - _test_neptune_saver_integration("cpu") +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): + + device = idist.device() + _test_neptune_saver_integration(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(distributed_context_single_node_nccl): +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + device = idist.device() _test_neptune_saver_integration(device) diff --git a/tests/ignite/contrib/metrics/regression/test_canberra_metric.py b/tests/ignite/contrib/metrics/regression/test_canberra_metric.py index 677e1c274313..b30a9c402e68 100644 --- a/tests/ignite/contrib/metrics/regression/test_canberra_metric.py +++ b/tests/ignite/contrib/metrics/regression/test_canberra_metric.py @@ -184,17 +184,18 @@ def update(engine, i): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(distributed_context_single_node_nccl): - device = torch.device(f"cuda:{distributed_context_single_node_nccl['local_rank']}") +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = torch.device("cpu") + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -214,8 +215,9 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -223,8 +225,9 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def 
diff --git a/tests/ignite/contrib/handlers/test_lr_finder.py b/tests/ignite/contrib/handlers/test_lr_finder.py
index 8ffd2536901d..c0ac998bb4af 100644
--- a/tests/ignite/contrib/handlers/test_lr_finder.py
+++ b/tests/ignite/contrib/handlers/test_lr_finder.py
@@ -539,8 +539,9 @@ def forward(self, x):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
-def test_distrib_cpu(distributed_context_single_node_gloo):
-    device = torch.device("cpu")
+def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
+
+    device = idist.device()
     _test_distrib_log_lr_and_loss(device)
     _test_distrib_integration_mnist(device)
 
@@ -548,8 +549,9 @@ def test_distrib_cpu(distributed_context_single_node_gloo):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
-def test_distrib_gpu(distributed_context_single_node_nccl):
-    device = torch.device(f"cuda:{distributed_context_single_node_nccl['local_rank']}")
+def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
+
+    device = idist.device()
     _test_distrib_log_lr_and_loss(device)
     _test_distrib_integration_mnist(device)
diff --git a/tests/ignite/contrib/handlers/test_neptune_logger.py b/tests/ignite/contrib/handlers/test_neptune_logger.py
index b26e1e8546ca..c50ac64bcfda 100644
--- a/tests/ignite/contrib/handlers/test_neptune_logger.py
+++ b/tests/ignite/contrib/handlers/test_neptune_logger.py
@@ -516,13 +516,16 @@ def test_no_neptune_client(no_site_packages):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
-def test_distrib_cpu(distributed_context_single_node_gloo):
-    _test_neptune_saver_integration("cpu")
+def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
+
+    device = idist.device()
+    _test_neptune_saver_integration(device)
 
 
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
-def test_distrib_gpu(distributed_context_single_node_nccl):
+def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
+
     device = idist.device()
     _test_neptune_saver_integration(device)
diff --git a/tests/ignite/contrib/metrics/regression/test_canberra_metric.py b/tests/ignite/contrib/metrics/regression/test_canberra_metric.py
index 677e1c274313..b30a9c402e68 100644
--- a/tests/ignite/contrib/metrics/regression/test_canberra_metric.py
+++ b/tests/ignite/contrib/metrics/regression/test_canberra_metric.py
@@ -184,17 +184,18 @@ def update(engine, i):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
-def test_distrib_gpu(distributed_context_single_node_nccl):
-    device = torch.device(f"cuda:{distributed_context_single_node_nccl['local_rank']}")
+def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
+
+    device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
 
 
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
-def test_distrib_cpu(distributed_context_single_node_gloo):
+def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 
-    device = torch.device("cpu")
+    device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
 
@@ -214,8 +215,9 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.multinode_distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
-def test_multinode_distrib_cpu(distributed_context_multi_node_gloo):
-    device = torch.device("cpu")
+def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
+
+    device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
 
@@ -223,8 +225,9 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo):
 @pytest.mark.multinode_distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
-def test_multinode_distrib_gpu(distributed_context_multi_node_nccl):
-    device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}")
+def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
+
+    device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
torch.device("cpu" if not torch.cuda.is_available() else "cuda") _test_distrib_compute(device) _test_distrib_integration(device) @@ -233,8 +234,9 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) diff --git a/tests/ignite/contrib/metrics/regression/test_manhattan_distance.py b/tests/ignite/contrib/metrics/regression/test_manhattan_distance.py index ce079eaaae4c..2ed6726db4a6 100644 --- a/tests/ignite/contrib/metrics/regression/test_manhattan_distance.py +++ b/tests/ignite/contrib/metrics/regression/test_manhattan_distance.py @@ -185,17 +185,18 @@ def update(engine, i): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(distributed_context_single_node_nccl): - device = torch.device(f"cuda:{distributed_context_single_node_nccl['local_rank']}") +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = torch.device("cpu") + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -215,8 +216,9 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -224,8 +226,9 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) diff --git a/tests/ignite/contrib/metrics/regression/test_maximum_absolute_error.py b/tests/ignite/contrib/metrics/regression/test_maximum_absolute_error.py index 20c44d862148..828a9dcc1ee4 100644 --- a/tests/ignite/contrib/metrics/regression/test_maximum_absolute_error.py +++ 
diff --git a/tests/ignite/contrib/metrics/regression/test_fractional_bias.py b/tests/ignite/contrib/metrics/regression/test_fractional_bias.py
index b3b32c504a32..7079b2baa438 100644
--- a/tests/ignite/contrib/metrics/regression/test_fractional_bias.py
+++ b/tests/ignite/contrib/metrics/regression/test_fractional_bias.py
@@ -197,17 +197,18 @@ def update(engine, i):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
-def test_distrib_gpu(distributed_context_single_node_nccl):
-    device = torch.device(f"cuda:{distributed_context_single_node_nccl['local_rank']}")
+def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
+
+    device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
 
 
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
-def test_distrib_cpu(distributed_context_single_node_gloo):
+def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 
-    device = torch.device("cpu")
+    device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
 
@@ -227,8 +228,9 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.multinode_distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
-def test_multinode_distrib_cpu(distributed_context_multi_node_gloo):
-    device = torch.device("cpu")
+def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
+
+    device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
 
@@ -236,8 +238,9 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo):
 @pytest.mark.multinode_distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
-def test_multinode_distrib_gpu(distributed_context_multi_node_nccl):
-    device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}")
+def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
+
+    device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
diff --git a/tests/ignite/contrib/metrics/regression/test_geometric_mean_absolute_error.py b/tests/ignite/contrib/metrics/regression/test_geometric_mean_absolute_error.py
index 299098697df3..eae47c7fa71a 100644
--- a/tests/ignite/contrib/metrics/regression/test_geometric_mean_absolute_error.py
+++ b/tests/ignite/contrib/metrics/regression/test_geometric_mean_absolute_error.py
@@ -194,17 +194,18 @@ def update(engine, i):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
-def test_distrib_gpu(distributed_context_single_node_nccl):
-    device = torch.device(f"cuda:{distributed_context_single_node_nccl['local_rank']}")
+def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
+
+    device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
 
 
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
-def test_distrib_cpu(distributed_context_single_node_gloo):
+def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 
-    device = torch.device("cpu")
+    device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
 
@@ -224,8 +225,8 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.multinode_distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
-def test_multinode_distrib_cpu(distributed_context_multi_node_gloo):
-    device = torch.device("cpu")
+def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
+    device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
     _test_distrib_compute(device)
     _test_distrib_integration(device)
 
@@ -233,8 +234,9 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo):
 @pytest.mark.multinode_distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
-def test_multinode_distrib_gpu(distributed_context_multi_node_nccl):
-    device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}")
+def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
+
+    device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
diff --git a/tests/ignite/contrib/metrics/regression/test_manhattan_distance.py b/tests/ignite/contrib/metrics/regression/test_manhattan_distance.py
index ce079eaaae4c..2ed6726db4a6 100644
--- a/tests/ignite/contrib/metrics/regression/test_manhattan_distance.py
+++ b/tests/ignite/contrib/metrics/regression/test_manhattan_distance.py
@@ -185,17 +185,18 @@ def update(engine, i):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
-def test_distrib_gpu(distributed_context_single_node_nccl):
-    device = torch.device(f"cuda:{distributed_context_single_node_nccl['local_rank']}")
+def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
+
+    device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
 
 
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
-def test_distrib_cpu(distributed_context_single_node_gloo):
+def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 
-    device = torch.device("cpu")
+    device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
 
@@ -215,8 +216,9 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.multinode_distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
-def test_multinode_distrib_cpu(distributed_context_multi_node_gloo):
-    device = torch.device("cpu")
+def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
+
+    device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
 
@@ -224,8 +226,9 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo):
 @pytest.mark.multinode_distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
-def test_multinode_distrib_gpu(distributed_context_multi_node_nccl):
-    device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}")
+def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
+
+    device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
diff --git a/tests/ignite/contrib/metrics/regression/test_maximum_absolute_error.py b/tests/ignite/contrib/metrics/regression/test_maximum_absolute_error.py
index 20c44d862148..828a9dcc1ee4 100644
--- a/tests/ignite/contrib/metrics/regression/test_maximum_absolute_error.py
+++ b/tests/ignite/contrib/metrics/regression/test_maximum_absolute_error.py
@@ -181,17 +181,18 @@ def update(engine, i):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
-def test_distrib_gpu(distributed_context_single_node_nccl):
-    device = torch.device(f"cuda:{distributed_context_single_node_nccl['local_rank']}")
+def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
+
+    device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
 
 
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
-def test_distrib_cpu(distributed_context_single_node_gloo):
+def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 
-    device = torch.device("cpu")
+    device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
 
@@ -211,8 +212,9 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.multinode_distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
-def test_multinode_distrib_cpu(distributed_context_multi_node_gloo):
-    device = torch.device("cpu")
+def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
+
+    device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
 
@@ -220,8 +222,9 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo):
 @pytest.mark.multinode_distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
-def test_multinode_distrib_gpu(distributed_context_multi_node_nccl):
-    device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}")
+def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
+
+    device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
diff --git a/tests/ignite/contrib/metrics/regression/test_mean_absolute_relative_error.py b/tests/ignite/contrib/metrics/regression/test_mean_absolute_relative_error.py
index 4841139e712f..90e8baceb497 100644
--- a/tests/ignite/contrib/metrics/regression/test_mean_absolute_relative_error.py
+++ b/tests/ignite/contrib/metrics/regression/test_mean_absolute_relative_error.py
@@ -205,17 +205,18 @@ def update(engine, i):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
-def test_distrib_gpu(distributed_context_single_node_nccl):
-    device = torch.device(f"cuda:{distributed_context_single_node_nccl['local_rank']}")
+def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
+
+    device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
 
 
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
-def test_distrib_cpu(distributed_context_single_node_gloo):
+def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 
-    device = torch.device("cpu")
+    device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
 
@@ -235,8 +236,9 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.multinode_distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
-def test_multinode_distrib_cpu(distributed_context_multi_node_gloo):
-    device = torch.device("cpu")
+def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
+
+    device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
 
@@ -244,8 +246,9 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo):
 @pytest.mark.multinode_distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
-def test_multinode_distrib_gpu(distributed_context_multi_node_nccl):
-    device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}")
+def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
+
+    device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
diff --git a/tests/ignite/contrib/metrics/regression/test_mean_normalized_bias.py b/tests/ignite/contrib/metrics/regression/test_mean_normalized_bias.py
index 41d7a46c7725..b9036287ff1c 100644
--- a/tests/ignite/contrib/metrics/regression/test_mean_normalized_bias.py
+++ b/tests/ignite/contrib/metrics/regression/test_mean_normalized_bias.py
@@ -199,17 +199,18 @@ def update(engine, i):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
-def test_distrib_gpu(distributed_context_single_node_nccl):
-    device = torch.device(f"cuda:{distributed_context_single_node_nccl['local_rank']}")
+def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
+
+    device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
 
 
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
-def test_distrib_cpu(distributed_context_single_node_gloo):
+def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 
-    device = torch.device("cpu")
+    device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
 
@@ -229,8 +230,9 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.multinode_distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
-def test_multinode_distrib_cpu(distributed_context_multi_node_gloo):
-    device = torch.device("cpu")
+def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
+
+    device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
 
@@ -238,8 +240,9 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo):
 @pytest.mark.multinode_distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
-def test_multinode_distrib_gpu(distributed_context_multi_node_nccl):
-    device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}")
+def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
+
+    device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
diff --git a/tests/ignite/contrib/metrics/regression/test_median_absolute_error.py b/tests/ignite/contrib/metrics/regression/test_median_absolute_error.py
index a77c1de06bdd..ebe063293626 100644
--- a/tests/ignite/contrib/metrics/regression/test_median_absolute_error.py
+++ b/tests/ignite/contrib/metrics/regression/test_median_absolute_error.py
@@ -199,17 +199,18 @@ def update(engine, i):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
-def test_distrib_gpu(distributed_context_single_node_nccl):
-    device = torch.device(f"cuda:{distributed_context_single_node_nccl['local_rank']}")
+def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
+
+    device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
 
 
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
-def test_distrib_cpu(distributed_context_single_node_gloo):
+def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 
-    device = torch.device("cpu")
+    device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
 
@@ -229,8 +230,9 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.multinode_distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
-def test_multinode_distrib_cpu(distributed_context_multi_node_gloo):
-    device = torch.device("cpu")
+def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
+
+    device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
 
@@ -238,8 +240,9 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo):
 @pytest.mark.multinode_distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
-def test_multinode_distrib_gpu(distributed_context_multi_node_nccl):
-    device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}")
+def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
+
+    device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(distributed_context_single_node_nccl): - device = torch.device(f"cuda:{distributed_context_single_node_nccl['local_rank']}") +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = torch.device("cpu") + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -196,8 +197,9 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -205,8 +207,9 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) diff --git a/tests/ignite/contrib/metrics/test_average_precision.py b/tests/ignite/contrib/metrics/test_average_precision.py index c1f2fcab1c55..7b7f55aaca0e 100644 --- a/tests/ignite/contrib/metrics/test_average_precision.py +++ b/tests/ignite/contrib/metrics/test_average_precision.py @@ -278,18 +278,18 @@ def update_fn(engine, i): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(distributed_context_single_node_nccl): +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = torch.device(f"cuda:{distributed_context_single_node_nccl['local_rank']}") + device = idist.device() _test_distrib_binary_and_multilabel_inputs(device) _test_distrib_integration_binary_input(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = torch.device("cpu") + device = idist.device() _test_distrib_binary_and_multilabel_inputs(device) _test_distrib_integration_binary_input(device) @@ -309,9 +309,9 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if 
not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") + device = idist.device() _test_distrib_binary_and_multilabel_inputs(device) _test_distrib_integration_binary_input(device) @@ -319,9 +319,9 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") + device = idist.device() _test_distrib_binary_and_multilabel_inputs(device) _test_distrib_integration_binary_input(device) diff --git a/tests/ignite/contrib/metrics/test_cohen_kappa.py b/tests/ignite/contrib/metrics/test_cohen_kappa.py index e237794f25fc..32f9bbf9a1f3 100644 --- a/tests/ignite/contrib/metrics/test_cohen_kappa.py +++ b/tests/ignite/contrib/metrics/test_cohen_kappa.py @@ -273,18 +273,18 @@ def update(engine, i): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(distributed_context_single_node_nccl): +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = torch.device(f"cuda:{distributed_context_single_node_nccl['local_rank']}") + device = idist.device() _test_distrib_binary_input(device) _test_distrib_integration_binary_input(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = torch.device("cpu") + device = idist.device() _test_distrib_binary_input(device) _test_distrib_integration_binary_input(device) @@ -308,9 +308,9 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") + device = idist.device() _test_distrib_binary_input(device) _test_distrib_integration_binary_input(device) @@ -318,9 +318,9 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") + device = idist.device() _test_distrib_binary_input(device) _test_distrib_integration_binary_input(device) diff --git a/tests/ignite/contrib/metrics/test_roc_auc.py 
diff --git a/tests/ignite/contrib/metrics/test_average_precision.py b/tests/ignite/contrib/metrics/test_average_precision.py
index c1f2fcab1c55..7b7f55aaca0e 100644
--- a/tests/ignite/contrib/metrics/test_average_precision.py
+++ b/tests/ignite/contrib/metrics/test_average_precision.py
@@ -278,18 +278,18 @@ def update_fn(engine, i):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
-def test_distrib_gpu(distributed_context_single_node_nccl):
+def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
 
-    device = torch.device(f"cuda:{distributed_context_single_node_nccl['local_rank']}")
+    device = idist.device()
     _test_distrib_binary_and_multilabel_inputs(device)
     _test_distrib_integration_binary_input(device)
 
 
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
-def test_distrib_cpu(distributed_context_single_node_gloo):
+def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 
-    device = torch.device("cpu")
+    device = idist.device()
     _test_distrib_binary_and_multilabel_inputs(device)
     _test_distrib_integration_binary_input(device)
 
@@ -309,9 +309,9 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.multinode_distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
-def test_multinode_distrib_cpu(distributed_context_multi_node_gloo):
+def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
 
-    device = torch.device("cpu")
+    device = idist.device()
     _test_distrib_binary_and_multilabel_inputs(device)
     _test_distrib_integration_binary_input(device)
 
@@ -319,9 +319,9 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo):
 @pytest.mark.multinode_distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
-def test_multinode_distrib_gpu(distributed_context_multi_node_nccl):
+def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
 
-    device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}")
+    device = idist.device()
     _test_distrib_binary_and_multilabel_inputs(device)
     _test_distrib_integration_binary_input(device)
diff --git a/tests/ignite/contrib/metrics/test_cohen_kappa.py b/tests/ignite/contrib/metrics/test_cohen_kappa.py
index e237794f25fc..32f9bbf9a1f3 100644
--- a/tests/ignite/contrib/metrics/test_cohen_kappa.py
+++ b/tests/ignite/contrib/metrics/test_cohen_kappa.py
@@ -273,18 +273,18 @@ def update(engine, i):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
-def test_distrib_gpu(distributed_context_single_node_nccl):
+def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
 
-    device = torch.device(f"cuda:{distributed_context_single_node_nccl['local_rank']}")
+    device = idist.device()
     _test_distrib_binary_input(device)
     _test_distrib_integration_binary_input(device)
 
 
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
-def test_distrib_cpu(distributed_context_single_node_gloo):
+def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 
-    device = torch.device("cpu")
+    device = idist.device()
     _test_distrib_binary_input(device)
     _test_distrib_integration_binary_input(device)
 
@@ -308,9 +308,9 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.multinode_distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
-def test_multinode_distrib_cpu(distributed_context_multi_node_gloo):
+def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
 
-    device = torch.device("cpu")
+    device = idist.device()
     _test_distrib_binary_input(device)
     _test_distrib_integration_binary_input(device)
 
@@ -318,9 +318,9 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo):
 @pytest.mark.multinode_distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
-def test_multinode_distrib_gpu(distributed_context_multi_node_nccl):
+def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
 
-    device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}")
+    device = idist.device()
     _test_distrib_binary_input(device)
     _test_distrib_integration_binary_input(device)
diff --git a/tests/ignite/contrib/metrics/test_roc_auc.py b/tests/ignite/contrib/metrics/test_roc_auc.py
index 957a97bb6b7c..aa34089cdbc2 100644
--- a/tests/ignite/contrib/metrics/test_roc_auc.py
+++ b/tests/ignite/contrib/metrics/test_roc_auc.py
@@ -291,18 +291,18 @@ def update_fn(engine, i):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
-def test_distrib_gpu(distributed_context_single_node_nccl):
+def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
 
-    device = torch.device(f"cuda:{distributed_context_single_node_nccl['local_rank']}")
+    device = idist.device()
     _test_distrib_binary_and_multilabel_inputs(device)
     _test_distrib_integration_binary_input(device)
 
 
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
-def test_distrib_cpu(distributed_context_single_node_gloo):
+def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 
-    device = torch.device("cpu")
+    device = idist.device()
     _test_distrib_binary_and_multilabel_inputs(device)
     _test_distrib_integration_binary_input(device)
 
@@ -322,9 +322,9 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.multinode_distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
-def test_multinode_distrib_cpu(distributed_context_multi_node_gloo):
+def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
 
-    device = torch.device("cpu")
+    device = idist.device()
     _test_distrib_binary_and_multilabel_inputs(device)
     _test_distrib_integration_binary_input(device)
 
@@ -332,9 +332,9 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo):
 @pytest.mark.multinode_distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
-def test_multinode_distrib_gpu(distributed_context_multi_node_nccl):
+def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
 
-    device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}")
+    device = idist.device()
     _test_distrib_binary_and_multilabel_inputs(device)
     _test_distrib_integration_binary_input(device)
diff --git a/tests/ignite/distributed/comp_models/test_base.py b/tests/ignite/distributed/comp_models/test_base.py
index b944ea61cf9f..6a7ca20d35dd 100644
--- a/tests/ignite/distributed/comp_models/test_base.py
+++ b/tests/ignite/distributed/comp_models/test_base.py
@@ -34,7 +34,7 @@ def test_serial_model():
 
 
 def test__encode_str__decode_str():
-    device = torch.device("cpu")
+    device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
     s = "test-abcedfg"
 
     encoded_s = ComputationModel._encode_str(s, device, 1024)
diff --git a/tests/ignite/distributed/comp_models/test_native.py b/tests/ignite/distributed/comp_models/test_native.py
index 61499f35c0d6..80cb480f5ab4 100644
--- a/tests/ignite/distributed/comp_models/test_native.py
+++ b/tests/ignite/distributed/comp_models/test_native.py
@@ -279,16 +279,18 @@ def _test__native_dist_model_create_from_context_dist(local_rank, rank, world_si
 @pytest.mark.distributed
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Should be no-dist config")
 def test__native_dist_model_create_no_dist_gloo(clean_env):
-    _test__native_dist_model_create_from_backend_no_dist("gloo", "cpu")
-    _test__native_dist_model_create_from_context_no_dist("gloo", "cpu")
+    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+    _test__native_dist_model_create_from_backend_no_dist("gloo", device)
+    _test__native_dist_model_create_from_context_no_dist("gloo", device)
 
 
 @pytest.mark.distributed
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Should be no-dist config")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test__native_dist_model_create_no_dist_nccl(clean_env):
-    _test__native_dist_model_create_from_backend_no_dist("nccl", "cuda:0")
-    _test__native_dist_model_create_from_context_no_dist("nccl", "cuda:0")
+    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+    _test__native_dist_model_create_from_backend_no_dist("nccl", device)
+    _test__native_dist_model_create_from_context_no_dist("nccl", device)
 
 
 @pytest.mark.distributed
@@ -297,12 +299,15 @@ def test__native_dist_model_create_dist_gloo_1(init_method, get_fixed_dirname, l
     if init_method == "FILE":
         init_method = f"file://{get_fixed_dirname('native_dist_model_create_dist_gloo_1')}/shared"
 
-    _test__native_dist_model_create_from_backend_dist(init_method, local_rank, local_rank, world_size, "gloo", "cpu")
+    device = torch.device(f"cuda:{local_rank}" if torch.cuda.is_available() else "cpu")
+    _test__native_dist_model_create_from_backend_dist(init_method, local_rank, local_rank, world_size, "gloo", device)
 
 
 @pytest.mark.distributed
 def test__native_dist_model_create_dist_gloo_2(local_rank, world_size):
-    _test__native_dist_model_create_from_context_dist(local_rank, local_rank, world_size, "gloo", "cpu")
+
+    device = torch.device(f"cuda:{local_rank}" if torch.cuda.is_available() else "cpu")
+    _test__native_dist_model_create_from_context_dist(local_rank, local_rank, world_size, "gloo", device)
 
 
 @pytest.mark.distributed
@@ -354,10 +359,7 @@ def _test_dist_spawn_fn(local_rank, backend, world_size, device):
     assert _model.get_local_rank() == local_rank
     assert _model.get_world_size() == world_size
 
-    if backend == "nccl":
-        assert _model.device() == torch.device(f"{device}:{local_rank}")
-    elif backend == "gloo":
-        assert _model.device() == torch.device(device)
+    assert _model.device().type == torch.device(device).type
 
 
 def _test__native_dist_model_spawn(backend, num_workers_per_machine, device, init_method=None, **spawn_kwargs):
@@ -379,10 +381,13 @@ def test__native_dist_model_spawn_gloo(init_method, dirname):
     if init_method == "FILE":
         init_method = f"file://{dirname}/shared"
 
-    _test__native_dist_model_spawn("gloo", num_workers_per_machine=4, device="cpu", init_method=init_method)
-    _test__native_dist_model_spawn(
-        "gloo", num_workers_per_machine=4, device="cpu", start_method="fork", init_method=init_method
-    )
+    nproc = torch.cuda.device_count() if torch.cuda.is_available() else 4
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    _test__native_dist_model_spawn("gloo", num_workers_per_machine=nproc, device=device, init_method=init_method)
+    if device.type == "cpu":
+        _test__native_dist_model_spawn(
+            "gloo", num_workers_per_machine=nproc, device=device, start_method="fork", init_method=init_method
+        )
 
 
 @pytest.mark.distributed
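A recurring idiom in the rewritten assertions above (and in `test_auto.py` below) is normalizing `str` and `torch.device` inputs through `torch.device(x).type`, so that `"cuda"`, `"cuda:1"` and `torch.device("cuda:0")` all compare equal at the device-type level. A tiny self-contained illustration (no GPU needed just to construct the device objects):

```python
import torch

# torch.device() accepts both strings and existing device objects;
# .type drops the index, which is exactly what the patched asserts compare.
for spec in ("cpu", "cuda", "cuda:1", torch.device("cuda:0")):
    print(repr(spec), "->", torch.device(spec).type)
# 'cpu' -> cpu, 'cuda' -> cuda, 'cuda:1' -> cuda, device(type='cuda', index=0) -> cuda
```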
diff --git a/tests/ignite/distributed/test_auto.py b/tests/ignite/distributed/test_auto.py
index 0a979c1de835..e77ea3ac6dc3 100644
--- a/tests/ignite/distributed/test_auto.py
+++ b/tests/ignite/distributed/test_auto.py
@@ -78,7 +78,7 @@ def _test_auto_dataloader(ws, nproc, batch_size, num_workers=1, sampler_name=Non
 def _test_auto_model(model, ws, device, sync_bn=False, **kwargs):
     model = auto_model(model, sync_bn=sync_bn, **kwargs)
     bnd = idist.backend()
-    if ws > 1 and device in ("cuda", "cpu"):
+    if ws > 1 and torch.device(device).type in ("cuda", "cpu"):
         if idist.has_native_dist_support and bnd in ("nccl", "gloo"):
             assert isinstance(model, nn.parallel.DistributedDataParallel)
             if sync_bn:
@@ -93,8 +93,8 @@ def _test_auto_model(model, ws, device, sync_bn=False, **kwargs):
         assert isinstance(model, nn.Module)
 
     assert all(
-        [p.device.type == device for p in model.parameters()]
-    ), f"{[p.device.type for p in model.parameters()]} vs {device}"
+        [p.device.type == torch.device(device).type for p in model.parameters()]
+    ), f"{[p.device.type for p in model.parameters()]} vs {torch.device(device).type}"
 
 
 def _test_auto_model_optimizer(ws, device):
@@ -103,7 +103,7 @@ def _test_auto_model_optimizer(ws, device):
     _test_auto_model(model, ws, device)
 
     model = nn.Sequential(nn.Linear(20, 100), nn.BatchNorm1d(100))
-    _test_auto_model(model, ws, device, sync_bn="cuda" in device)
+    _test_auto_model(model, ws, device, sync_bn="cuda" in torch.device(device).type)
     if ws > 1:
         _test_auto_model(model, ws, device, find_unused_parameters=True)
         _test_auto_model(model, ws, device, find_unused_parameters=False)
@@ -138,9 +138,10 @@ def test_auto_methods_gloo(distributed_context_single_node_gloo):
     _test_auto_dataloader(ws=ws, nproc=ws, batch_size=10, num_workers=2)
     _test_auto_dataloader(ws=ws, nproc=ws, batch_size=10, sampler_name="WeightedRandomSampler")
 
-    _test_auto_model_optimizer(ws, "cpu")
+    device = idist.device()
+    _test_auto_model_optimizer(ws, device)
 
-    if ws > 1:
+    if ws > 1 and device.type == "cpu":
         with pytest.raises(AssertionError, match=r"SyncBatchNorm layers only work with GPU modules"):
             model = nn.Sequential(nn.Linear(20, 100), nn.BatchNorm1d(100))
             auto_model(model, sync_bn=True)
@@ -156,7 +157,8 @@ def test_auto_methods_nccl(distributed_context_single_node_nccl):
     _test_auto_dataloader(ws=ws, nproc=ws, batch_size=10, num_workers=10)
     _test_auto_dataloader(ws=ws, nproc=ws, batch_size=1, sampler_name="WeightedRandomSampler")
 
-    _test_auto_model_optimizer(ws, "cuda")
+    device = idist.device()
+    _test_auto_model_optimizer(ws, device)
 
     if ws > 1:
         with pytest.raises(ValueError, match=r"Argument kwargs should not contain 'device_ids'"):
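`test_auto_methods_gloo` now expects the SyncBatchNorm failure only on CPU, because after this patch gloo-on-GPU is a legitimate DDP configuration. A hedged sketch of the behaviour the test encodes (runs standalone; outside a process group `auto_model` simply moves the model to `idist.device()`):

```python
import torch
import torch.nn as nn
import ignite.distributed as idist

model = nn.Sequential(nn.Linear(20, 100), nn.BatchNorm1d(100))
# sync_bn=True is only valid for GPU modules; on a CPU-only gloo group the
# test expects the "SyncBatchNorm layers only work with GPU modules" assertion.
wrapped = idist.auto_model(model, sync_bn=torch.cuda.is_available())
```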
_test_check_idist_parallel_spawn(exec_filepath, "gloo", np) @pytest.mark.distributed @@ -182,7 +184,7 @@ def _test_func(index, ws, device, backend, true_init_method): assert 0 <= index < ws assert index == idist.get_local_rank() assert ws == idist.get_world_size() - assert device in idist.device().type + assert torch.device(device).type == idist.device().type assert backend == idist.backend() if idist.model_name() == "native-dist": @@ -203,8 +205,8 @@ def test_idist_parallel_spawn_n_procs_native(init_method, backend, dirname): if init_method == "FILE": init_method = f"file://{dirname}/shared" - nproc_per_node = 4 if "gloo" == backend else torch.cuda.device_count() - device = "cpu" if "gloo" == backend else "cuda" + nproc_per_node = torch.cuda.device_count() if torch.cuda.is_available() else 4 + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") with idist.Parallel(backend=backend, nproc_per_node=nproc_per_node, init_method=init_method) as parallel: parallel.run(_test_func, ws=nproc_per_node, device=device, backend=backend, true_init_method=init_method) @@ -222,14 +224,14 @@ def test_idist_parallel_n_procs_native(init_method, backend, get_fixed_dirname, init_method = f"file://{get_fixed_dirname('idist_parallel_n_procs_native')}/shared" os.environ["RANK"] = str(local_rank) - device = "cuda" if "nccl" in backend else "cpu" + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") with idist.Parallel(backend=backend, init_method=init_method) as parallel: parallel.run(_test_func, ws=world_size, device=device, backend=backend, true_init_method=init_method) @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_idist_parallel_no_dist(): - device = "cuda" if torch.cuda.is_available() else "cpu" + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") with idist.Parallel(backend=None) as parallel: parallel.run(_test_func, ws=1, device=device, backend=None, true_init_method=None) diff --git a/tests/ignite/distributed/utils/__init__.py b/tests/ignite/distributed/utils/__init__.py index 7af11ac620bd..91f15958431b 100644 --- a/tests/ignite/distributed/utils/__init__.py +++ b/tests/ignite/distributed/utils/__init__.py @@ -21,11 +21,10 @@ def _test_distrib_config(local_rank, backend, ws, true_device, rank=None, true_i this_device = idist.device() assert isinstance(this_device, torch.device) - if backend in ("nccl", "horovod") and "cuda" in this_device.type: - true_device = torch.device(f"{true_device}:{local_rank}") - assert this_device == true_device, f"{this_device} vs {true_device}" + if backend in ("nccl", "gloo", "horovod") and "cuda" in this_device.type: + assert this_device.type == torch.device(true_device).type, f"{this_device} vs {true_device}" elif backend in ("gloo", "horovod"): - assert this_device == torch.device(true_device) + assert this_device.type == torch.device(true_device).type elif backend == "xla-tpu": assert true_device in this_device.type diff --git a/tests/ignite/distributed/utils/test_native.py b/tests/ignite/distributed/utils/test_native.py index 89a93ce9f08d..55ce5ebb7647 100644 --- a/tests/ignite/distributed/utils/test_native.py +++ b/tests/ignite/distributed/utils/test_native.py @@ -42,8 +42,9 @@ def test_native_distrib_single_node_launch_tool_gloo(init_method, get_fixed_dirn if init_method == "FILE": init_method = f"file://{get_fixed_dirname('native_distrib_single_node_launch_tool_gloo')}/shared" + device = torch.device(f"cuda:{local_rank}" if torch.cuda.is_available() else "cpu") 
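# The recurring pattern in the hunks above: the test device now follows CUDA
# availability instead of the backend name, so the same "gloo" test runs on
# CPU or on one GPU per process. A minimal standalone sketch of that selection
# (plain torch; the helper name is illustrative, not an ignite API):
import torch

def resolve_test_device(local_rank: int) -> torch.device:
    # One CUDA device per process when GPUs are visible, else CPU.
    return torch.device(f"cuda:{local_rank}" if torch.cuda.is_available() else "cpu")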
_test_native_distrib_single_node_launch_tool( - "gloo", "cpu", local_rank, world_size, timeout=timeout, init_method=init_method + "gloo", device, local_rank, world_size, timeout=timeout, init_method=init_method ) @@ -56,11 +57,12 @@ def test_native_distrib_single_node_launch_tool_nccl(init_method, get_fixed_dirn if init_method == "FILE": init_method = f"file://{get_fixed_dirname('native_distrib_single_node_launch_tool_nccl')}/shared" - _test_native_distrib_single_node_launch_tool("nccl", "cuda", local_rank, world_size, init_method=init_method) + device = torch.device(f"cuda:{local_rank}") + _test_native_distrib_single_node_launch_tool("nccl", device, local_rank, world_size, init_method=init_method) def _test_native_distrib_single_node_spawn(init_method, backend, device, **kwargs): - world_size = 4 if device == "cpu" else torch.cuda.device_count() + world_size = 4 if torch.device(device).type == "cpu" else torch.cuda.device_count() idist.spawn( backend, _test_distrib_config, @@ -84,7 +86,8 @@ def test_native_distrib_single_node_spawn_gloo(init_method, dirname): if init_method == "FILE": init_method = f"file://{dirname}/shared" - _test_native_distrib_single_node_spawn(init_method, "gloo", "cpu", timeout=timeout) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + _test_native_distrib_single_node_spawn(init_method, "gloo", device, timeout=timeout) @pytest.mark.distributed @@ -96,7 +99,8 @@ def test_native_distrib_single_node_spawn_nccl(init_method, dirname): if init_method == "FILE": init_method = f"file://{dirname}/shared" - _test_native_distrib_single_node_spawn(init_method, "nccl", "cuda") + device = torch.device("cuda") + _test_native_distrib_single_node_spawn(init_method, "nccl", device) @pytest.mark.distributed @@ -132,7 +136,8 @@ def _test_idist_methods_in_native_context(backend, device, local_rank): @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support") def test_idist_methods_in_native_gloo_context(distributed_context_single_node_gloo): local_rank = distributed_context_single_node_gloo["local_rank"] - _test_idist_methods_in_native_context("gloo", "cpu", local_rank) + device = torch.device(f"cuda:{local_rank}" if torch.cuda.is_available() else "cpu") + _test_idist_methods_in_native_context("gloo", device, local_rank) @pytest.mark.distributed @@ -140,7 +145,8 @@ def test_idist_methods_in_native_gloo_context(distributed_context_single_node_gl @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_idist_methods_in_native_nccl_context(distributed_context_single_node_nccl): local_rank = distributed_context_single_node_nccl["local_rank"] - _test_idist_methods_in_native_context("nccl", "cuda", local_rank) + device = torch.device(f"cuda:{local_rank}") + _test_idist_methods_in_native_context("nccl", device, local_rank) def _test_idist_methods_in_native_context_set_local_rank(backend, device, local_rank): @@ -166,8 +172,10 @@ def _test_idist_methods_in_native_context_set_local_rank(backend, device, local_ @pytest.mark.distributed @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support") def test_idist_methods_in_native_gloo_context_set_local_rank(distributed_context_single_node_gloo): + local_rank = distributed_context_single_node_gloo["local_rank"] - _test_idist_methods_in_native_context_set_local_rank("gloo", "cpu", local_rank) + device = idist.device() + _test_idist_methods_in_native_context_set_local_rank("gloo", device, local_rank) @pytest.mark.distributed @@ -175,7 +183,8 
@@ def test_idist_methods_in_native_gloo_context_set_local_rank(distributed_context @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_idist_methods_in_native_nccl_context_set_local_rank(distributed_context_single_node_nccl): local_rank = distributed_context_single_node_nccl["local_rank"] - _test_idist_methods_in_native_context_set_local_rank("nccl", "cuda", local_rank) + device = idist.device() + _test_idist_methods_in_native_context_set_local_rank("nccl", device, local_rank) @pytest.mark.distributed @@ -183,7 +192,7 @@ def test_idist_methods_in_native_nccl_context_set_local_rank(distributed_context @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_idist__model_methods_nccl(distributed_context_single_node_nccl): - device = f"cuda:{distributed_context_single_node_nccl['local_rank']}" + device = idist.device() _test_distrib__get_max_length(device) @@ -191,7 +200,7 @@ def test_idist__model_methods_nccl(distributed_context_single_node_nccl): @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support") def test_idist__model_methods_gloo(distributed_context_single_node_gloo): - device = "cpu" + device = idist.device() _test_distrib__get_max_length(device) @@ -200,7 +209,7 @@ def test_idist__model_methods_gloo(distributed_context_single_node_gloo): @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_idist_all_reduce_nccl(distributed_context_single_node_nccl): - device = f"cuda:{distributed_context_single_node_nccl['local_rank']}" + device = idist.device() _test_distrib_all_reduce(device) @@ -208,7 +217,7 @@ def test_idist_all_reduce_nccl(distributed_context_single_node_nccl): @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support") def test_idist_all_reduce_gloo(distributed_context_single_node_gloo): - device = "cpu" + device = idist.device() _test_distrib_all_reduce(device) @@ -217,7 +226,7 @@ def test_idist_all_reduce_gloo(distributed_context_single_node_gloo): @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_idist_all_gather_nccl(distributed_context_single_node_nccl): - device = f"cuda:{distributed_context_single_node_nccl['local_rank']}" + device = idist.device() _test_distrib_all_gather(device) @@ -225,7 +234,7 @@ def test_idist_all_gather_nccl(distributed_context_single_node_nccl): @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support") def test_idist_all_gather_gloo(distributed_context_single_node_gloo): - device = "cpu" + device = idist.device() _test_distrib_all_gather(device) @@ -234,7 +243,7 @@ def test_idist_all_gather_gloo(distributed_context_single_node_gloo): @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_idist_broadcast_nccl(distributed_context_single_node_nccl): - device = f"cuda:{distributed_context_single_node_nccl['local_rank']}" + device = idist.device() _test_distrib_broadcast(device) @@ -242,7 +251,7 @@ def test_idist_broadcast_nccl(distributed_context_single_node_nccl): @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support") def test_idist_broadcast_gloo(distributed_context_single_node_gloo): - device = "cpu" + device = idist.device() _test_distrib_broadcast(device) @@ -251,7 +260,7 @@ def test_idist_broadcast_gloo(distributed_context_single_node_gloo): @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def 
test_idist_barrier_nccl(distributed_context_single_node_nccl): - device = f"cuda:{distributed_context_single_node_nccl['local_rank']}" + device = idist.device() _test_distrib_barrier(device) @@ -259,7 +268,7 @@ def test_idist_barrier_nccl(distributed_context_single_node_nccl): @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support") def test_idist_barrier_gloo(distributed_context_single_node_gloo): - device = "cpu" + device = idist.device() _test_distrib_barrier(device) @@ -325,7 +334,8 @@ def test_idist_methods_overhead_nccl(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support") def test_idist_one_rank_only_gloo(distributed_context_single_node_gloo): - device = "cpu" + + device = idist.device() _test_distrib_one_rank_only(device=device) _test_distrib_one_rank_only_with_engine(device=device) @@ -334,6 +344,7 @@ def test_idist_one_rank_only_gloo(distributed_context_single_node_gloo): @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_idist_one_rank_only_nccl(local_rank, distributed_context_single_node_nccl): - device = f"cuda:{local_rank}" + + device = idist.device() _test_distrib_one_rank_only(device=device) _test_distrib_one_rank_only_with_engine(device=device) diff --git a/tests/ignite/engine/test_custom_events.py b/tests/ignite/engine/test_custom_events.py index 2a09e116b1ca..6c9bf230dce5 100644 --- a/tests/ignite/engine/test_custom_events.py +++ b/tests/ignite/engine/test_custom_events.py @@ -455,7 +455,7 @@ def _test(num_workers): data, batch_size=batch_size, num_workers=num_workers, - pin_memory="cuda" in device, + pin_memory="cuda" in torch.device(device).type, drop_last=True, shuffle=True, ) @@ -489,16 +489,19 @@ def test_every_event_filter_with_engine_with_dataloader(): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): - _test_every_event_filter_with_engine() - _test_every_event_filter_with_engine_with_dataloader("cpu") +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): + + device = idist.device() + _test_every_event_filter_with_engine(device) + _test_every_event_filter_with_engine_with_dataloader(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(distributed_context_single_node_nccl): - device = f"cuda:{distributed_context_single_node_nccl['local_rank']}" +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + device = idist.device() _test_every_event_filter_with_engine(device) _test_every_event_filter_with_engine_with_dataloader(device) diff --git a/tests/ignite/engine/test_deterministic.py b/tests/ignite/engine/test_deterministic.py index c1cd8e90cb46..f7a581ba4343 100644 --- a/tests/ignite/engine/test_deterministic.py +++ b/tests/ignite/engine/test_deterministic.py @@ -262,7 +262,7 @@ def _test(epoch_length=None): data, batch_size=batch_size, num_workers=num_workers, - pin_memory="cuda" in device, + pin_memory="cuda" in torch.device(device).type, sampler=sampler, drop_last=True, shuffle=sampler is None, @@ -294,7 +294,7 @@ def _(engine): data, batch_size=batch_size, 
num_workers=num_workers, - pin_memory="cuda" in device, + pin_memory="cuda" in torch.device(device).type, sampler=sampler, drop_last=True, shuffle=sampler is None, @@ -370,7 +370,7 @@ def _test(epoch_length=None): data, batch_size=batch_size, num_workers=num_workers, - pin_memory="cuda" in device, + pin_memory="cuda" in torch.device(device).type, sampler=sampler, drop_last=True, shuffle=sampler is None, @@ -401,7 +401,7 @@ def _(engine): data, batch_size=batch_size, num_workers=num_workers, - pin_memory="cuda" in device, + pin_memory="cuda" in torch.device(device).type, sampler=sampler, drop_last=True, shuffle=sampler is None, @@ -563,16 +563,18 @@ def test_resume_random_data_iterator_from_iter(): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(distributed_context_single_node_nccl): - device = f"cuda:{distributed_context_single_node_nccl['local_rank']}" +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + device = idist.device() _test_resume_random_dataloader_from_iter(device, setup_sampler, sampler_type="distributed") _test_resume_random_dataloader_from_epoch(device, setup_sampler, sampler_type="distributed") @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): - device = "cpu" +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): + + device = idist.device() _test_resume_random_dataloader_from_iter(device, setup_sampler, sampler_type="distributed") _test_resume_random_dataloader_from_epoch(device, setup_sampler, sampler_type="distributed") @@ -581,8 +583,9 @@ def test_distrib_cpu(distributed_context_single_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = "cpu" +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): + + device = idist.device() _test_resume_random_dataloader_from_iter(device, setup_sampler, sampler_type="distributed") _test_resume_random_dataloader_from_epoch(device, setup_sampler, sampler_type="distributed") @@ -590,8 +593,9 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = f"cuda:{distributed_context_multi_node_nccl['local_rank']}" +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): + + device = idist.device() _test_resume_random_dataloader_from_iter(device, setup_sampler, sampler_type="distributed") _test_resume_random_dataloader_from_epoch(device, setup_sampler, sampler_type="distributed") diff --git a/tests/ignite/engine/test_engine.py b/tests/ignite/engine/test_engine.py index 3500ad4328a1..ebbdf74c5c5e 100644 --- a/tests/ignite/engine/test_engine.py +++ b/tests/ignite/engine/test_engine.py @@ -500,14 +500,14 @@ def test_run_check_triggered_events_on_iterator(): 
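# The pin_memory hunks above replace `"cuda" in device` with
# `"cuda" in torch.device(device).type` so the check accepts both strings and
# torch.device objects; `"cuda" in device` raises TypeError when `device` is a
# torch.device. A small self-contained illustration (hypothetical helper name):
import torch

def wants_pin_memory(device) -> bool:
    # Normalize first: .type is "cuda" or "cpu", with any ":<index>" dropped.
    return "cuda" in torch.device(device).type

assert wants_pin_memory("cuda:1")
assert not wants_pin_memory(torch.device("cpu"))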
@pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(distributed_context_single_node_nccl): +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): _test_run_check_triggered_events_on_iterator() _test_run_check_triggered_events() @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): _test_run_check_triggered_events_on_iterator() _test_run_check_triggered_events() @@ -515,7 +515,7 @@ def test_distrib_cpu(distributed_context_single_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): _test_run_check_triggered_events_on_iterator() _test_run_check_triggered_events() @@ -523,7 +523,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): _test_run_check_triggered_events_on_iterator() _test_run_check_triggered_events() diff --git a/tests/ignite/handlers/test_checkpoint.py b/tests/ignite/handlers/test_checkpoint.py index 2319be61f92b..de35a1eb4247 100644 --- a/tests/ignite/handlers/test_checkpoint.py +++ b/tests/ignite/handlers/test_checkpoint.py @@ -1188,8 +1188,9 @@ def _test_checkpoint_load_objects_ddp(device): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo, get_rank_zero_dirname): - device = torch.device("cpu") +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo, get_rank_zero_dirname): + + device = idist.device() dirname = get_rank_zero_dirname() _test_save_model_optimizer_lr_scheduler_with_state_dict(device, os.path.join(dirname, "1")) _test_save_model_optimizer_lr_scheduler_with_state_dict(device, os.path.join(dirname, "2"), on_zero_rank=True) @@ -1200,7 +1201,8 @@ def test_distrib_cpu(distributed_context_single_node_gloo, get_rank_zero_dirname @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(distributed_context_single_node_nccl, get_rank_zero_dirname): +def test_distrib_nccl_gpu(distributed_context_single_node_nccl, get_rank_zero_dirname): + device = idist.device() dirname = get_rank_zero_dirname() _test_save_model_optimizer_lr_scheduler_with_state_dict(device, os.path.join(dirname, "1")) diff --git a/tests/ignite/handlers/test_early_stopping.py b/tests/ignite/handlers/test_early_stopping.py index 712338e11f45..66b96f757042 100644 --- 
a/tests/ignite/handlers/test_early_stopping.py +++ b/tests/ignite/handlers/test_early_stopping.py @@ -336,16 +336,18 @@ def evaluation(engine): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): - device = f"cuda:{local_rank}" +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + device = idist.device() _test_distrib_with_engine_early_stopping(device) _test_distrib_integration_engine_early_stopping(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(local_rank, distributed_context_single_node_gloo): - device = "cpu" +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): + + device = idist.device() _test_distrib_with_engine_early_stopping(device) _test_distrib_integration_engine_early_stopping(device) @@ -353,8 +355,9 @@ def test_distrib_cpu(local_rank, distributed_context_single_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = "cpu" +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): + + device = idist.device() _test_distrib_with_engine_early_stopping(device) _test_distrib_integration_engine_early_stopping(device) @@ -362,7 +365,8 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = f"cuda:{distributed_context_multi_node_nccl['local_rank']}" +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): + + device = idist.device() _test_distrib_with_engine_early_stopping(device) _test_distrib_integration_engine_early_stopping(device) diff --git a/tests/ignite/metrics/nlp/test_bleu.py b/tests/ignite/metrics/nlp/test_bleu.py index c98143cf98d2..745678449830 100644 --- a/tests/ignite/metrics/nlp/test_bleu.py +++ b/tests/ignite/metrics/nlp/test_bleu.py @@ -184,15 +184,17 @@ def _test(metric_device): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): - device = torch.device(f"cuda:{local_rank}") +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + device = idist.device() _test_distrib_integration(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): - device = torch.device("cpu") +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): + + device = idist.device() _test_distrib_integration(device) @@ -210,16 +212,18 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.multinode_distributed @pytest.mark.skipif(not 
idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): + + device = idist.device() _test_distrib_integration(device) @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): + + device = idist.device() _test_distrib_integration(device) diff --git a/tests/ignite/metrics/nlp/test_rouge.py b/tests/ignite/metrics/nlp/test_rouge.py index 40aafae189c2..7102a6f98f12 100644 --- a/tests/ignite/metrics/nlp/test_rouge.py +++ b/tests/ignite/metrics/nlp/test_rouge.py @@ -176,15 +176,17 @@ def _test(metric_device): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): - device = torch.device(f"cuda:{local_rank}") +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + device = idist.device() _test_distrib_integration(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): - device = torch.device("cpu") +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): + + device = idist.device() _test_distrib_integration(device) @@ -202,16 +204,18 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): + + device = idist.device() _test_distrib_integration(device) @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): + + device = idist.device() _test_distrib_integration(device) diff --git a/tests/ignite/metrics/test_accumulation.py b/tests/ignite/metrics/test_accumulation.py index e47a9273e583..1034c3cfd21c 100644 --- a/tests/ignite/metrics/test_accumulation.py +++ b/tests/ignite/metrics/test_accumulation.py @@ -425,9 +425,9 @@ def _test_apex_average(device, amp_mode, opt_level): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") 
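# The spawn and Horovod executor tests in these files choose the process count
# from the visible GPUs and fall back to a fixed CPU pool. A minimal sketch of
# that selection (the function name is illustrative, not part of the suite):
import torch

def pick_spawn_config(cpu_workers: int = 4):
    # One worker per GPU when CUDA is visible, otherwise `cpu_workers` CPU processes.
    if torch.cuda.is_available():
        return torch.cuda.device_count(), torch.device("cuda")
    return cpu_workers, torch.device("cpu")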
@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(distributed_context_single_node_nccl): +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = torch.device(f"cuda:{distributed_context_single_node_nccl['local_rank']}") + device = idist.device() _test_distrib_variable_accumulation(device) _test_distrib_average(device) _test_distrib_geom_average(device) @@ -437,21 +437,9 @@ def test_distrib_gpu(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): - - device = torch.device("cpu") - _test_distrib_variable_accumulation(device) - _test_distrib_average(device) - _test_distrib_geom_average(device) - _test_distrib_integration(device) - _test_distrib_accumulator_device(device) - +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): -@pytest.mark.multinode_distributed -@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -@pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") + device = idist.device() _test_distrib_variable_accumulation(device) _test_distrib_average(device) _test_distrib_geom_average(device) @@ -464,7 +452,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_distrib_hvd(gloo_hvd_executor): - device = torch.device("cpu" if not torch.cuda.is_available() else "cuda") + device = idist.device() nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() gloo_hvd_executor(_test_distrib_variable_accumulation, (device,), np=nproc, do_init=True) @@ -474,22 +462,11 @@ def test_distrib_hvd(gloo_hvd_executor): gloo_hvd_executor(_test_distrib_accumulator_device, (device,), np=nproc, do_init=True) -@pytest.mark.multinode_distributed -@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -@pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") - _test_distrib_variable_accumulation(device) - _test_distrib_average(device) - _test_distrib_geom_average(device) - _test_distrib_integration(device) - _test_distrib_accumulator_device(device) - - @pytest.mark.tpu @pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars") @pytest.mark.skipif(not idist.has_xla_support, reason="Skip if no PyTorch XLA package") def test_distrib_single_device_xla(): + device = idist.device() _test_distrib_variable_accumulation(device) _test_distrib_average(device) @@ -523,3 +500,29 @@ def test_apex_average_on_cuda(): _test_apex_average(device, amp_mode="apex", opt_level="O1") _test_apex_average(device, amp_mode="apex", opt_level="O2") _test_apex_average(device, amp_mode="apex", opt_level="O3") + + +@pytest.mark.multinode_distributed +@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") +@pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") +def 
test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): + + device = idist.device() + _test_distrib_variable_accumulation(device) + _test_distrib_average(device) + _test_distrib_geom_average(device) + _test_distrib_integration(device) + _test_distrib_accumulator_device(device) + + +@pytest.mark.multinode_distributed +@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") +@pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): + + device = idist.device() + _test_distrib_variable_accumulation(device) + _test_distrib_average(device) + _test_distrib_geom_average(device) + _test_distrib_integration(device) + _test_distrib_accumulator_device(device) diff --git a/tests/ignite/metrics/test_accuracy.py b/tests/ignite/metrics/test_accuracy.py index 6f84735cc3ee..c997d2f62a0a 100644 --- a/tests/ignite/metrics/test_accuracy.py +++ b/tests/ignite/metrics/test_accuracy.py @@ -503,8 +503,9 @@ def _test_distrib_accumulator_device(device): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(distributed_context_single_node_nccl): - device = torch.device(f"cuda:{distributed_context_single_node_nccl['local_rank']}") +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + device = idist.device() _test_distrib_multilabel_input_NHW(device) _test_distrib_integration_multiclass(device) _test_distrib_integration_multilabel(device) @@ -513,9 +514,9 @@ def test_distrib_gpu(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = torch.device("cpu") + device = idist.device() _test_distrib_multilabel_input_NHW(device) _test_distrib_integration_multiclass(device) _test_distrib_integration_multilabel(device) @@ -536,22 +537,19 @@ def test_distrib_hvd(gloo_hvd_executor): gloo_hvd_executor(_test_distrib_accumulator_device, (device,), np=nproc, do_init=True) -@pytest.mark.multinode_distributed -@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -@pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") +@pytest.mark.tpu +@pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars") +@pytest.mark.skipif(not idist.has_xla_support, reason="Skip if no PyTorch XLA package") +def test_distrib_single_device_xla(): + device = idist.device() _test_distrib_multilabel_input_NHW(device) _test_distrib_integration_multiclass(device) _test_distrib_integration_multilabel(device) _test_distrib_accumulator_device(device) -@pytest.mark.multinode_distributed -@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -@pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = 
torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") +def _test_distrib_xla_nprocs(index): + device = idist.device() _test_distrib_multilabel_input_NHW(device) _test_distrib_integration_multiclass(device) _test_distrib_integration_multilabel(device) @@ -559,9 +557,18 @@ def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): @pytest.mark.tpu -@pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars") +@pytest.mark.skipif("NUM_TPU_WORKERS" not in os.environ, reason="Skip if no NUM_TPU_WORKERS in env vars") @pytest.mark.skipif(not idist.has_xla_support, reason="Skip if no PyTorch XLA package") -def test_distrib_single_device_xla(): +def test_distrib_xla_nprocs(xmp_executor): + n = int(os.environ["NUM_TPU_WORKERS"]) + xmp_executor(_test_distrib_xla_nprocs, args=(), nprocs=n) + + +@pytest.mark.multinode_distributed +@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") +@pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): + device = idist.device() _test_distrib_multilabel_input_NHW(device) _test_distrib_integration_multiclass(device) @@ -569,17 +576,13 @@ def test_distrib_single_device_xla(): _test_distrib_accumulator_device(device) -def _test_distrib_xla_nprocs(index): +@pytest.mark.multinode_distributed +@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") +@pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): + device = idist.device() _test_distrib_multilabel_input_NHW(device) _test_distrib_integration_multiclass(device) _test_distrib_integration_multilabel(device) _test_distrib_accumulator_device(device) - - -@pytest.mark.tpu -@pytest.mark.skipif("NUM_TPU_WORKERS" not in os.environ, reason="Skip if no NUM_TPU_WORKERS in env vars") -@pytest.mark.skipif(not idist.has_xla_support, reason="Skip if no PyTorch XLA package") -def test_distrib_xla_nprocs(xmp_executor): - n = int(os.environ["NUM_TPU_WORKERS"]) - xmp_executor(_test_distrib_xla_nprocs, args=(), nprocs=n) diff --git a/tests/ignite/metrics/test_classification_report.py b/tests/ignite/metrics/test_classification_report.py index 3a15987a6a04..57b80cfcf364 100644 --- a/tests/ignite/metrics/test_classification_report.py +++ b/tests/ignite/metrics/test_classification_report.py @@ -141,23 +141,23 @@ def update(engine, i): _test(metric_device, 2, ["0", "1", "2", "3", "4", "5", "6"]) -@pytest.mark.multinode_distributed +@pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -@pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): +@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") + device = idist.device() _test_integration_multiclass(device, True) _test_integration_multiclass(device, False) _test_integration_multilabel(device, True) _test_integration_multilabel(device, False) -@pytest.mark.multinode_distributed +@pytest.mark.distributed @pytest.mark.skipif(not 
idist.has_native_dist_support, reason="Skip if no native dist support") -@pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") +def test_distrib_gloo_cpu_or_gpu(local_rank, distributed_context_single_node_gloo): + + device = idist.device() _test_integration_multiclass(device, True) _test_integration_multiclass(device, False) _test_integration_multilabel(device, True) @@ -165,13 +165,17 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.distributed -@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(local_rank, distributed_context_single_node_gloo): - device = torch.device("cpu") - _test_integration_multiclass(device, True) - _test_integration_multiclass(device, False) - _test_integration_multilabel(device, True) - _test_integration_multilabel(device, False) +@pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support") +@pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") +def test_distrib_hvd(gloo_hvd_executor): + + device = torch.device("cpu" if not torch.cuda.is_available() else "cuda") + nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() + + gloo_hvd_executor(_test_integration_multiclass, (device, True), np=nproc, do_init=True) + gloo_hvd_executor(_test_integration_multiclass, (device, False), np=nproc, do_init=True) + gloo_hvd_executor(_test_integration_multilabel, (device, True), np=nproc, do_init=True) + gloo_hvd_executor(_test_integration_multilabel, (device, False), np=nproc, do_init=True) def _test_distrib_xla_nprocs(index): @@ -197,3 +201,27 @@ def to_numpy_multilabel(y): num_classes = y.shape[0] y = y.reshape((num_classes, -1)).transpose(1, 0) return y + + +@pytest.mark.multinode_distributed +@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") +@pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): + + device = idist.device() + _test_integration_multiclass(device, True) + _test_integration_multiclass(device, False) + _test_integration_multilabel(device, True) + _test_integration_multilabel(device, False) + + +@pytest.mark.multinode_distributed +@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") +@pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): + + device = idist.device() + _test_integration_multiclass(device, True) + _test_integration_multiclass(device, False) + _test_integration_multilabel(device, True) + _test_integration_multilabel(device, False) diff --git a/tests/ignite/metrics/test_confusion_matrix.py b/tests/ignite/metrics/test_confusion_matrix.py index 1745b532408f..af85e060d9b3 100644 --- a/tests/ignite/metrics/test_confusion_matrix.py +++ b/tests/ignite/metrics/test_confusion_matrix.py @@ -589,18 +589,18 @@ def _test(average=None): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(local_rank, 
distributed_context_single_node_nccl): +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = torch.device(f"cuda:{local_rank}") + device = idist.device() _test_distrib_multiclass_images(device) _test_distrib_accumulator_device(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = torch.device("cpu") + device = idist.device() _test_distrib_multiclass_images(device) _test_distrib_accumulator_device(device) @@ -617,24 +617,6 @@ def test_distrib_hvd(gloo_hvd_executor): gloo_hvd_executor(_test_distrib_accumulator_device, (device,), np=nproc, do_init=True) -@pytest.mark.multinode_distributed -@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -@pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") - _test_distrib_multiclass_images(device) - _test_distrib_accumulator_device(device) - - -@pytest.mark.multinode_distributed -@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -@pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") - _test_distrib_multiclass_images(device) - _test_distrib_accumulator_device(device) - - @pytest.mark.tpu @pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars") @pytest.mark.skipif(not idist.has_xla_support, reason="Skip if no PyTorch XLA package") @@ -656,3 +638,23 @@ def _test_distrib_xla_nprocs(index): def test_distrib_xla_nprocs(xmp_executor): n = int(os.environ["NUM_TPU_WORKERS"]) xmp_executor(_test_distrib_xla_nprocs, args=(), nprocs=n) + + +@pytest.mark.multinode_distributed +@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") +@pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): + + device = idist.device() + _test_distrib_multiclass_images(device) + _test_distrib_accumulator_device(device) + + +@pytest.mark.multinode_distributed +@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") +@pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): + + device = idist.device() + _test_distrib_multiclass_images(device) + _test_distrib_accumulator_device(device) diff --git a/tests/ignite/metrics/test_epoch_metric.py b/tests/ignite/metrics/test_epoch_metric.py index ef660a16be41..70296d484180 100644 --- a/tests/ignite/metrics/test_epoch_metric.py +++ b/tests/ignite/metrics/test_epoch_metric.py @@ -193,14 +193,18 @@ def assert_data_fn(all_preds, all_targets): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(distributed_context_single_node_nccl): - 
_test_distrib_integration(device="cuda") +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + device = idist.device() + _test_distrib_integration(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): - _test_distrib_integration(device="cpu") +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): + + device = idist.device() + _test_distrib_integration(device) @pytest.mark.tpu diff --git a/tests/ignite/metrics/test_fbeta.py b/tests/ignite/metrics/test_fbeta.py index 8e38d516140c..27eb28905189 100644 --- a/tests/ignite/metrics/test_fbeta.py +++ b/tests/ignite/metrics/test_fbeta.py @@ -146,15 +146,17 @@ def update(engine, i): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): - device = torch.device(f"cuda:{local_rank}") +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + device = idist.device() _test_distrib_integration(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): - device = torch.device("cpu") +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): + + device = idist.device() _test_distrib_integration(device) @@ -169,22 +171,6 @@ def test_distrib_hvd(gloo_hvd_executor): gloo_hvd_executor(_test_distrib_integration, (device,), np=nproc, do_init=True) -@pytest.mark.multinode_distributed -@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -@pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") - _test_distrib_integration(device) - - -@pytest.mark.multinode_distributed -@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -@pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") - _test_distrib_integration(device) - - @pytest.mark.tpu @pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars") @pytest.mark.skipif(not idist.has_xla_support, reason="Skip if no PyTorch XLA package") @@ -204,3 +190,21 @@ def _test_distrib_xla_nprocs(index): def test_distrib_xla_nprocs(xmp_executor): n = int(os.environ["NUM_TPU_WORKERS"]) xmp_executor(_test_distrib_xla_nprocs, args=(), nprocs=n) + + +@pytest.mark.multinode_distributed +@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") +@pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): + + device = idist.device() + _test_distrib_integration(device) + + +@pytest.mark.multinode_distributed +@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") +@pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, 
reason="Skip if not multi-node distributed") +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): + + device = idist.device() + _test_distrib_integration(device) diff --git a/tests/ignite/metrics/test_loss.py b/tests/ignite/metrics/test_loss.py index 353933b0ee04..23749b960072 100644 --- a/tests/ignite/metrics/test_loss.py +++ b/tests/ignite/metrics/test_loss.py @@ -181,18 +181,18 @@ def test_sum_detached(): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = torch.device(f"cuda:{local_rank}") + device = idist.device() _test_distrib_compute_on_criterion(device, y_test_1(), y_test_2()) _test_distrib_accumulator_device(device, y_test_1()) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = torch.device("cpu") + device = idist.device() _test_distrib_compute_on_criterion(device, y_test_1(), y_test_2()) _test_distrib_accumulator_device(device, y_test_1()) @@ -209,34 +209,18 @@ def test_distrib_hvd(gloo_hvd_executor): gloo_hvd_executor(_test_distrib_accumulator_device, (device, y_test_1()), np=nproc, do_init=True) -@pytest.mark.multinode_distributed -@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -@pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") - _test_distrib_compute_on_criterion(device, y_test_1(), y_test_2(), tol=1e-6) - _test_distrib_accumulator_device(device, y_test_1()) - - -@pytest.mark.multinode_distributed -@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -@pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") - _test_distrib_compute_on_criterion(device, y_test_1(), y_test_2()) - _test_distrib_accumulator_device(device, y_test_1()) - - @pytest.mark.tpu @pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars") @pytest.mark.skipif(not idist.has_xla_support, reason="Skip if no PyTorch XLA package") def test_distrib_single_device_xla(): + device = idist.device() _test_distrib_compute_on_criterion(device, y_test_1(), y_test_2()) _test_distrib_accumulator_device(device, y_test_1()) def _test_distrib_xla_nprocs(index): + device = idist.device() _test_distrib_compute_on_criterion(device, y_test_1(), y_test_2()) _test_distrib_accumulator_device(device, y_test_1()) @@ -248,3 +232,23 @@ def _test_distrib_xla_nprocs(index): def test_distrib_xla_nprocs(xmp_executor): n = int(os.environ["NUM_TPU_WORKERS"]) xmp_executor(_test_distrib_xla_nprocs, args=(), nprocs=n) + + +@pytest.mark.multinode_distributed +@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") +@pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not 
multi-node distributed") +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): + + device = idist.device() + _test_distrib_compute_on_criterion(device, y_test_1(), y_test_2(), tol=1e-6) + _test_distrib_accumulator_device(device, y_test_1()) + + +@pytest.mark.multinode_distributed +@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") +@pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): + + device = idist.device() + _test_distrib_compute_on_criterion(device, y_test_1(), y_test_2()) + _test_distrib_accumulator_device(device, y_test_1()) diff --git a/tests/ignite/metrics/test_mean_absolute_error.py b/tests/ignite/metrics/test_mean_absolute_error.py index cfd662dedddf..0e9e2a75140f 100644 --- a/tests/ignite/metrics/test_mean_absolute_error.py +++ b/tests/ignite/metrics/test_mean_absolute_error.py @@ -129,16 +129,18 @@ def test_accumulator_detached(): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): - device = torch.device(f"cuda:{local_rank}") +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): - device = torch.device("cpu") +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): + + device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @@ -158,8 +160,9 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): + + device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @@ -167,8 +170,9 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): + + device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @@ -177,6 +181,7 @@ def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): @pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars") @pytest.mark.skipif(not idist.has_xla_support, reason="Skip if no PyTorch XLA package") def test_distrib_single_device_xla(): + device = idist.device() _test_distrib_integration(device) 
_test_distrib_accumulator_device(device) diff --git a/tests/ignite/metrics/test_mean_pairwise_distance.py b/tests/ignite/metrics/test_mean_pairwise_distance.py index 2e042fd4703d..4ba5bdf4ec00 100644 --- a/tests/ignite/metrics/test_mean_pairwise_distance.py +++ b/tests/ignite/metrics/test_mean_pairwise_distance.py @@ -138,16 +138,18 @@ def test_accumulator_detached(): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): - device = torch.device(f"cuda:{local_rank}") +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): - device = torch.device("cpu") +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): + + device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @@ -167,8 +169,9 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): + + device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @@ -176,8 +179,9 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): + + device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) diff --git a/tests/ignite/metrics/test_mean_squared_error.py b/tests/ignite/metrics/test_mean_squared_error.py index 5f5a169d2770..a1df3fb3a5cf 100644 --- a/tests/ignite/metrics/test_mean_squared_error.py +++ b/tests/ignite/metrics/test_mean_squared_error.py @@ -131,17 +131,18 @@ def test_accumulator_detached(): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = torch.device(f"cuda:{local_rank}") + device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): - device = torch.device("cpu") +def 
diff --git a/tests/ignite/metrics/test_metric.py b/tests/ignite/metrics/test_metric.py
index 06997139281b..8664d6c230aa 100644
--- a/tests/ignite/metrics/test_metric.py
+++ b/tests/ignite/metrics/test_metric.py
@@ -668,18 +668,18 @@ def _test_creating_on_xla_fails(device):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
-def test_distrib_gpu(distributed_context_single_node_nccl):
+def test_distrib_nccl_gpu(distributed_context_single_node_nccl):

-    device = f"cuda:{distributed_context_single_node_nccl['local_rank']}"
+    device = idist.device()
     _test_distrib_sync_all_reduce_decorator(device)
     _test_invalid_sync_all_reduce(device)


 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
-def test_distrib_cpu(distributed_context_single_node_gloo):
+def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):

-    device = "cpu"
+    device = idist.device()
     _test_distrib_sync_all_reduce_decorator(device)
     _test_invalid_sync_all_reduce(device)

@@ -699,8 +699,9 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.multinode_distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
-def test_multinode_distrib_cpu(distributed_context_multi_node_gloo):
-    device = "cpu"
+def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
+
+    device = idist.device()
     _test_distrib_sync_all_reduce_decorator(device)
     _test_invalid_sync_all_reduce(device)

@@ -708,8 +709,9 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo):
 @pytest.mark.multinode_distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
-def test_multinode_distrib_gpu(distributed_context_multi_node_nccl):
-    device = f"cuda:{distributed_context_multi_node_nccl['local_rank']}"
+def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
+
+    device = idist.device()
     _test_distrib_sync_all_reduce_decorator(device)
     _test_invalid_sync_all_reduce(device)
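The _test_distrib_sync_all_reduce_decorator cases above exercise the public decorator pair that metric authors use. For reference, a self-contained toy metric built on that API (the decorators are ignite's real API; the metric itself is made up):

import torch
from ignite.metrics.metric import Metric, reinit__is_reduced, sync_all_reduce

class ToyMeanAbsolute(Metric):
    @reinit__is_reduced
    def reset(self):
        self._sum = torch.tensor(0.0, device=self._device)
        self._num_examples = 0

    @reinit__is_reduced
    def update(self, output):
        y_pred, y = output
        self._sum += torch.sum(torch.abs(y_pred - y)).to(self._device)
        self._num_examples += y.shape[0]

    @sync_all_reduce("_sum", "_num_examples")
    def compute(self):
        # By the time this body runs, _sum and _num_examples hold the
        # values summed across every process in the group.
        return self._sum.item() / self._num_examples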
diff --git a/tests/ignite/metrics/test_metrics_lambda.py b/tests/ignite/metrics/test_metrics_lambda.py
index 618cb35fd965..1376f414fc9b 100644
--- a/tests/ignite/metrics/test_metrics_lambda.py
+++ b/tests/ignite/metrics/test_metrics_lambda.py
@@ -402,18 +402,18 @@ def update(engine, i):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
-def test_distrib_gpu(local_rank, distributed_context_single_node_nccl):
+def test_distrib_nccl_gpu(distributed_context_single_node_nccl):

-    device = torch.device(f"cuda:{local_rank}")
+    device = idist.device()
     _test_distrib_integration(device)
     _test_distrib_metrics_on_diff_devices(device)


 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
-def test_distrib_cpu(local_rank, distributed_context_single_node_gloo):
+def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):

-    device = torch.device("cpu")
+    device = idist.device()
     _test_distrib_integration(device)

@@ -432,16 +432,18 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.multinode_distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
-def test_multinode_distrib_cpu(distributed_context_multi_node_gloo):
-    device = torch.device("cpu")
+def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
+
+    device = idist.device()
     _test_distrib_integration(device)


 @pytest.mark.multinode_distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
-def test_multinode_distrib_gpu(distributed_context_multi_node_nccl):
-    device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}")
+def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
+
+    device = idist.device()
     _test_distrib_integration(device)
     _test_distrib_metrics_on_diff_devices(device)
diff --git a/tests/ignite/metrics/test_multilabel_confusion_matrix.py b/tests/ignite/metrics/test_multilabel_confusion_matrix.py
index c4ea2bc2987a..01c959332fb8 100644
--- a/tests/ignite/metrics/test_multilabel_confusion_matrix.py
+++ b/tests/ignite/metrics/test_multilabel_confusion_matrix.py
@@ -359,18 +359,18 @@ def test_simple_batched():
 # @pytest.mark.distributed
 # @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 # @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
-# def test_distrib_gpu(local_rank, distributed_context_single_node_nccl):
+# def test_distrib_nccl_gpu(distributed_context_single_node_nccl):

-#     device = torch.device(f"cuda:{local_rank}")
+#     device = idist.device()
 #     _test_distrib_multiclass_images(device)
 #     _test_distrib_accumulator_device(device)


 # @pytest.mark.distributed
 # @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
-# def test_distrib_cpu(distributed_context_single_node_gloo):
+# def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):

-#     device = torch.device("cpu")
+#     device = idist.device()
 #     _test_distrib_multiclass_images(device)
 #     _test_distrib_accumulator_device(device)

@@ -390,8 +390,9 @@ def test_simple_batched():
 # @pytest.mark.multinode_distributed
 # @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 # @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
-# def test_multinode_distrib_cpu(distributed_context_multi_node_gloo):
-#     device = torch.device("cpu")
+# def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
+#
+#     device = idist.device()
 #     _test_distrib_multiclass_images(device)
 #     _test_distrib_accumulator_device(device)

@@ -399,8 +400,9 @@ def test_simple_batched():
 # @pytest.mark.multinode_distributed
 # @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 # @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
-# def test_multinode_distrib_gpu(distributed_context_multi_node_nccl):
-#     device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}")
+# def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
+#
+#     device = idist.device()
 #     _test_distrib_multiclass_images(device)
 #     _test_distrib_accumulator_device(device)
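The _test_distrib_accumulator_device helper called throughout (including in the commented-out block above) pins down where a metric stores its accumulators. Roughly, and with the caveat that the repository helper inspects the private accumulator tensors directly rather than this simplified attribute:

import torch
import ignite.distributed as idist
from ignite.metrics import MeanAbsoluteError

def _sketch_accumulator_device_check():
    device = idist.device()
    m = MeanAbsoluteError(device=device)
    m.update((torch.rand(4, 2), torch.rand(4, 2)))
    # The metric should keep its state on the device it was created with;
    # the real helper additionally checks the accumulator tensor itself.
    assert m._device == torch.device(device)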
diff --git a/tests/ignite/metrics/test_precision.py b/tests/ignite/metrics/test_precision.py
index bff02cc65c27..be2f6a909fd9 100644
--- a/tests/ignite/metrics/test_precision.py
+++ b/tests/ignite/metrics/test_precision.py
@@ -518,8 +518,9 @@ def _test(average, metric_device):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
-def test_distrib_gpu(local_rank, distributed_context_single_node_nccl):
-    device = torch.device(f"cuda:{local_rank}")
+def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
+
+    device = idist.device()
     _test_distrib_integration_multiclass(device)
     _test_distrib_integration_multilabel(device)
     _test_distrib_accumulator_device(device)

@@ -528,8 +529,9 @@ def test_distrib_gpu(local_rank, distributed_context_single_node_nccl):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
-def test_distrib_cpu(local_rank, distributed_context_single_node_gloo):
-    device = torch.device("cpu")
+def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
+
+    device = idist.device()
     _test_distrib_integration_multiclass(device)
     _test_distrib_integration_multilabel(device)
     _test_distrib_accumulator_device(device)

@@ -553,8 +555,9 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.multinode_distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
-def test_multinode_distrib_cpu(distributed_context_multi_node_gloo):
-    device = torch.device("cpu")
+def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
+
+    device = idist.device()
     _test_distrib_integration_multiclass(device)
     _test_distrib_integration_multilabel(device)
     _test_distrib_accumulator_device(device)

@@ -564,8 +567,9 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo):
 @pytest.mark.multinode_distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
-def test_multinode_distrib_gpu(distributed_context_multi_node_nccl):
-    device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}")
+def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
+
+    device = idist.device()
     _test_distrib_integration_multiclass(device)
     _test_distrib_integration_multilabel(device)
     _test_distrib_accumulator_device(device)
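All of these tests lean on conftest fixtures to create the process group. Outside pytest, the same single-node setup can be driven with idist.spawn; a minimal runnable example, where the backend and process count are arbitrary choices:

import torch
import ignite.distributed as idist

def _worker(local_rank):
    device = idist.device()  # resolved per backend, as in the tests above
    t = torch.tensor([float(idist.get_rank() + 1)], device=device)
    t = idist.all_reduce(t)  # sum across the group
    ws = idist.get_world_size()
    assert t.item() == ws * (ws + 1) / 2

if __name__ == "__main__":
    # Two CPU processes over gloo; "nccl" would be the GPU analogue.
    idist.spawn("gloo", _worker, args=(), nproc_per_node=2)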
diff --git a/tests/ignite/metrics/test_psnr.py b/tests/ignite/metrics/test_psnr.py
index e1abf0f1fff7..f39fbd46e763 100644
--- a/tests/ignite/metrics/test_psnr.py
+++ b/tests/ignite/metrics/test_psnr.py
@@ -239,8 +239,9 @@ def _test_distrib_accumulator_device(device):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
-def test_distrib_cpu(distributed_context_single_node_gloo):
-    device = "cpu"
+def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
+
+    device = idist.device()
     _test_distrib_integration(device)
     _test_distrib_accumulator_device(device)

@@ -248,8 +249,9 @@ def test_distrib_cpu(distributed_context_single_node_gloo):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
-def test_distrib_gpu(local_rank, distributed_context_single_node_nccl):
-    device = f"cuda:{local_rank}"
+def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
+
+    device = idist.device()
     _test_distrib_integration(device)
     _test_distrib_accumulator_device(device)

@@ -257,8 +259,9 @@ def test_distrib_gpu(local_rank, distributed_context_single_node_nccl):
 @pytest.mark.multinode_distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
-def test_multinode_distrib_cpu(distributed_context_multi_node_gloo):
-    device = "cpu"
+def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
+
+    device = idist.device()
     _test_distrib_integration(device)
     _test_distrib_accumulator_device(device)

@@ -266,8 +269,9 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo):
 @pytest.mark.multinode_distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
-def test_multinode_distrib_gpu(distributed_context_multi_node_nccl):
-    device = f"cuda:{distributed_context_multi_node_nccl['local_rank']}"
+def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
+
+    device = idist.device()
     _test_distrib_integration(device)
     _test_distrib_accumulator_device(device)

@@ -276,6 +280,7 @@ def test_multinode_distrib_gpu(distributed_context_multi_node_nccl):
 @pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars")
 @pytest.mark.skipif(not idist.has_xla_support, reason="Skip if no PyTorch XLA package")
 def test_distrib_single_device_xla():
+
     device = idist.device()
     _test_distrib_integration(device)
     _test_distrib_accumulator_device(device)
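For orientation on the metric touched above: PSNR is 10 * log10(data_range^2 / MSE), computed per image and averaged over the batch, to my understanding of ignite's implementation. A small sanity check against that textbook definition (toy tensors, not repository code):

import torch
from ignite.metrics import PSNR

y_pred = torch.tensor([[0.00, 0.25], [0.50, 0.75]])
y = torch.tensor([[0.00, 0.50], [0.50, 1.00]])

mse_per_image = torch.mean((y_pred - y) ** 2, dim=1)
expected = (10.0 * torch.log10(1.0 ** 2 / mse_per_image)).mean().item()

metric = PSNR(data_range=1.0)
metric.update((y_pred, y))
assert abs(metric.compute() - expected) < 1e-5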
diff --git a/tests/ignite/metrics/test_recall.py b/tests/ignite/metrics/test_recall.py
index fe9b14e93dc8..8f3cdb9c67e4 100644
--- a/tests/ignite/metrics/test_recall.py
+++ b/tests/ignite/metrics/test_recall.py
@@ -519,8 +519,9 @@ def _test(average, metric_device):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
-def test_distrib_gpu(local_rank, distributed_context_single_node_nccl):
-    device = torch.device(f"cuda:{local_rank}")
+def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
+
+    device = idist.device()
     _test_distrib_integration_multiclass(device)
     _test_distrib_integration_multilabel(device)
     _test_distrib_accumulator_device(device)

@@ -529,8 +530,9 @@ def test_distrib_gpu(local_rank, distributed_context_single_node_nccl):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
-def test_distrib_cpu(distributed_context_single_node_gloo):
-    device = torch.device("cpu")
+def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
+
+    device = idist.device()
     _test_distrib_integration_multiclass(device)
     _test_distrib_integration_multilabel(device)
     _test_distrib_accumulator_device(device)

@@ -554,8 +556,9 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.multinode_distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
-def test_multinode_distrib_cpu(distributed_context_multi_node_gloo):
-    device = torch.device("cpu")
+def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
+
+    device = idist.device()
     _test_distrib_integration_multiclass(device)
     _test_distrib_integration_multilabel(device)
     _test_distrib_accumulator_device(device)

@@ -565,8 +568,9 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo):
 @pytest.mark.multinode_distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
-def test_multinode_distrib_gpu(distributed_context_multi_node_nccl):
-    device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}")
+def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
+
+    device = idist.device()
     _test_distrib_integration_multiclass(device)
     _test_distrib_integration_multilabel(device)
     _test_distrib_accumulator_device(device)
diff --git a/tests/ignite/metrics/test_root_mean_squared_error.py b/tests/ignite/metrics/test_root_mean_squared_error.py
index e5c66616456f..7c0ccee4d60b 100644
--- a/tests/ignite/metrics/test_root_mean_squared_error.py
+++ b/tests/ignite/metrics/test_root_mean_squared_error.py
@@ -103,17 +103,17 @@ def _test(metric_device):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
-def test_distrib_gpu(local_rank, distributed_context_single_node_nccl):
+def test_distrib_nccl_gpu(distributed_context_single_node_nccl):

-    device = torch.device(f"cuda:{local_rank}")
+    device = idist.device()
     _test_distrib_integration(device)


 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
-def test_distrib_cpu(local_rank, distributed_context_single_node_gloo):
+def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):

-    device = torch.device("cpu")
+    device = idist.device()
     _test_distrib_integration(device)

@@ -131,16 +131,18 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.multinode_distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
-def test_multinode_distrib_cpu(distributed_context_multi_node_gloo):
-    device = torch.device("cpu")
+def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
+
+    device = idist.device()
     _test_distrib_integration(device)


 @pytest.mark.multinode_distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
-def test_multinode_distrib_gpu(distributed_context_multi_node_nccl):
-    device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}")
+def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
+
+    device = idist.device()
     _test_distrib_integration(device)
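Worth recalling why the RMSE file needs no separate expected values: RootMeanSquaredError is simply the square root of the MSE accumulation, e.g.:

import torch
from ignite.metrics import MeanSquaredError, RootMeanSquaredError

y_pred, y = torch.rand(8, 3), torch.rand(8, 3)
mse, rmse = MeanSquaredError(), RootMeanSquaredError()
mse.update((y_pred, y))
rmse.update((y_pred, y))
assert abs(rmse.compute() - mse.compute() ** 0.5) < 1e-6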
reason="Skip if no native dist support") -def test_distrib_cpu(local_rank, distributed_context_single_node_gloo): +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = torch.device("cpu") + device = idist.device() _test_distrib_integration(device) @@ -131,16 +131,18 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): + + device = idist.device() _test_distrib_integration(device) @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): + + device = idist.device() _test_distrib_integration(device) diff --git a/tests/ignite/metrics/test_running_average.py b/tests/ignite/metrics/test_running_average.py index 2e06ad589ecf..857e929c8509 100644 --- a/tests/ignite/metrics/test_running_average.py +++ b/tests/ignite/metrics/test_running_average.py @@ -392,9 +392,9 @@ def _test_distrib_accumulator_device(device): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = torch.device(f"cuda:{local_rank}") + device = idist.device() _test_distrib_on_output(device) _test_distrib_on_metric(device) _test_distrib_accumulator_device(device) @@ -402,9 +402,9 @@ def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = torch.device("cpu") + device = idist.device() _test_distrib_on_output(device) _test_distrib_on_metric(device) _test_distrib_accumulator_device(device) @@ -426,8 +426,9 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): + + device = idist.device() _test_distrib_on_output(device) _test_distrib_on_metric(device) _test_distrib_accumulator_device(device) @@ -436,8 +437,9 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") 
@pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): + + device = idist.device() _test_distrib_on_output(device) _test_distrib_on_metric(device) _test_distrib_accumulator_device(device) diff --git a/tests/ignite/metrics/test_ssim.py b/tests/ignite/metrics/test_ssim.py index d612bd521920..ae62768eda45 100644 --- a/tests/ignite/metrics/test_ssim.py +++ b/tests/ignite/metrics/test_ssim.py @@ -193,17 +193,18 @@ def _test_distrib_accumulator_device(device): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = f"cuda:{local_rank}" + device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): - device = "cpu" +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): + + device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @@ -211,8 +212,9 @@ def test_distrib_cpu(distributed_context_single_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = "cpu" +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): + + device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @@ -220,8 +222,9 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = f"cuda:{distributed_context_multi_node_nccl['local_rank']}" +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): + + device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) diff --git a/tests/ignite/metrics/test_top_k_categorical_accuracy.py b/tests/ignite/metrics/test_top_k_categorical_accuracy.py index e2e4bb39ad85..2282248a96a1 100644 --- a/tests/ignite/metrics/test_top_k_categorical_accuracy.py +++ b/tests/ignite/metrics/test_top_k_categorical_accuracy.py @@ -128,16 +128,18 @@ def _test_distrib_accumulator_device(device): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): - device = torch.device(f"cuda:{local_rank}") +def 
diff --git a/tests/ignite/metrics/test_top_k_categorical_accuracy.py b/tests/ignite/metrics/test_top_k_categorical_accuracy.py
index e2e4bb39ad85..2282248a96a1 100644
--- a/tests/ignite/metrics/test_top_k_categorical_accuracy.py
+++ b/tests/ignite/metrics/test_top_k_categorical_accuracy.py
@@ -128,16 +128,18 @@ def _test_distrib_accumulator_device(device):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
-def test_distrib_gpu(local_rank, distributed_context_single_node_nccl):
-    device = torch.device(f"cuda:{local_rank}")
+def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
+
+    device = idist.device()
     _test_distrib_integration(device)
     _test_distrib_accumulator_device(device)


 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
-def test_distrib_cpu(local_rank, distributed_context_single_node_gloo):
-    device = torch.device("cpu")
+def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
+
+    device = idist.device()
     _test_distrib_integration(device)
     _test_distrib_accumulator_device(device)

@@ -157,8 +159,9 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.multinode_distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
-def test_multinode_distrib_cpu(distributed_context_multi_node_gloo):
-    device = torch.device("cpu")
+def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
+
+    device = idist.device()
     _test_distrib_integration(device)
     _test_distrib_accumulator_device(device)

@@ -166,8 +169,9 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo):
 @pytest.mark.multinode_distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
-def test_multinode_distrib_gpu(distributed_context_multi_node_nccl):
-    device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}")
+def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
+
+    device = idist.device()
     _test_distrib_integration(device)
     _test_distrib_accumulator_device(device)

@@ -176,12 +180,14 @@ def test_multinode_distrib_gpu(distributed_context_multi_node_nccl):
 @pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars")
 @pytest.mark.skipif(not idist.has_xla_support, reason="Skip if no PyTorch XLA package")
 def test_distrib_single_device_xla():
+
     device = idist.device()
     _test_distrib_integration(device)
     _test_distrib_accumulator_device(device)


 def _test_distrib_xla_nprocs(index):
+
     device = idist.device()
     _test_distrib_integration(device)
     _test_distrib_accumulator_device(device)
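Finally, the _test_distrib_xla_nprocs(index) signature above is shaped for torch_xla's multiprocessing entry point. A sketch of the kind of driver that runs it, assuming NUM_TPU_WORKERS is set as the skipif implies (xmp.spawn is the real torch_xla API; the test body here is elided):

import os

import torch_xla.distributed.xla_multiprocessing as xmp

def _test_distrib_xla_nprocs(index):
    # body as in the diff: device = idist.device(), then the distrib checks
    pass

if __name__ == "__main__":
    nprocs = int(os.environ["NUM_TPU_WORKERS"])
    # Runs the function once per XLA process, passing each process its index.
    xmp.spawn(_test_distrib_xla_nprocs, args=(), nprocs=nprocs)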