27 changes: 15 additions & 12 deletions tests/ignite/conftest.py
@@ -17,18 +17,21 @@ def dirname():


@pytest.fixture()
-def fixed_dirname(worker_id):
-    # multi-proc friendly fixed tmp dirname
-    path = "/tmp/fixed_tmp_dirname"
-    lrank = int(worker_id.replace("gw", "")) if "gw" in worker_id else 0
-    time.sleep(0.5 * lrank)
-    os.makedirs(path, exist_ok=True)
-    yield path
-    time.sleep(0.5 * lrank)
-    if os.path.exists(path):
-        shutil.rmtree(path)
-    # sort of sync
-    time.sleep(1.0)
+def get_fixed_dirname(worker_id):
+    def getter(name="test"):
+        # multi-proc friendly fixed tmp dirname
+        path = f"/tmp/fixed_tmp_dirname_{name}"
+        lrank = int(worker_id.replace("gw", "")) if "gw" in worker_id else 0
+        time.sleep(0.5 * lrank)
+        os.makedirs(path, exist_ok=True)
+        yield path
+        time.sleep(1.0 * lrank + 1.0)
+        if os.path.exists(path):
+            shutil.rmtree(path)
+        # sort of sync
+        time.sleep(1.0)
+
+    return getter


@pytest.fixture()
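For context, a minimal sketch of how a test consumes the new factory fixture, mirroring the call sites updated below (the test name is hypothetical and not part of this PR; get_fixed_dirname is the conftest.py fixture above and worker_id is supplied by pytest-xdist):

    # Hypothetical test, for illustration only.
    def test_uses_shared_dir(get_fixed_dirname):
        # Passing a unique name requests a per-test /tmp/fixed_tmp_dirname_<name>
        # directory that every xdist worker resolves identically, so it can back
        # a file:// rendezvous shared by all ranks.
        init_method = f"file://{get_fixed_dirname('uses_shared_dir')}/shared"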
12 changes: 6 additions & 6 deletions tests/ignite/distributed/comp_models/test_native.py
@@ -256,9 +256,9 @@ def test__native_dist_model_create_no_dist_nccl(clean_env):

@pytest.mark.distributed
@pytest.mark.parametrize("init_method", [None, "tcp://0.0.0.0:22334", "FILE"])
-def test__native_dist_model_create_dist_gloo_1(init_method, fixed_dirname, local_rank, world_size):
+def test__native_dist_model_create_dist_gloo_1(init_method, get_fixed_dirname, local_rank, world_size):
if init_method == "FILE":
init_method = f"file://{fixed_dirname}/shared"
init_method = f"file://{get_fixed_dirname('native_dist_model_create_dist_gloo_1')}/shared"

_test__native_dist_model_create_from_backend_dist(init_method, local_rank, local_rank, world_size, "gloo", "cpu")

@@ -271,9 +271,9 @@ def test__native_dist_model_create_dist_gloo_2(local_rank, world_size):
@pytest.mark.distributed
@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
@pytest.mark.parametrize("init_method", [None, "tcp://0.0.0.0:22334", "FILE"])
-def test__native_dist_model_create_dist_nccl_1(init_method, fixed_dirname, local_rank, world_size):
+def test__native_dist_model_create_dist_nccl_1(init_method, get_fixed_dirname, local_rank, world_size):
if init_method == "FILE":
init_method = f"file://{fixed_dirname}/shared"
init_method = f"file://{get_fixed_dirname('native_dist_model_create_dist_nccl_1')}/shared"

_test__native_dist_model_create_from_backend_dist(
init_method, local_rank, local_rank, world_size, "nccl", f"cuda:{local_rank}"
@@ -373,8 +373,8 @@ def test__native_dist_model_init_method_is_none(world_size):
@pytest.mark.distributed
@pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
@pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support")
-def test__native_dist_model_init_method_is_not_none(world_size, local_rank, fixed_dirname):
-    init_method = f"file://{fixed_dirname}/shared"
+def test__native_dist_model_init_method_is_not_none(world_size, local_rank, get_fixed_dirname):
+    init_method = f"file://{get_fixed_dirname('native_dist_model_init_method_is_not_none')}/shared"
with pytest.raises(ValueError, match=r"Both rank and world_size should be provided"):
_NativeDistModel.create_from_backend(backend="gloo", world_size=world_size, init_method=init_method)

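For readers less familiar with file-store rendezvous: the file:// init_method is ultimately handed to torch.distributed process-group initialization, which is why every rank must compute exactly the same path. A minimal standalone sketch using plain torch.distributed rather than ignite's _NativeDistModel wrapper (single-process rank/world_size values and a hypothetical path, for illustration only):

    import torch.distributed as dist

    # All participating processes must pass the same file:// URL; the shared file
    # acts as the rendezvous store for rank discovery.
    dist.init_process_group(
        backend="gloo",
        init_method="file:///tmp/fixed_tmp_dirname_example/shared",
        rank=0,
        world_size=1,
    )
    dist.destroy_process_group()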
4 changes: 2 additions & 2 deletions tests/ignite/distributed/test_launcher.py
@@ -217,9 +217,9 @@ def test_idist_parallel_spawn_n_procs_native(init_method, backend, dirname):
"backend",
["gloo", pytest.param("nccl", marks=pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU"))],
)
-def test_idist_parallel_n_procs_native(init_method, backend, fixed_dirname, local_rank, world_size):
+def test_idist_parallel_n_procs_native(init_method, backend, get_fixed_dirname, local_rank, world_size):
if init_method == "FILE":
init_method = f"file://{fixed_dirname}/shared"
init_method = f"file://{get_fixed_dirname('idist_parallel_n_procs_native')}/shared"

os.environ["RANK"] = str(local_rank)
device = "cuda" if "nccl" in backend else "cpu"
8 changes: 4 additions & 4 deletions tests/ignite/distributed/utils/test_native.py
@@ -33,14 +33,14 @@ def _test_native_distrib_single_node_launch_tool(backend, device, local_rank, wo
@pytest.mark.distributed
@pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support")
@pytest.mark.parametrize("init_method", [None, "tcp://0.0.0.0:22334", "FILE"])
-def test_native_distrib_single_node_launch_tool_gloo(init_method, fixed_dirname, local_rank, world_size):
+def test_native_distrib_single_node_launch_tool_gloo(init_method, get_fixed_dirname, local_rank, world_size):

from datetime import timedelta

timeout = timedelta(seconds=20)

if init_method == "FILE":
init_method = f"file://{fixed_dirname}/shared"
init_method = f"file://{get_fixed_dirname('native_distrib_single_node_launch_tool_gloo')}/shared"

_test_native_distrib_single_node_launch_tool(
"gloo", "cpu", local_rank, world_size, timeout=timeout, init_method=init_method
@@ -51,10 +51,10 @@ def test_native_distrib_single_node_launch_tool_gloo(init_method, fixed_dirname,
@pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support")
@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
@pytest.mark.parametrize("init_method", [None, "tcp://0.0.0.0:22334", "FILE"])
-def test_native_distrib_single_node_launch_tool_nccl(init_method, fixed_dirname, local_rank, world_size):
+def test_native_distrib_single_node_launch_tool_nccl(init_method, get_fixed_dirname, local_rank, world_size):

if init_method == "FILE":
init_method = f"file://{fixed_dirname}/shared"
init_method = f"file://{get_fixed_dirname('native_distrib_single_node_launch_tool_nccl')}/shared"

_test_native_distrib_single_node_launch_tool("nccl", "cuda", local_rank, world_size, init_method=init_method)
