Commit e4ead3a

Adding torch accelerator to ddp-tutorial-series example
Signed-off-by: dggaytan <diana.gaytan.munoz@intel.com>
1 parent d47f0f3 commit e4ead3a

File tree: 8 files changed (+87 −23 lines)
distributed/ddp-tutorial-series/README.md
Lines changed: 24 additions & 4 deletions

@@ -15,7 +15,27 @@ Each code file extends upon the previous one. The series starts with a non-distr
 * [slurm/setup_pcluster_slurm.md](slurm/setup_pcluster_slurm.md): instructions to set up an AWS cluster
 * [slurm/config.yaml.template](slurm/config.yaml.template): configuration to set up an AWS cluster
 * [slurm/sbatch_run.sh](slurm/sbatch_run.sh): slurm script to launch the training job
-
-
-
-
+## Installation
+```
+pip install -r requirements.txt
+```
+## Running Examples
+To run the examples for 20 epochs and save a checkpoint every 5 epochs, use the following commands:
+### Single GPU
+```
+python single_gpu.py 20 5
+```
+### Multi-GPU
+```
+python multigpu.py 20 5
+```
+### Multi-GPU Torchrun
+```
+torchrun --nnodes=1 --nproc_per_node=4 multigpu_torchrun.py 20 5
+```
+### Multi-Node
+```
+torchrun --nnodes=2 --nproc_per_node=4 multinode.py 20 5
+```
+
+For more details, check the [run_examples.sh](distributed/ddp-tutorial-series/run_examples.sh) script.
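A quick way to check which of the commands added above apply on a given machine is to inspect the accelerator state with the same torch.accelerator calls the updated examples rely on. This is a small illustrative sketch, not part of the commit:

```
import torch

# Report whether an accelerator backend (cuda, xpu, mps, ...) is visible and how
# many devices it exposes; the multi-GPU examples expect at least two devices.
if torch.accelerator.is_available():
    acc = torch.accelerator.current_accelerator()
    print(f"Accelerator: {acc}, device count: {torch.accelerator.device_count()}")
else:
    print("No accelerator detected")
```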

distributed/ddp-tutorial-series/multigpu.py
Lines changed: 11 additions & 5 deletions

@@ -18,8 +18,14 @@ def ddp_setup(rank, world_size):
     """
     os.environ["MASTER_ADDR"] = "localhost"
     os.environ["MASTER_PORT"] = "12355"
-    torch.cuda.set_device(rank)
-    init_process_group(backend="nccl", rank=rank, world_size=world_size)
+
+    if torch.accelerator.is_available():
+        device = torch.device(f"{torch.accelerator.current_accelerator()}:{rank}")
+        torch.accelerator.set_device_index(rank)
+        print(f"Running on rank {rank} on device {device}")
+
+    backend = torch.distributed.get_default_backend_for_device(device)
+    init_process_group(backend=backend, rank=rank, world_size=world_size)
 
 class Trainer:
     def __init__(

@@ -95,10 +101,10 @@ def main(rank: int, world_size: int, save_every: int, total_epochs: int, batch_s
 if __name__ == "__main__":
     import argparse
     parser = argparse.ArgumentParser(description='simple distributed training job')
-    parser.add_argument('total_epochs', type=int, help='Total epochs to train the model')
-    parser.add_argument('save_every', type=int, help='How often to save a snapshot')
+    parser.add_argument('total_epochs', default=50, type=int, help='Total epochs to train the model')
+    parser.add_argument('save_every', default=5, type=int, help='How often to save a snapshot')
     parser.add_argument('--batch_size', default=32, type=int, help='Input batch size on each device (default: 32)')
     args = parser.parse_args()
 
-    world_size = torch.cuda.device_count()
+    world_size = torch.accelerator.device_count()
     mp.spawn(main, args=(world_size, args.save_every, args.total_epochs, args.batch_size), nprocs=world_size)
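Note that the new ddp_setup above only assigns `device` inside the `torch.accelerator.is_available()` branch. A minimal standalone sketch of the same setup with an explicit CPU fallback (the fallback branch is an assumption for illustration, not something the commit adds) could look like:

```
import os

import torch
from torch.distributed import init_process_group


def ddp_setup(rank: int, world_size: int) -> None:
    """Initialize one DDP worker, falling back to CPU when no accelerator is present."""
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "12355"
    if torch.accelerator.is_available():
        # Bind this process to the rank-th device of the detected accelerator.
        device = torch.device(f"{torch.accelerator.current_accelerator()}:{rank}")
        torch.accelerator.set_device_index(rank)
    else:
        # No accelerator: keep `device` defined so backend selection still works.
        device = torch.device("cpu")
    # e.g. nccl for cuda devices, gloo for cpu.
    backend = torch.distributed.get_default_backend_for_device(device)
    init_process_group(backend=backend, rank=rank, world_size=world_size)
```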

distributed/ddp-tutorial-series/multigpu_torchrun.py
Lines changed: 15 additions & 5 deletions

@@ -11,8 +11,18 @@
 
 
 def ddp_setup():
-    torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
-    init_process_group(backend="nccl")
+    rank = int(os.environ["LOCAL_RANK"])
+    if torch.accelerator.is_available():
+        device = torch.device(f"{torch.accelerator.current_accelerator()}:{rank}")
+        torch.accelerator.set_device_index(rank)
+        print(f"Running on rank {rank} on device {device}")
+    else:
+        print(f"Multi-GPU environment not detected")
+
+    backend = torch.distributed.get_default_backend_for_device(rank)
+    torch.distributed.init_process_group(backend=backend, rank=rank, device_id=rank)
+
+
 
 class Trainer:
     def __init__(

@@ -37,7 +47,7 @@ def __init__(
         self.model = DDP(self.model, device_ids=[self.gpu_id])
 
     def _load_snapshot(self, snapshot_path):
-        loc = f"cuda:{self.gpu_id}"
+        loc = str(torch.accelerator.current_accelerator())
         snapshot = torch.load(snapshot_path, map_location=loc)
         self.model.load_state_dict(snapshot["MODEL_STATE"])
         self.epochs_run = snapshot["EPOCHS_RUN"]

@@ -103,8 +113,8 @@ def main(save_every: int, total_epochs: int, batch_size: int, snapshot_path: str
 if __name__ == "__main__":
     import argparse
     parser = argparse.ArgumentParser(description='simple distributed training job')
-    parser.add_argument('total_epochs', type=int, help='Total epochs to train the model')
-    parser.add_argument('save_every', type=int, help='How often to save a snapshot')
+    parser.add_argument('total_epochs', default=50, type=int, help='Total epochs to train the model')
+    parser.add_argument('save_every', default=5, type=int, help='How often to save a snapshot')
     parser.add_argument('--batch_size', default=32, type=int, help='Input batch size on each device (default: 32)')
     args = parser.parse_args()
 
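The `_load_snapshot` change above maps checkpoints onto the current accelerator. A hedged sketch of the matching save/resume pair (the function names and the CPU fallback are illustrative; the MODEL_STATE and EPOCHS_RUN keys come from the diff):

```
import torch
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP


def save_snapshot(model: nn.Module, epochs_run: int, path: str = "snapshot.pt") -> None:
    # Unwrap DDP (when wrapped) so the raw module weights are stored.
    state = model.module.state_dict() if isinstance(model, DDP) else model.state_dict()
    torch.save({"MODEL_STATE": state, "EPOCHS_RUN": epochs_run}, path)


def load_snapshot(model: nn.Module, path: str = "snapshot.pt") -> int:
    # Map tensors onto this process's accelerator, or onto CPU when none is available.
    loc = str(torch.accelerator.current_accelerator()) if torch.accelerator.is_available() else "cpu"
    snapshot = torch.load(path, map_location=loc)
    model.load_state_dict(snapshot["MODEL_STATE"])
    return snapshot["EPOCHS_RUN"]
```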

distributed/ddp-tutorial-series/multinode.py
Lines changed: 14 additions & 5 deletions

@@ -11,8 +11,17 @@
 
 
 def ddp_setup():
-    torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
-    init_process_group(backend="nccl")
+    rank = int(os.environ["LOCAL_RANK"])
+    if torch.accelerator.is_available():
+        device = torch.device(f"{torch.accelerator.current_accelerator()}:{rank}")
+        torch.accelerator.set_device_index(rank)
+        print(f"Running on rank {rank} on device {device}")
+    else:
+        print(f"Multi-GPU environment not detected")
+
+    backend = torch.distributed.get_default_backend_for_device(rank)
+    torch.distributed.init_process_group(backend=backend, rank=rank, device_id=rank)
+
 
 class Trainer:
     def __init__(

@@ -38,7 +47,7 @@ def __init__(
         self.model = DDP(self.model, device_ids=[self.local_rank])
 
     def _load_snapshot(self, snapshot_path):
-        loc = f"cuda:{self.local_rank}"
+        loc = str(torch.accelerator.current_accelerator())
         snapshot = torch.load(snapshot_path, map_location=loc)
         self.model.load_state_dict(snapshot["MODEL_STATE"])
         self.epochs_run = snapshot["EPOCHS_RUN"]

@@ -104,8 +113,8 @@ def main(save_every: int, total_epochs: int, batch_size: int, snapshot_path: str
 if __name__ == "__main__":
     import argparse
     parser = argparse.ArgumentParser(description='simple distributed training job')
-    parser.add_argument('total_epochs', type=int, help='Total epochs to train the model')
-    parser.add_argument('save_every', type=int, help='How often to save a snapshot')
+    parser.add_argument('total_epochs', default=50, type=int, help='Total epochs to train the model')
+    parser.add_argument('save_every', default=5, type=int, help='How often to save a snapshot')
     parser.add_argument('--batch_size', default=32, type=int, help='Input batch size on each device (default: 32)')
     args = parser.parse_args()
 
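For reference, the multinode Trainer wraps the model with `device_ids=[self.local_rank]`. A sketch of that placement step in accelerator-agnostic form (the helper name and the CPU branch are assumptions, not part of the commit):

```
import os

import torch
from torch.nn.parallel import DistributedDataParallel as DDP


def wrap_model(model: torch.nn.Module) -> DDP:
    # torchrun sets LOCAL_RANK for every worker it launches on a node.
    local_rank = int(os.environ["LOCAL_RANK"])
    if torch.accelerator.is_available():
        device = torch.device(f"{torch.accelerator.current_accelerator()}:{local_rank}")
        # One device per process: DDP only needs this worker's device id.
        return DDP(model.to(device), device_ids=[local_rank])
    # CPU path (e.g. gloo backend): DDP without device_ids operates on CPU tensors.
    return DDP(model)
```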

distributed/ddp-tutorial-series/requirements.txt
Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-torch>=1.11.0
+torch>=2.7
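The requirements bump tracks the torch.accelerator API the examples now call. A hypothetical guard, with the minimum version taken from the new requirements line:

```
import torch

# Fail fast when an older PyTorch without the torch.accelerator namespace is installed.
if not hasattr(torch, "accelerator"):
    raise RuntimeError(
        f"torch {torch.__version__} lacks torch.accelerator; these examples expect torch>=2.7"
    )
```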
distributed/ddp-tutorial-series/run_example.sh
Lines changed: 11 additions & 0 deletions

@@ -0,0 +1,11 @@
+# /bin/bash
+# bash run_example.sh {file_to_run.py} {num_gpus}
+# where file_to_run = example to run. Default = 'example.py'
+# num_gpus = num local gpus to use (must be at least 2). Default = 2
+
+# samples to run include:
+# multigpu_torchrun.py
+# multinode.py
+
+echo "Launching ${1:-example.py} with ${2:-2} gpus"
+torchrun --nnodes=1 --nproc_per_node=${2:-2} --rdzv_id=101 --rdzv_endpoint="localhost:5972" ${1:-example.py} 10 1

distributed/ddp-tutorial-series/single_gpu.py
Lines changed: 3 additions & 3 deletions

@@ -73,10 +73,10 @@ def main(device, total_epochs, save_every, batch_size):
 if __name__ == "__main__":
     import argparse
     parser = argparse.ArgumentParser(description='simple distributed training job')
-    parser.add_argument('total_epochs', type=int, help='Total epochs to train the model')
-    parser.add_argument('save_every', type=int, help='How often to save a snapshot')
+    parser.add_argument('total_epochs', default=50, type=int, help='Total epochs to train the model')
+    parser.add_argument('save_every', default=5, type=int, help='How often to save a snapshot')
     parser.add_argument('--batch_size', default=32, type=int, help='Input batch size on each device (default: 32)')
     args = parser.parse_args()
 
-    device = 0  # shorthand for cuda:0
+    device = 0
     main(device, args.total_epochs, args.save_every, args.batch_size)
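single_gpu.py keeps `device = 0` as a bare device index. An explicitly accelerator-agnostic pick, shown only as an illustrative variation on what the commit does:

```
import torch

# Use the first accelerator device when one is available, otherwise fall back to CPU.
if torch.accelerator.is_available():
    device = torch.device(f"{torch.accelerator.current_accelerator()}:0")
else:
    device = torch.device("cpu")
print(f"training would run on {device}")
```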

run_distributed_examples.sh
Lines changed: 8 additions & 0 deletions

@@ -50,12 +50,20 @@ function distributed_tensor_parallelism() {
     uv run bash run_example.sh fsdp_tp_example.py || error "2D parallel example failed"
 }
 
+function distributed_ddp-tutorial-series() {
+    uv python multigpu.py 10 1 || error "ddp tutorial series multigpu example failed"
+    uv run bash run_example.sh multigpu_torchrun.py || error "ddp tutorial series multigpu torchrun example failed"
+    uv run bash run_example.sh multinode.py || error "ddp tutorial series multinode example failed"
+    uv python single_gpu.py 10 1 || error "ddp tutorial series single gpu example failed"
+}
+
 function distributed_ddp() {
     uv run main.py || error "ddp example failed"
 }
 
 function run_all() {
     run distributed/tensor_parallelism
+    run distributed/ddp-tutorial-series
     run distributed/ddp
 }
 