In [None]:
# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.

# @noautodeps
# pyre-ignore-all-errors
import logging
import os
import os
import torch
import torch.distributed as dist
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim

from monarch.tools import commands
from monarch.actor import Actor, current_rank, endpoint
from monarch.actor import Actor, current_rank, endpoint
from monarch.utils import setup_env_for_distributed
from torch.nn.parallel import DistributedDataParallel as DDP
from slurm.utils import get_appdef, get_server_info, create_proc_mesh


# Root logger configuration for this notebook cell. ``force=True`` makes
# basicConfig drop any handlers already attached to the root logger, so
# re-running the cell re-applies this format instead of being a no-op.
_LOGGING_OPTIONS = {
    "level": logging.INFO,
    "format": "%(name)s %(asctime)s %(levelname)s %(message)s",
    "datefmt": "%Y-%m-%d %H:%M:%S",
    "force": True,
}
logging.basicConfig(**_LOGGING_OPTIONS)


# Module-level logger, named after this module.
logger: logging.Logger = logging.getLogger(__name__)


class ToyModel(nn.Module):
    """Tiny two-layer network used as the DDP demo model.

    Maps 10 input features to 5 outputs through a 10-unit hidden layer
    followed by a ReLU activation.
    """

    def __init__(self):
        super().__init__()
        # Submodules are created in the order net1 -> relu -> net2 so that
        # parameter registration (and random initialisation order) is stable.
        self.net1 = nn.Linear(in_features=10, out_features=10)
        self.relu = nn.ReLU()
        self.net2 = nn.Linear(in_features=10, out_features=5)

    def forward(self, x):
        """Apply ``net1``, then ReLU, then ``net2`` to ``x``."""
        hidden = self.net1(x)
        activated = self.relu(hidden)
        return self.net2(activated)


class DDPActor(Actor):
    """Actor exposing the steps of PyTorch's basic DDP example as endpoints.

    The instance records its rank from ``current_rank()`` when constructed and
    offers three endpoints:

    * ``setup`` - create the torch.distributed process group,
    * ``demo_basic`` - run one forward/backward/optimizer step on a
      DDP-wrapped ``ToyModel``,
    * ``cleanup`` - destroy the process group.

    Adapted from: https://docs.pytorch.org/tutorials/intermediate/ddp_tutorial.html#basic-use-case
    """

    def __init__(self):
        # Used both as the process-group rank and as the prefix of every
        # message printed by this actor.
        self.rank = current_rank().rank

    def _rprint(self, msg):
        """Print ``msg`` prefixed with this actor's rank."""
        print(f"{self.rank=} {msg}")

    @endpoint
    async def setup(self):
        """Create the process group using the ``gloo`` backend.

        The world size is read from the ``WORLD_SIZE`` environment variable;
        a ``KeyError`` is raised if it is not set.
        """
        self._rprint("Initializing torch distributed")

        dist.init_process_group(
            backend="gloo",
            rank=self.rank,
            world_size=int(os.environ["WORLD_SIZE"]),
        )
        self._rprint("Finished initializing torch distributed")

    @endpoint
    async def cleanup(self):
        """Destroy the process group created by ``setup``."""
        self._rprint("Cleaning up torch distributed")
        dist.destroy_process_group()

    @endpoint
    async def demo_basic(self):
        """Run a single DDP training step on random data.

        The device index is read from the ``LOCAL_RANK`` environment
        variable; a ``KeyError`` is raised if it is not set.
        """
        self._rprint("Running basic DDP example")

        # Place the model on the device whose index is LOCAL_RANK and wrap it
        # in DDP bound to that same device.
        local_rank = int(os.environ["LOCAL_RANK"])
        self._rprint(f"{local_rank=}")
        ddp_model = DDP(ToyModel().to(local_rank), device_ids=[local_rank])

        criterion = nn.MSELoss()
        sgd = optim.SGD(ddp_model.parameters(), lr=0.001)

        # One optimisation step: 20 random samples against 20 random targets.
        sgd.zero_grad()
        predictions = ddp_model(torch.randn(20, 10))
        targets = torch.randn(20, 5).to(local_rank)
        criterion(predictions, targets).backward()
        sgd.step()

        self._rprint("Finished running basic DDP example")


async def main():
    """Run the DDP example end to end on a two-host SLURM allocation.

    Obtains the app definition and server info, builds a proc mesh on it,
    spawns a ``DDPActor``, sets up the distributed environment, then calls
    the actor's ``setup``, ``demo_basic`` and ``cleanup`` endpoints in order.
    The SLURM server is killed on the way out whether or not the run
    succeeded.
    """
    host_count = 2
    app_definition = await get_appdef(host_count)
    server = await get_server_info(app_definition)

    try:
        mesh = await create_proc_mesh(host_count, app_definition, server)
        actor = await mesh.spawn("ddp_actor", DDPActor)

        await setup_env_for_distributed(mesh)

        # Endpoints are invoked strictly in this order, one at a time.
        for step_name in ("setup", "demo_basic", "cleanup"):
            await getattr(actor, step_name).call()

        print("DDP example completed successfully!")
    finally:
        # Always tear the job down, even if any step above raised.
        commands.kill(f"slurm:///{server.name}")


# Entry point for the notebook cell. The bare top-level ``await`` relies on
# the cell being executed by a shell that supports it (e.g. Jupyter/IPython);
# as a plain ``.py`` script this line would be a syntax error.
if __name__ == "__main__":
    await main()

torchx.schedulers.slurm_scheduler 2025-08-29 20:40:34 INFO unable to get job info for `monarch-ubuntu` with `squeue` (squeue: error: Invalid job id: monarch-ubuntu
), trying `sacct`
torchx.schedulers.slurm_scheduler 2025-08-29 20:40:34 INFO unable to get job info for `monarch-ubuntu` with `sacct` (sacct: fatal: Bad job/step specified: monarch-ubuntu
)
monarch.tools.commands 2025-08-29 20:40:34 INFO no existing RUNNING server `slurm:///monarch-ubuntu` creating new one...
torchx.runner.api 2025-08-29 20:40:34 INFO Tracker configurations: {}
torchx.runner.api 2025-08-29 20:40:34 INFO Checking for changes in workspace `/home/ubuntu/.monarch/out/tmp3m4zzjg6/workspace`...
torchx.runner.api 2025-08-29 20:40:34 INFO To disable workspaces pass: --workspace="" from CLI or workspace=None programmatically.
torchx.runner.api 2025-08-29 20:40:34 INFO Reusing original image `monarch_default_workspace:latest` for role[0]=mesh0. Either a patch was built or no changes to workspace was detected.
monarch.

Ahmad: {'requeue': None, 'ntasks-per-node': '1', 'cpus-per-task': '48', 'mem': '186777', 'gpus-per-task': '4', 'ntasks': '1'}
Ahmad: {'requeue': None, 'ntasks-per-node': '1', 'cpus-per-task': '48', 'mem': '186777', 'gpus-per-task': '4', 'ntasks': '1'}
Waiting for slurm:///410 to be RUNNING (current: PENDING); will check again in 5.0 seconds. Total wait time: 0:00:00.015800

[-]E0829 20:40:34.996334  8536 hyperactor/src/channel/net.rs:695] error_msg:session tcp:10.0.2.236:26600.5117454862225131082: failed to deliver message within timeout
[-]E0829 20:40:35.341902  8536 hyperactor/src/channel/net.rs:708] error_msg:session tcp:10.0.2.132:26600.8381289842876906331: failed to receive ack within timeout 30 secs; link is currently broken


Waiting for slurm:///410 to be RUNNING (current: PENDING); will check again in 5.0 seconds. Total wait time: 0:00:10.059201

slurm.utils 2025-08-29 20:40:49 INFO 
===== Server Info =====
{
  "name": "410",
  "server_handle": "slurm:///410",
  "state": "RUNNING",
  "meshes": {
    "mesh0": {
      "host_type": "__UNSET__",
      "hosts": 2,
      "gpus": -1,
      "hostnames": [
        "gpu-queue-st-gpu-compute-1",
        "gpu-queue-st-gpu-compute-2"
      ]
    }
  }
}
monarch._src.actor.allocator 2025-08-29 20:40:49 INFO no match label `procmesh.monarch.meta.com/name` specified in alloc constraints
monarch._src.actor.allocator 2025-08-29 20:40:49 INFO found a single proc mesh `mesh0` in slurm:///410, will allocate on it
monarch.tools.network 2025-08-29 20:40:49 INFO no AF_INET6 address that can bind TCP sockets for `gpu-queue-st-gpu-compute-1:26600` (error: [Errno -5] No address associated with hostname)
monarch.tools.network 2025-08-29 20:40:49 INFO resolved AF_INET address `10.0.2.236:26600` for `gpu-queue-st-gpu-compute-1:26600`
monarch.tools.network 2025-08-29 20:40:49 INFO no AF_INET6 address that ca

New job `slurm:///410` is ready to serve.
[36m>>> Aggregated Logs (2025-08-29 20:40:55) >>>[0m
[33m[8 similar log lines][0m self.rank=7 Initializing torch distributed
[33m[8 similar log lines][0m [Gloo] Rank 0 is connected to 7 peer ranks. Expected number of connected peer ranks is : 7
[33m[8 similar log lines][0m self.rank=0 Finished initializing torch distributed
[33m[8 similar log lines][0m self.rank=0 Running basic DDP example
[33m[8 similar log lines][0m self.rank=5 local_rank=1
[36m<<< Aggregated Logs (2025-08-29 20:40:58) <<<[0m

DDP example completed successfully!
[36m>>> Aggregated Logs (2025-08-29 20:40:58) >>>[0m
[33m[8 similar log lines][0m self.rank=6 Finished running basic DDP example
[33m[8 similar log lines][0m self.rank=0 Cleaning up torch distributed
[36m<<< Aggregated Logs (2025-08-29 20:41:01) <<<[0m



[-]E0829 20:41:30.158934  8536 hyperactor/src/channel/net.rs:695] error_msg:session tcp:10.0.2.132:26600.11111315873644166091: failed to deliver message within timeout
[-]E0829 20:41:30.774458  8536 hyperactor/src/channel/net.rs:695] error_msg:session tcp:10.0.2.236:26600.6097672994633804723: failed to deliver message within timeout
[-]E0829 20:41:34.705394  8536 hyperactor/src/channel/net.rs:695] error_msg:session tcp:10.0.2.236:38955.9004778724387042266: failed to deliver message within timeout
