In [2]:
# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.

# @noautodeps
# pyre-ignore-all-errors

import argparse
import asyncio
import getpass
import json
import logging
import os
import pathlib
import sys

import cloudpickle

sys.path.append("/home/ubuntu/ahmads/monarch/examples")
from compute_world_size_actor import TestActor

from monarch._rust_bindings.monarch_hyperactor.alloc import AllocConstraints, AllocSpec

# from monarch._src.actor.meta.allocator import MastAllocator, MastAllocatorConfig

from monarch.actor import ProcMesh
from monarch.tools import commands
from monarch.tools.components import hyperactor
from monarch.tools.config import Config, UnnamedAppDef
from monarch._src.actor.allocator import (
    RemoteAllocator,
    StaticRemoteAllocInitializer,
    TorchXRemoteAllocInitializer,
)


import math
import os

import torch
import torch.distributed as dist
import torch.nn.functional as F
from monarch.actor import Actor, current_rank, current_size, endpoint


# Identity and filesystem context of whoever launches this script.
USER = getpass.getuser()
HOME = pathlib.Path.home()
CWD = os.getcwd()
DEACTIVATE = None

# Forwarded as `force_restart` to commands.get_or_create() in get_server_info().
FORCE_RESTART = False

# force=True makes this configuration win even if the root logger already has
# handlers installed (e.g. when re-running inside an interactive session).
logging.basicConfig(
    force=True,
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(name)s %(asctime)s %(levelname)s %(message)s",
)

logger: logging.Logger = logging.getLogger(__name__)


async def get_appdef(num_hosts):
    """Build the app definition for a single host mesh named ``mesh0``.

    Args:
        num_hosts: Number of hosts to request for ``mesh0``.

    Returns:
        The value produced by ``hyperactor.host_mesh`` for the image and mesh
        requested here.
    """
    # The image is similar to a Docker image; it should contain a conda env in
    # the $img_root/conda/ directory. When config.workspace is not None, an
    # ephemeral fbpkg version is created that conda-packs the currently active
    # local conda env AND the directory specified by workspace.
    image_name = "monarch_default_workspace:latest"

    # Mesh spec format: mesh_name:num_hosts:host_type
    # TODO: For some reason gpu.medium doesn't work here
    mesh_spec = f"mesh0:{num_hosts}:aws_g5.12xlarge"

    return hyperactor.host_mesh(image=image_name, meshes=[mesh_spec])


async def get_server_info(appdef):
    """Return info for this user's server, creating the server if needed.

    The job is named ``monarch-<USER>``. Before the config is built, every
    role in ``appdef`` has its ``resource.memMB`` overwritten in place.

    Args:
        appdef: App definition to launch; mutated as described above.

    Returns:
        The awaited result of ``commands.get_or_create`` for the job.
    """
    # TODO: Register this so we don't have to do this every time
    for app_role in appdef.roles:
        app_role.resource.memMB = 186777

    # NOTE: replace with your own values
    # NOTE(review): these keys do not look slurm-specific even though the
    # scheduler below is "slurm" -- confirm whether they are still needed.
    scheduler_options = {
        "hpcIdentity": "pytorch_distributed",
        "hpcJobOncall": "monarch",
        "hpcClusterUuid": "MastProdCluster",
        "rmAttribution": "pytorch4all_clients_approved",
    }

    launch_config = Config(
        scheduler="slurm",
        scheduler_args=scheduler_options,
        appdef=appdef,
        workspace=str(CWD),  # or None to disable building ephemeral,
    )

    # Debugging aid: to inspect the request without launching, set
    # `launch_config.dryrun = True`, print `commands.create(launch_config)`,
    # and exit.

    return await commands.get_or_create(
        f"monarch-{USER}",
        launch_config,
        force_restart=FORCE_RESTART,
    )


async def create_proc_mesh(num_hosts, appdef, server_info):
    """Allocate processes on the running server and wrap them in a ProcMesh.

    Args:
        num_hosts: Number of hosts to request in the allocation.
        appdef: App definition; the per-host GPU count is read from the
            resource of its first role.
        server_info: Description of the running server. Its JSON form is
            logged and its ``server_handle`` is given to the allocator's
            initializer.

    Returns:
        The ``ProcMesh`` created from the allocation.
    """
    # TODO: why is gpus equal to -1 in server_info?
    # Presumably for that reason the GPU count comes from the appdef rather
    # than from the server's mesh spec.
    gpus_per_host = appdef.roles[0].resource.gpu

    logger.info(
        "\n===== Server Info =====\n%s",
        json.dumps(server_info.to_json(), indent=2),
    )

    allocator = RemoteAllocator(
        world_id="foo",
        initializer=TorchXRemoteAllocInitializer(server_info.server_handle),
    )
    spec = AllocSpec(AllocConstraints(), hosts=num_hosts, gpus=gpus_per_host)
    alloc = await allocator.allocate(spec)

    return await ProcMesh.from_alloc(alloc)


async def main():
    """Launch (or reuse) a 2-host server, run the world-size actor, then kill it.

    Steps: build the app definition, get or create the server, allocate a
    proc mesh on it, spawn ``TestActor``, call its ``compute_world_size``
    endpoint, and log the per-rank results. The server is killed in a
    ``finally`` block, so cleanup happens even if any step after server
    creation raises.
    """
    num_hosts = 2
    appdef = await get_appdef(num_hosts)
    server_info = await get_server_info(appdef)

    try:
        proc_mesh = await create_proc_mesh(num_hosts, appdef, server_info)
        actor = await proc_mesh.spawn("compute_world_size_actor", TestActor)

        logger.info("computing world size...")
        # this is redundant but is here for example sake
        mesh_name = server_info.get_mesh_spec("mesh0").name
        values = await actor.compute_world_size.call(
            master_addr=server_info.host0(mesh_name),
            master_port=29500,
        )

        # Key each returned value by the rank of the point it came from.
        values_by_rank = {f"rank_{p.rank}": v for p, v in values.flatten("rank")}

        logger.info(
            f"""computed world_sizes:
    {'-'*40}
    {json.dumps(values_by_rank, indent=2)}
    {'-'*40}"""
        )
    finally:
        # Use the handle the server reports about itself (the same one
        # create_proc_mesh passes to the allocator) instead of re-assembling
        # it by hand with a hard-coded scheduler name.
        commands.kill(server_info.server_handle)


if __name__ == "__main__":
    cloudpickle.register_pickle_by_value(sys.modules[TestActor.__module__])

    # asyncio.run(main())
    await main()

torchx.schedulers.slurm_scheduler 2025-08-28 22:04:57 INFO unable to get job info for `monarch-ubuntu` with `squeue` (squeue: error: Invalid job id: monarch-ubuntu
), trying `sacct`
torchx.schedulers.slurm_scheduler 2025-08-28 22:04:57 INFO unable to get job info for `monarch-ubuntu` with `sacct` (sacct: fatal: Bad job/step specified: monarch-ubuntu
)
monarch.tools.commands 2025-08-28 22:04:57 INFO no existing RUNNING server `slurm:///monarch-ubuntu` creating new one...
torchx.runner.api 2025-08-28 22:04:57 INFO Tracker configurations: {}
torchx.runner.api 2025-08-28 22:04:57 INFO Checking for changes in workspace `/home/ubuntu/ahmads/monarch/examples`...
torchx.runner.api 2025-08-28 22:04:57 INFO To disable workspaces pass: --workspace="" from CLI or workspace=None programmatically.
torchx.runner.api 2025-08-28 22:04:57 INFO Reusing original image `monarch_default_workspace:latest` for role[0]=mesh0. Either a patch was built or no changes to workspace was detected.
monarch.tools.comma

Ahmad: {'requeue': None, 'ntasks-per-node': '1', 'cpus-per-task': '48', 'mem': '186777', 'gpus-per-task': '4', 'ntasks': '1'}
Ahmad: {'requeue': None, 'ntasks-per-node': '1', 'cpus-per-task': '48', 'mem': '186777', 'gpus-per-task': '4', 'ntasks': '1'}
Waiting for slurm:///336 to be RUNNING (current: PENDING); will check again in 5.0 seconds. Total wait time: 0:00:10.057235

__main__ 2025-08-28 22:05:12 INFO 
===== Server Info =====
{
  "name": "336",
  "server_handle": "slurm:///336",
  "state": "RUNNING",
  "meshes": {
    "mesh0": {
      "host_type": "__UNSET__",
      "hosts": 2,
      "gpus": -1,
      "hostnames": [
        "gpu-queue-st-gpu-compute-1",
        "gpu-queue-st-gpu-compute-2"
      ]
    }
  }
}
__main__ 2025-08-28 22:05:12 INFO computing world size...
monarch._src.actor.allocator 2025-08-28 22:05:12 INFO no match label `procmesh.monarch.meta.com/name` specified in alloc constraints
monarch._src.actor.allocator 2025-08-28 22:05:12 INFO found a single proc mesh `mesh0` in slurm:///336, will allocate on it
monarch.tools.network 2025-08-28 22:05:12 INFO no AF_INET6 address that can bind TCP sockets for `gpu-queue-st-gpu-compute-1:26600` (error: [Errno -5] No address associated with hostname)
monarch.tools.network 2025-08-28 22:05:12 INFO resolved AF_INET address `10.0.2.165:26600` for `gpu-queue-st-gpu-compute-1:26600`
monarch.tools.netwo

New job `slurm:///336` is ready to serve.


__main__ 2025-08-28 22:05:19 INFO computed world_sizes:
    ----------------------------------------
    {
  "rank_0": 8,
  "rank_1": 8,
  "rank_2": 8,
  "rank_3": 8,
  "rank_4": 8,
  "rank_5": 8,
  "rank_6": 8,
  "rank_7": 8
}
    ----------------------------------------


[36m>>> Aggregated Logs (2025-08-28 22:05:18) >>>[0m
[33m[8 similar log lines][0m Initializing process group `nccl`:
[33m[8 similar log lines][0m   MASTER_ADDR = gpu-queue-st-gpu-compute-1
[33m[8 similar log lines][0m   MASTER_PORT = 29500
[33m[8 similar log lines][0m   RANK        = 0
[33m[8 similar log lines][0m   WORLD_SIZE  = 8
[36m<<< Aggregated Logs (2025-08-28 22:05:19) <<<[0m

