# Infrastructure Setup

In [None]:
import os
from chi import context

# Point python-chi at the CHI@UC Chameleon site, then pick the project to
# work in.
context.use_site("CHI@UC")
context.choose_project()

# Every resource name in this notebook is prefixed with the login name taken
# from the USER environment variable, with underscores turned into hyphens.
login_name = os.environ.get("USER")
username = login_name.replace("_", "-")

## Creating a 4-day lease

In [None]:
from chi import lease
from datetime import timedelta

In [None]:
# Cluster sizing: how many nodes to reserve and of which hardware type.
instance_count = 2
node_type = "gpu_rtx_6000"

# Name given to the storage network that is reserved together with the nodes.
network_name = f"{username}-storage-network-cuda"

In [None]:
# One lease carries every reservation the cluster needs for the next 4 days.
lease_name = f"{username}-mpi-lease-cuda"
mpi_lease = lease.Lease(lease_name, duration=timedelta(days=4))

# Bare-metal nodes for the cluster (instance_count nodes of node_type).
mpi_lease.add_node_reservation(node_type=node_type, amount=instance_count)
# A storage network, which is required in order to use NFS shares.
mpi_lease.add_network_reservation(usage_type="storage", network_name=network_name)
# A single floating IP.
mpi_lease.add_fip_reservation(1)

# NOTE(review): idempotent=True presumably reuses an existing lease with the
# same name instead of creating a duplicate - confirm against python-chi.
mpi_lease.submit(idempotent=True)

## Create a New Share

In [None]:
from chi import share

In [None]:
# Names of the shares returned by the listing call; used below to decide
# whether a share has to be created.
existing_shares = share.list_shares()
share_names = [existing.name for existing in existing_shares]

In [None]:
# Reuse the share if one with this name is already listed; otherwise create
# it with size=1.
share_name = f"{username}-share-cuda"
if share_name in share_names:
    mpi_share = share.get_share(share_name)
else:
    print(f"Creating new share - {share_name}")
    mpi_share = share.create_share(size=1, name=share_name)

In [None]:
# Look the share up again by its ID; the resulting object is the one whose
# export locations are read later in the notebook.
share_id = mpi_share.id
nfs_share = share.get_share(share_id)
# Bare expression so the notebook displays the share as the cell output.
nfs_share

## Creating server instances

In [None]:
from chi import server

In [None]:
# The master and all workers boot the same image and are placed on nodes from
# the lease's first node reservation.
cluster_image = "Ubuntu22.04-HPC-MPI-Spack-CUDA"
node_reservation_id = mpi_lease.node_reservations[0]["id"]

# Master node.
mpi_master = server.Server(
    f"{username}-mpi-master-cuda",
    reservation_id=node_reservation_id,
    image_name=cluster_image,
    network_name=network_name,
)
mpi_master_hostname = mpi_master.name
mpi_master.submit(idempotent=True)

# Worker nodes: one fewer than instance_count, numbered from 1.
mpi_workers = []
mpi_worker_hostnames = []
for worker_number in range(1, instance_count):
    mpi_worker = server.Server(
        f"{username}-mpi-worker-{worker_number}-cuda",
        reservation_id=node_reservation_id,
        image_name=cluster_image,
        network_name=network_name,
    )
    mpi_worker.submit(idempotent=True)
    mpi_workers.append(mpi_worker)
    mpi_worker_hostnames.append(mpi_worker.name)

In [None]:
# Comma-separated list of every node in the cluster, master first.
# Joining one combined list (rather than concatenating "master," with the
# joined workers) avoids a trailing comma - i.e. an empty host entry - when
# there are no workers.
all_hostnames = ",".join([mpi_master_hostname, *mpi_worker_hostnames])

In [None]:
# Take the first floating IP reserved by the lease and attach it to the
# master node.
reserved_fips = mpi_lease.get_reserved_floating_ips()
fip = reserved_fips[0]
server.associate_floating_ip(mpi_master.id, fip)

## Create inventory.ini to work with Ansible

In [None]:
# Write the Ansible inventory in one go.
#   [master_node]        - the master, addressed by the floating IP.
#   [worker_nodes]       - each worker, addressed by its first address on the
#                          storage network.
#   [worker_nodes:vars]  - SSH to the workers jumps through cc@<floating IP>.
with open("./inventory.ini", "w") as inventory_file:
    worker_entries = [
        f"{w.name} ansible_host={w.addresses[network_name][0]['addr']}"
        for w in mpi_workers
    ]
    inventory_text = (
        "[master_node]\n"
        + f"{mpi_master.name} ansible_host={fip}\n\n"
        + "[worker_nodes]\n"
        + "\n".join(worker_entries)
        + "\n\n"
        + "[worker_nodes:vars]\n"
        + f"ansible_ssh_common_args='-o ProxyJump=cc@{fip}'"
    )
    inventory_file.write(inventory_text)

## Use Ansible to create an MPI Cluster

In [None]:
# Share definitions handed to the playbook as the "nfs_shares" extra variable:
# the share's first export location and the path it is paired with.
primary_share = {
    "export": nfs_share.export_locations[0],
    "mount": "/mnt/share1",
}
nfs_shares = [primary_share]

In [None]:
import ansible_runner
import tempfile
# Scratch directory passed to ansible-runner as its private data dir. The
# TemporaryDirectory object is kept in a variable so the directory stays
# around for as long as `tmpdir` is referenced.
tmpdir = tempfile.TemporaryDirectory()

# Absolute path of the SSH config file handed to ssh via -F.
# Computed outside the f-string: nesting double quotes inside a double-quoted
# f-string is a SyntaxError on Python versions before 3.12.
ssh_config_path = os.path.abspath("config")

# Run the mpi-cluster.yml playbook against the generated inventory, passing
# the NFS share definitions as extra variables.
ansible_run = ansible_runner.run(
    private_data_dir=tmpdir.name,
    inventory=os.path.abspath("inventory.ini"),
    envvars={
        "ANSIBLE_PYTHON_INTERPRETER": "/usr/bin/python3",
        "ANSIBLE_SSH_ARGS": f"-F {ssh_config_path}",
    },
    extravars={
        "nfs_shares": nfs_shares,
    },
    playbook=os.path.abspath("mpi-cluster.yml"),
    verbosity=0,
)

In [None]:
# Copy the CUDA hello-world source and its launcher script to /home/cc on the
# master node. The launcher is renamed to run_hello.sh on the remote side.
hello_source = "./examples/src/hello.cu"
hello_launcher = "./examples/mpi_jobs/run_hello_cuda.sh"

mpi_master.upload(hello_source, "/home/cc/hello.cu")
mpi_master.upload(hello_launcher, "/home/cc/run_hello.sh")

In [None]:
# Source the launcher script on the master inside a login shell, passing the
# master hostname and the comma-separated worker hostnames as its arguments.
worker_host_list = ",".join(mpi_worker_hostnames)
launch_command = f'bash -lc "source /home/cc/run_hello.sh {mpi_master_hostname} {worker_host_list}"'
mpi_master.execute(launch_command)