add cluster utilization data to status endpoint
jlewitt1 committed Mar 24, 2024
1 parent 501a53e commit 198ec3c
Showing 2 changed files with 76 additions and 3 deletions.
57 changes: 56 additions & 1 deletion runhouse/servers/http/http_server.py
@@ -696,7 +696,62 @@ def get_keys(request: Request, env_name: Optional[str] = None):
@app.get("/status")
@validate_cluster_access
def get_status(request: Request):
return obj_store.status()
import psutil
import ray

# Fields: `CPU`, `object_memory_store`, `memory`, `node`
ray_available_resources = {}
ray_total_resources = {}

system_gpu_data = {}

    # Note: Ray resource data does not necessarily reflect the real-time utilization of these resources
# Ex: a task may be allocated a CPU but might not be using it at 100% capacity
try:
ray_available_resources = ray.available_resources()
ray_total_resources = ray.cluster_resources()
except ray.exceptions.RaySystemError as e:
# If ray is not initialized
logger.warning(e)

# System wide data
cpu_usage = psutil.cpu_percent(interval=1)

# Fields: `available`, `percent`, `used`, `free`, `active`, `inactive`, `buffers`, `cached`, `shared`, `slab`
memory_usage = psutil.virtual_memory()._asdict()

# Fields: `total`, `used`, `free`, `percent`
disk_usage = psutil.disk_usage("/")._asdict()

try:
# Try loading GPU data (if relevant)
import torch

if torch.cuda.is_available():
system_gpu_data = {}
gpu_count = torch.cuda.device_count()
system_gpu_data["gpu_count"] = gpu_count
for i in range(gpu_count):
device_name = torch.cuda.get_device_name(i)
device_properties = torch.cuda.get_device_properties(i)
device_data = {
"device_properties": device_properties,
"total_memory": device_properties.total_memory,
}
system_gpu_data[device_name] = device_data

except:
pass

return {
"cluster_config": obj_store.status(),
"ray_available_resources": ray_available_resources,
"ray_total_resources": ray_total_resources,
"system_cpu_usage": cpu_usage,
"system_memory_usage": memory_usage,
"system_disk_usage": disk_usage,
"system_gpu_data": system_gpu_data,
}

@staticmethod
def _collect_cluster_stats():
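The psutil fields listed in the comments above can be checked locally; a minimal sketch, assuming `psutil` is installed (printed values are illustrative, and the exact key set of `virtual_memory()` varies by platform):

```python
import psutil

# System-wide CPU utilization, sampled over a 1-second window (as in the handler above)
print(psutil.cpu_percent(interval=1))  # e.g. 12.5

# Memory and disk stats as plain dicts; keys include `total`, `percent`, `used`, `free`
print(sorted(psutil.virtual_memory()._asdict().keys()))
print(sorted(psutil.disk_usage("/")._asdict().keys()))
```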
22 changes: 20 additions & 2 deletions tests/test_resources/test_clusters/test_cluster.py
@@ -160,7 +160,24 @@ def test_cluster_endpoint(self, cluster):
headers=rh.globals.rns_client.request_headers(),
)
assert r.status_code == 200
assert r.json()["resource_type"] == "cluster"
assert r.json().get("cluster_config")["resource_type"] == "cluster"

@pytest.mark.level("local")
def test_load_cluster_status(self, cluster):
endpoint = cluster.endpoint()
verify = cluster.client.verify
r = requests.get(
f"{endpoint}/status",
verify=verify,
headers=rh.globals.rns_client.request_headers(),
)

assert r.status_code == 200
status_data = r.json()
assert status_data["cluster_config"]["resource_type"] == "cluster"
assert status_data["ray_available_resources"]
assert status_data["ray_total_resources"]
assert not status_data["system_gpu_data"]

@pytest.mark.level("local")
def test_cluster_objects(self, cluster):
@@ -219,7 +236,8 @@ def test_rh_here_objects(self, cluster):
@pytest.mark.level("local")
def test_rh_status_pythonic(self, cluster):
cluster.put(key="status_key1", obj="status_value1", env="numpy_env")
res = cluster.status()
cluster_data = cluster.status()
res = cluster_data.get("cluster_config")
assert res.get("creds") is None
assert res.get("server_port") == (cluster.server_port or DEFAULT_SERVER_PORT)
assert res.get("server_connection_type") == cluster.server_connection_type
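Outside the test suite, a client could read the new utilization fields the same way; a rough sketch, where the URL and auth header are placeholders rather than anything defined in this change (in practice they come from `cluster.endpoint()` and `rh.globals.rns_client.request_headers()` as in the tests above):

```python
import requests

# Placeholder endpoint and token -- substitute the real cluster address and headers
resp = requests.get(
    "http://<cluster-address>:<port>/status",
    headers={"Authorization": "Bearer <token>"},
)
resp.raise_for_status()

status = resp.json()
print(status["cluster_config"]["resource_type"])        # "cluster"
print(status["system_cpu_usage"])                        # e.g. 7.3
print(status["system_memory_usage"]["percent"])
print(status["system_disk_usage"]["percent"])
print(status.get("system_gpu_data") or "no GPUs detected")
```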
