From 198ec3cd707088639c77c33896d8fa3ba38a61a8 Mon Sep 17 00:00:00 2001 From: jlewitt1 Date: Sun, 24 Mar 2024 14:56:48 +0200 Subject: [PATCH] add cluster utilization data to `status` endpoint --- runhouse/servers/http/http_server.py | 57 ++++++++++++++++++- .../test_clusters/test_cluster.py | 22 ++++++- 2 files changed, 76 insertions(+), 3 deletions(-) diff --git a/runhouse/servers/http/http_server.py b/runhouse/servers/http/http_server.py index 65b3caa43..0b71016f2 100644 --- a/runhouse/servers/http/http_server.py +++ b/runhouse/servers/http/http_server.py @@ -696,7 +696,62 @@ def get_keys(request: Request, env_name: Optional[str] = None): @app.get("/status") @validate_cluster_access def get_status(request: Request): - return obj_store.status() + import psutil + import ray + + # Fields: `CPU`, `object_memory_store`, `memory`, `node` + ray_available_resources = {} + ray_total_resources = {} + + system_gpu_data = {} + + # Note: Ray resource data does not necessarily the real-time utilization of these resources + # Ex: a task may be allocated a CPU but might not be using it at 100% capacity + try: + ray_available_resources = ray.available_resources() + ray_total_resources = ray.cluster_resources() + except ray.exceptions.RaySystemError as e: + # If ray is not initialized + logger.warning(e) + + # System wide data + cpu_usage = psutil.cpu_percent(interval=1) + + # Fields: `available`, `percent`, `used`, `free`, `active`, `inactive`, `buffers`, `cached`, `shared`, `slab` + memory_usage = psutil.virtual_memory()._asdict() + + # Fields: `total`, `used`, `free`, `percent` + disk_usage = psutil.disk_usage("/")._asdict() + + try: + # Try loading GPU data (if relevant) + import torch + + if torch.cuda.is_available(): + system_gpu_data = {} + gpu_count = torch.cuda.device_count() + system_gpu_data["gpu_count"] = gpu_count + for i in range(gpu_count): + device_name = torch.cuda.get_device_name(i) + device_properties = torch.cuda.get_device_properties(i) + device_data = { + "device_properties": device_properties, + "total_memory": device_properties.total_memory, + } + system_gpu_data[device_name] = device_data + + except: + pass + + return { + "cluster_config": obj_store.status(), + "ray_available_resources": ray_available_resources, + "ray_total_resources": ray_total_resources, + "system_cpu_usage": cpu_usage, + "system_memory_usage": memory_usage, + "system_disk_usage": disk_usage, + "system_gpu_data": system_gpu_data, + } @staticmethod def _collect_cluster_stats(): diff --git a/tests/test_resources/test_clusters/test_cluster.py b/tests/test_resources/test_clusters/test_cluster.py index 3ef70f96e..60037e907 100644 --- a/tests/test_resources/test_clusters/test_cluster.py +++ b/tests/test_resources/test_clusters/test_cluster.py @@ -160,7 +160,24 @@ def test_cluster_endpoint(self, cluster): headers=rh.globals.rns_client.request_headers(), ) assert r.status_code == 200 - assert r.json()["resource_type"] == "cluster" + assert r.json().get("cluster_config")["resource_type"] == "cluster" + + @pytest.mark.level("local") + def test_load_cluster_status(self, cluster): + endpoint = cluster.endpoint() + verify = cluster.client.verify + r = requests.get( + f"{endpoint}/status", + verify=verify, + headers=rh.globals.rns_client.request_headers(), + ) + + assert r.status_code == 200 + status_data = r.json() + assert status_data["cluster_config"]["resource_type"] == "cluster" + assert status_data["ray_available_resources"] + assert status_data["ray_total_resources"] + assert not status_data["system_gpu_data"] @pytest.mark.level("local") def test_cluster_objects(self, cluster): @@ -219,7 +236,8 @@ def test_rh_here_objects(self, cluster): @pytest.mark.level("local") def test_rh_status_pythonic(self, cluster): cluster.put(key="status_key1", obj="status_value1", env="numpy_env") - res = cluster.status() + cluster_data = cluster.status() + res = cluster_data.get("cluster_config") assert res.get("creds") is None assert res.get("server_port") == (cluster.server_port or DEFAULT_SERVER_PORT) assert res.get("server_connection_type") == cluster.server_connection_type