add cluster utilization data to status endpoint
jlewitt1 committed Mar 24, 2024
1 parent 501a53e commit 198ec3c
Showing 2 changed files with 76 additions and 3 deletions.
57 changes: 56 additions & 1 deletion runhouse/servers/http/http_server.py
@@ -696,7 +696,62 @@ def get_keys(request: Request, env_name: Optional[str] = None):
@app.get("/status")
@validate_cluster_access
def get_status(request: Request):
return obj_store.status()
import psutil
import ray

# Fields: `CPU`, `object_memory_store`, `memory`, `node`
ray_available_resources = {}
ray_total_resources = {}

system_gpu_data = {}

    # Note: Ray resource data does not necessarily reflect the real-time utilization of these resources
# Ex: a task may be allocated a CPU but might not be using it at 100% capacity
try:
ray_available_resources = ray.available_resources()
ray_total_resources = ray.cluster_resources()
except ray.exceptions.RaySystemError as e:
# If ray is not initialized
logger.warning(e)

# System wide data
cpu_usage = psutil.cpu_percent(interval=1)

# Fields: `available`, `percent`, `used`, `free`, `active`, `inactive`, `buffers`, `cached`, `shared`, `slab`
memory_usage = psutil.virtual_memory()._asdict()

# Fields: `total`, `used`, `free`, `percent`
disk_usage = psutil.disk_usage("/")._asdict()

try:
# Try loading GPU data (if relevant)
import torch

if torch.cuda.is_available():
system_gpu_data = {}
gpu_count = torch.cuda.device_count()
system_gpu_data["gpu_count"] = gpu_count
for i in range(gpu_count):
device_name = torch.cuda.get_device_name(i)
device_properties = torch.cuda.get_device_properties(i)
device_data = {
"device_properties": device_properties,
"total_memory": device_properties.total_memory,
}
system_gpu_data[device_name] = device_data

except:
pass

return {
"cluster_config": obj_store.status(),
"ray_available_resources": ray_available_resources,
"ray_total_resources": ray_total_resources,
"system_cpu_usage": cpu_usage,
"system_memory_usage": memory_usage,
"system_disk_usage": disk_usage,
"system_gpu_data": system_gpu_data,
}

@staticmethod
def _collect_cluster_stats():
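The psutil fields listed in the comments above can be checked locally; a minimal sketch, assuming `psutil` is installed (printed values are illustrative, and the exact key set of `virtual_memory()` varies by platform):

```python
import psutil

# System-wide CPU utilization, sampled over a 1-second window (as in the handler above)
print(psutil.cpu_percent(interval=1))  # e.g. 12.5

# Memory and disk stats as plain dicts; keys include `total`, `percent`, `used`, `free`
print(sorted(psutil.virtual_memory()._asdict().keys()))
print(sorted(psutil.disk_usage("/")._asdict().keys()))
```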
22 changes: 20 additions & 2 deletions tests/test_resources/test_clusters/test_cluster.py
@@ -160,7 +160,24 @@ def test_cluster_endpoint(self, cluster):
headers=rh.globals.rns_client.request_headers(),
)
assert r.status_code == 200
assert r.json()["resource_type"] == "cluster"
assert r.json().get("cluster_config")["resource_type"] == "cluster"

@pytest.mark.level("local")
def test_load_cluster_status(self, cluster):
endpoint = cluster.endpoint()
verify = cluster.client.verify
r = requests.get(
f"{endpoint}/status",
verify=verify,
headers=rh.globals.rns_client.request_headers(),
)

assert r.status_code == 200
status_data = r.json()
assert status_data["cluster_config"]["resource_type"] == "cluster"
assert status_data["ray_available_resources"]
assert status_data["ray_total_resources"]
assert not status_data["system_gpu_data"]

@pytest.mark.level("local")
def test_cluster_objects(self, cluster):
@@ -219,7 +236,8 @@ def test_rh_here_objects(self, cluster):
@pytest.mark.level("local")
def test_rh_status_pythonic(self, cluster):
cluster.put(key="status_key1", obj="status_value1", env="numpy_env")
res = cluster.status()
cluster_data = cluster.status()
res = cluster_data.get("cluster_config")
assert res.get("creds") is None
assert res.get("server_port") == (cluster.server_port or DEFAULT_SERVER_PORT)
assert res.get("server_connection_type") == cluster.server_connection_type
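Outside the test suite, a client could read the new utilization fields the same way; a rough sketch, where the URL and auth header are placeholders rather than anything defined in this change (in practice they come from `cluster.endpoint()` and `rh.globals.rns_client.request_headers()` as in the tests above):

```python
import requests

# Placeholder endpoint and token -- substitute the real cluster address and headers
resp = requests.get(
    "http://<cluster-address>:<port>/status",
    headers={"Authorization": "Bearer <token>"},
)
resp.raise_for_status()

status = resp.json()
print(status["cluster_config"]["resource_type"])        # "cluster"
print(status["system_cpu_usage"])                        # e.g. 7.3
print(status["system_memory_usage"]["percent"])
print(status["system_disk_usage"]["percent"])
print(status.get("system_gpu_data") or "no GPUs detected")
```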
