# monitor cluster

Monitors a Kubernetes cluster by running `kubectl` commands on each of its pods.

In [1]:
import subprocess
from IPython.display import clear_output
from datetime import datetime
from joblib import Parallel, delayed
from loguru import logger
from time import sleep
import numpy as np

In [5]:
class Pods:
    """Run a command on every pod listed by ``kubectl get pods``.

    The pod names are fetched once on construction and stored in
    ``self.podnames``; ``run_cmd`` executes a command template once per
    pod and ``run_loop`` keeps re-running a query and printing its result.
    """

    def __init__(self):
        self.get_pods()

    @staticmethod
    def _parse_pod_names(listing):
        """Return the first column of each data row of a ``kubectl get pods`` table.

        Blank lines and the header row (the line starting with ``NAME``)
        are skipped.
        """
        return [
            line.split()[0]
            for line in listing.splitlines()
            if line.split() and not line.startswith('NAME')
        ]

    def get_pods(self):
        """Refresh ``self.podnames`` from the output of ``kubectl get pods``."""
        logger.info('getting pods')
        getpods_command = 'kubectl get pods'

        completed = subprocess.run(getpods_command.split(), capture_output=True)
        if completed.returncode != 0:
            # A failing kubectl leaves stdout empty, which would otherwise
            # look exactly like "no pods"; report the reason.
            logger.warning(
                'kubectl get pods failed (exit {}): {}',
                completed.returncode,
                completed.stderr.decode(errors='replace').strip(),
            )
        self.podnames = self._parse_pod_names(completed.stdout.decode())

    @staticmethod
    def _exec_on_pod(podname, cmd):
        """Run ``cmd`` (with ``{podname}`` substituted) and return its stripped stdout."""
        formatted_cmd = cmd.format(podname=podname)
        completed = subprocess.run(
            formatted_cmd.split(),
            capture_output=True,
            # Never let a child read from the notebook's/terminal's stdin.
            stdin=subprocess.DEVNULL,
        )
        if completed.returncode != 0:
            logger.warning(
                'command failed on {} (exit {}): {}',
                podname,
                completed.returncode,
                completed.stderr.decode(errors='replace').strip(),
            )
        return completed.stdout.decode().strip()

    def run_cmd(self, cmd):
        """Run a command template on all known pods in parallel.

        Args:
            cmd: command line containing a ``{podname}`` placeholder; it is
                split on whitespace, so arguments cannot contain spaces.

        Returns:
            dict mapping each pod name to the stripped stdout of its command.
        """
        if not self.podnames:
            # Nothing to run; skip creating a worker pool.
            return {}

        # The workers only wait on subprocesses, so threads are sufficient
        # and avoid spawning and pickling into worker processes.
        outputs = Parallel(n_jobs=-1, prefer='threads')(
            delayed(self._exec_on_pod)(podname, cmd) for podname in self.podnames
        )
        return dict(zip(self.podnames, outputs))

    def gpu_usage(self):
        """Return ``{podname: GPU utilization}`` as reported by ``nvidia-smi`` in each pod."""
        # No ``-it``: the call is non-interactive and its output is captured,
        # so there is no terminal or stdin to attach.
        gpusage_command = 'kubectl exec {podname} -- nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader'
        return self.run_cmd(gpusage_command)

    def run_loop(self, method, wait_seconds=1, refresh_every=10):
        """Call ``method`` forever, printing a timestamp and its result each time.

        Args:
            method: zero-argument callable returning a dict (e.g. ``self.gpu_usage``).
            wait_seconds: pause between two iterations.
            refresh_every: re-read the pod list every this many iterations;
                ``0`` or ``None`` disables the refresh.

        The loop has no exit condition; stop it by interrupting the kernel.
        """
        iteration = 0
        while True:
            r = method()
            # wait=True defers clearing until the new output arrives, which
            # avoids the display flickering on every iteration.
            clear_output(wait=True)
            print(datetime.now())
            for podname, output in r.items():
                print(podname, output)

            sleep(wait_seconds)

            iteration += 1
            if refresh_every and iteration % refresh_every == 0:
                self.get_pods()

In [6]:
# Create the monitor; the constructor immediately runs `kubectl get pods` once.
pods = Pods()

2025-03-23 07:11:16.138 | INFO     | __main__:get_pods:7 - getting pods


## GPU usage in a loop

In [None]:
# Poll the GPU utilization of every pod in an endless loop; interrupt the kernel to stop.
pods.run_loop(pods.gpu_usage)

2025-03-23 08:47:55.610392
tgi-gemma-deployment-7d9f9dcd9d-fkhnh 0 %


In [96]:
# Four Kubernetes resources (Deployment, HorizontalPodAutoscaler, Service,
# PodMonitoring) for a vLLM inference server. Each entry carries the
# resource "kind", its "apiVersion", and the full YAML manifest as a single
# string under "content".
# NOTE(review): looks like pasted JSON output of a generator tool — origin not shown here.
k =   [
    {
      "kind": "Deployment",
      "apiVersion": "apps/v1",
      "content": "apiVersion: apps/v1\nkind: Deployment\nmetadata:\n  annotations:\n    aire.gke.io/generated: \"true\"\n    aire.gke.io/inference-server: vllm\n  creationTimestamp: null\n  labels:\n    app: llama3-8b-vllm-inference-server\n  name: llama3-8b-vllm-deployment\n  namespace: default\nspec:\n  replicas: 1\n  selector:\n    matchLabels:\n      app: llama3-8b-vllm-inference-server\n  strategy: {}\n  template:\n    metadata:\n      creationTimestamp: null\n      labels:\n        ai.gke.io/inference-server: vllm\n        ai.gke.io/model: LLaMA3_8B\n        app: llama3-8b-vllm-inference-server\n        examples.ai.gke.io/source: blueprints\n    spec:\n      containers:\n      - args:\n        - --model=$(MODEL_ID)\n        command:\n        - python3\n        - -m\n        - vllm.entrypoints.openai.api_server\n        env:\n        - name: MODEL_ID\n          value: meta-llama/Meta-Llama-3-8B\n        - name: HUGGING_FACE_HUB_TOKEN\n          valueFrom:\n            secretKeyRef:\n              key: hf_api_token\n              name: hf-secret\n        image: vllm/vllm-openai:v0.7.2\n        name: inference-server\n        ports:\n        - containerPort: 8000\n          name: metrics\n        readinessProbe:\n          failureThreshold: 60\n          httpGet:\n            path: /health\n            port: 8000\n          periodSeconds: 10\n        resources:\n          limits:\n            nvidia.com/gpu: \"1\"\n          requests:\n            nvidia.com/gpu: \"1\"\n        volumeMounts:\n        - mountPath: /dev/shm\n          name: dshm\n      nodeSelector:\n        cloud.google.com/gke-accelerator: nvidia-l4\n      volumes:\n      - emptyDir:\n          medium: Memory\n        name: dshm\nstatus: {}\n"
    },
    {
      "kind": "HorizontalPodAutoscaler",
      "apiVersion": "autoscaling/v2",
      "content": "apiVersion: autoscaling/v2\nkind: HorizontalPodAutoscaler\nmetadata:\n  annotations:\n    aire.gke.io/generated: \"true\"\n  creationTimestamp: null\n  labels:\n    app: llama3-8b-vllm-inference-server\n  name: vllm-hpa\n  namespace: default\nspec:\n  maxReplicas: 10\n  metrics:\n  - pods:\n      metric:\n        name: prometheus.googleapis.com|vllm:gpu_cache_usage_perc|gauge\n      target:\n        averageValue: 602m\n        type: AverageValue\n    type: Pods\n  minReplicas: 1\n  scaleTargetRef:\n    apiVersion: apps/v1\n    kind: Deployment\n    name: llama3-8b-vllm-deployment\nstatus:\n  currentMetrics: null\n  desiredReplicas: 0\n"
    },
    {
      "kind": "Service",
      "apiVersion": "v1",
      "content": "apiVersion: v1\nkind: Service\nmetadata:\n  annotations:\n    aire.gke.io/generated: \"true\"\n  creationTimestamp: null\n  labels:\n    app: llama3-8b-vllm-inference-server\n  name: llama3-8b-vllm-service\n  namespace: default\nspec:\n  ports:\n  - port: 8000\n    protocol: TCP\n    targetPort: 8000\n  selector:\n    app: llama3-8b-vllm-inference-server\n  type: ClusterIP\nstatus:\n  loadBalancer: {}\n"
    },
    {
      "kind": "PodMonitoring",
      "apiVersion": "monitoring.googleapis.com/v1",
      "content": "apiVersion: monitoring.googleapis.com/v1\nkind: PodMonitoring\nmetadata:\n  annotations:\n    aire.gke.io/generated: \"true\"\n  labels:\n    app: llama3-8b-vllm-inference-server\n  name: vllm-podmonitoring\n  namespace: default\nspec:\n  endpoints:\n  - interval: 15s\n    path: /metrics\n    port: metrics\n  selector:\n    matchLabels:\n      app: llama3-8b-vllm-inference-server\n  targetLabels:\n    metadata:\n    - pod\n    - container\n    - node\n"
    }
  ]


In [98]:
# Dump the YAML of every manifest, each followed by a "--" separator line.
for ki in k:
    print(ki['content'], '--', sep='\n')

apiVersion: apps/v1
kind: Deployment
metadata:
  annotations:
    aire.gke.io/generated: "true"
    aire.gke.io/inference-server: vllm
  creationTimestamp: null
  labels:
    app: llama3-8b-vllm-inference-server
  name: llama3-8b-vllm-deployment
  namespace: default
spec:
  replicas: 1
  selector:
    matchLabels:
      app: llama3-8b-vllm-inference-server
  strategy: {}
  template:
    metadata:
      creationTimestamp: null
      labels:
        ai.gke.io/inference-server: vllm
        ai.gke.io/model: LLaMA3_8B
        app: llama3-8b-vllm-inference-server
        examples.ai.gke.io/source: blueprints
    spec:
      containers:
      - args:
        - --model=$(MODEL_ID)
        command:
        - python3
        - -m
        - vllm.entrypoints.openai.api_server
        env:
        - name: MODEL_ID
          value: meta-llama/Meta-Llama-3-8B
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
              key: hf_api_token
              name: hf-s