# Multi-Turn Chat Benchmark

```bash
# Create the GKE Autopilot cluster that hosts the benchmark.
gcloud container clusters create-auto cluster-1 \
    --location=us-central1

# Install KubeAI from its Helm repository using the GKE-specific values file.
helm repo add kubeai https://www.kubeai.org
helm repo update
# -f makes curl fail on an HTTP error instead of saving the error page as
# values-gke.yaml, which helm would then try to parse.
curl -fL -O https://raw.githubusercontent.com/substratusai/kubeai/refs/heads/main/charts/kubeai/values-gke.yaml
# The token is quoted so that an empty value or one containing shell
# metacharacters cannot split or alter the --set argument.
helm upgrade --install kubeai kubeai/kubeai \
    -f values-gke.yaml \
    --set "secrets.huggingface.token=${HUGGING_FACE_HUB_TOKEN}" \
    --set metrics.prometheusOperator.vLLMPodMonitor.enabled=true \
    --set open-webui.enabled=false \
    --wait
```

In [53]:
from subprocess import run, PIPE
import json
from kubernetes import client, config, dynamic
from kubernetes.client import api_client
from copy import deepcopy

In [54]:
# Namespace used for the benchmark Pod and the helper Service.
namespace = "default"

# Load cluster credentials before any client is constructed.
# NOTE(review): load_kube_config() appears to return None, in which case
# ApiClient(configuration=None) would fall back to the default configuration
# that the call installed -- confirm against the kubernetes client docs.
k8s_config = config.load_kube_config()

# Client for core/v1 objects (Pods, Services).
v1 = client.CoreV1Api()

# Dynamic client, needed to address the kubeai.org/v1 Model resource.
k8s_client = dynamic.DynamicClient(api_client.ApiClient(configuration=k8s_config))
models_client = k8s_client.resources.get(kind="Model", api_version="kubeai.org/v1")

In [55]:
# Manifest of the Pod that hosts the benchmark tool. The container only
# sleeps; the run loop starts the benchmark inside it with `kubectl exec`.
benchmark_pod_spec = dict(
    apiVersion="v1",
    kind="Pod",
    metadata=dict(name="bench"),
    spec=dict(
        restartPolicy="Never",
        containers=[
            dict(
                name="bench",
                image="us-central1-docker.pkg.dev/substratus-dev/default/benchmark-multi-turn-chat-go:v0.1.1",
                imagePullPolicy="Always",
                command=["sleep", "infinity"],
                # Default endpoint is the KubeAI proxy; the run loop replaces
                # this value with a plain Service URL for the k8s-native case.
                env=[
                    dict(name="OPENAI_BASE_URL", value="http://kubeai/openai/v1"),
                ],
                # Requests equal limits for both CPU and memory.
                resources=dict(
                    requests=dict(cpu="2", memory="2G"),
                    limits=dict(cpu="2", memory="2G"),
                ),
            ),
        ],
    ),
)

In [56]:
# Base KubeAI Model manifest. Each benchmark run deep-copies it and merges one
# entry of `specs` (replica counts, load-balancing strategy) into "spec".
base_model = {
    "apiVersion": "kubeai.org/v1",
    "kind": "Model",
    "metadata": {
        "name": "bench",
        "namespace": "default",
    },
    "spec": {
        "features": ["TextGeneration"],
        "url": "hf://neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8",
        "engine": "VLLM",
        # Arguments forwarded to the vLLM server.
        "args": [
            "--enable-prefix-caching",
            "--max-model-len=16384",
            # Spelled with the trailing "s": the documented vLLM flag is
            # --max-num-batched-tokens; the singular form relied on argument
            # prefix matching.
            "--max-num-batched-tokens=16384",
            "--gpu-memory-utilization=0.90",
            "--disable-log-requests",
        ],
        "resourceProfile": "nvidia-gpu-l4:1"
    },
}

In [None]:
def create_k8s_service(model_name: str):
    """Create a ClusterIP Service named ``k8s-<model_name>``.

    The Service selects pods labelled ``app.kubernetes.io/name=vllm`` and
    ``model=<model_name>`` and maps port 80 to target port 8000. It is created
    through the module-level ``v1`` client in the module-level ``namespace``.

    Args:
        model_name: Value of the ``model`` pod label to select; also used to
            derive the Service name.

    Returns:
        Whatever ``v1.create_namespaced_service`` returns for the new Service.
    """
    service_name = f"k8s-{model_name}"
    pod_selector = {
        "app.kubernetes.io/name": "vllm",
        "model": model_name,
    }
    http_port = {
        "name": "http",
        "protocol": "TCP",
        "port": 80,
        "targetPort": 8000,
    }
    manifest = {
        "apiVersion": "v1",
        "kind": "Service",
        "metadata": {"name": service_name, "labels": {"app": service_name}},
        "spec": {
            "selector": pod_selector,
            "ports": [http_port],
            "type": "ClusterIP",
        },
    }
    return v1.create_namespaced_service(namespace=namespace, body=manifest)



In [65]:
# Benchmark load settings; every entry is run once per Model spec.
# Alternative concurrency levels are kept commented out for reference.
benches = [
    # dict(thread_count=8000, max_concurrent_threads=300),
    # dict(thread_count=8000, max_concurrent_threads=600),
    dict(thread_count=8000, max_concurrent_threads=1200),
    # dict(thread_count=8000, max_concurrent_threads=2400),
]

# Model spec overrides to compare. The first two use KubeAI load balancing
# with the PrefixHash and LeastLoad strategies (both carry the same
# prefixHash settings). The last one has no "loadBalancing" key, which the
# run loop treats as the k8s-native case.
specs = [
    {
        "minReplicas": 8,
        "maxReplicas": 8,
        "loadBalancing": {
            "strategy": strategy,
            "prefixHash": {
                "meanLoadFactor": 125,
                "prefixCharLength": 100,
                "replication": 256,
            },
        },
    }
    for strategy in ("PrefixHash", "LeastLoad")
] + [
    # k8s-native
    {"minReplicas": 8, "maxReplicas": 8},
]


In [66]:
all_results = []

i = 0
for spec in specs:
    for bench in benches:
        print(bench)
        try:
            model = deepcopy(base_model)
            model["spec"].update(spec)
            model_name = model.get("metadata").get("name")
            model_replicas = model.get("spec").get("minReplicas")

            !kubectl delete pod -l app.kubernetes.io/instance=kubeai
            # Start a fresh instance of the benchmark Pod.
            # !kubectl apply -f ./bench-pod.yaml
            # This indicates using native K8s Service instead of KubeAI
            if not "loadBalancing" in spec:
                svc = create_k8s_service(model_name)
                benchmark_pod_spec["spec"]["containers"][0]["env"][0]["value"] = (
                    f"http://{svc.metadata.name}/v1"
                )
            created_pod = v1.create_namespaced_pod(namespace=namespace, body=benchmark_pod_spec)

            !kubectl wait pod --timeout 10m --for=condition=Ready -l app.kubernetes.io/instance=kubeai
            !kubectl wait --timeout 10m --for=condition=Ready pod/{benchmark_pod_spec["metadata"]["name"]}

            #model["metadata"]["name"] = model["metadata"]["name"] + f'-{i}'
            #models_client.create(body=model)
            models_client.patch(
                body=model,
                content_type="application/apply-patch+yaml",
                field_manager="benchmark",
            )



            !kubectl wait --timeout 30m --for=jsonpath='.status.replicas.ready'={model_replicas} model/{model_name}

            thread_count = bench.get("thread_count")
            max_concurrent_threads = bench.get("max_concurrent_threads")
            cmd = f'kubectl exec bench -- bench --threads=./data/large-exact.json --thread-count={thread_count} --max-concurrent-threads={max_concurrent_threads} --request-model={model_name} --max-completion-tokens=40 --request-timeout=2m --seed=2 --format=json'
            print(cmd)

            output = run(cmd, shell=True, stdout=PIPE, encoding='utf8')
            result = json.loads(output.stdout)
            print(result)
            all_results.append({
                "spec": spec,
                "bench": bench,
                "result": result
            }) 
        finally:
            if not "loadBalancing" in spec:
                v1.delete_namespaced_service(namespace=namespace, name=f"k8s-{model_name}")
            models_client.delete(name=model_name, namespace="default")
            !kubectl delete --now pod/{benchmark_pod_spec["metadata"]["name"]}
            i+=1

all_results

{'thread_count': 8000, 'max_concurrent_threads': 1200}
pod "kubeai-74c9f949c4-znkxd" deleted
pod "open-webui-0" deleted
pod/kubeai-74c9f949c4-58fng condition met
pod/open-webui-0 condition met
pod/bench condition met
model.kubeai.org/bench condition met
kubectl exec bench -- bench --threads=./data/large-exact.json --thread-count=8000 --max-concurrent-threads=1200 --request-model=bench --max-completion-tokens=40 --request-timeout=2m --seed=2 --format=json


2025/02/25 05:59:03 Shuffling dataset threads
2025/02/25 05:59:03 Trimming dataset threads (9204) to specified thread count (8000)
2025/02/25 05:59:03 Starting run...
2025/02/25 06:04:59 Run completed, starting summarization...


{'input_thread_count': 8000, 'input_messages_per_thread': {'mean': 7.378875}, 'duration': '5m55.179100635s', 'request_count': 59031, 'request_duration': {'mean': '6.374937267s'}, 'chunks_per_request': {'mean': 38.83833917772018}, 'failed_threads': 0, 'run_output_throughput': 6454.957501443925, 'run_total_throughput': 70589.45178693143, 'ttft': {'mean': '512.621158ms'}, 'itl': {'mean': '150.591253ms'}, 'prompt_tokens': 22779232, 'cached_prompt_tokens': 0, 'completion_tokens': 2292666, 'total_tokens': 25071898}
pod "bench" deleted
{'thread_count': 8000, 'max_concurrent_threads': 1200}
pod "kubeai-74c9f949c4-58fng" deleted
pod "open-webui-0" deleted
pod/kubeai-74c9f949c4-b77ll condition met
pod/open-webui-0 condition met
pod/bench condition met
model.kubeai.org/bench condition met
kubectl exec bench -- bench --threads=./data/large-exact.json --thread-count=8000 --max-concurrent-threads=1200 --request-model=bench --max-completion-tokens=40 --request-timeout=2m --seed=2 --format=json


2025/02/25 06:07:53 Shuffling dataset threads
2025/02/25 06:07:53 Trimming dataset threads (9204) to specified thread count (8000)
2025/02/25 06:07:53 Starting run...
2025/02/25 06:15:10 Run completed, starting summarization...


{'input_thread_count': 8000, 'input_messages_per_thread': {'mean': 7.378875}, 'duration': '7m16.528048126s', 'request_count': 59031, 'request_duration': {'mean': '7.941324749s'}, 'chunks_per_request': {'mean': 38.88797411529535}, 'failed_threads': 0, 'run_output_throughput': 5258.759453957003, 'run_total_throughput': 57474.964341256156, 'ttft': {'mean': '1.430461576s'}, 'itl': {'mean': '167.165802ms'}, 'prompt_tokens': 22793838, 'cached_prompt_tokens': 0, 'completion_tokens': 2295596, 'total_tokens': 25089434}
pod "bench" deleted
{'thread_count': 8000, 'max_concurrent_threads': 1200}
pod "kubeai-74c9f949c4-b77ll" deleted
pod "open-webui-0" deleted
pod/kubeai-74c9f949c4-64nbc condition met
pod/open-webui-0 condition met
pod/bench condition met
model.kubeai.org/bench condition met
kubectl exec bench -- bench --threads=./data/large-exact.json --thread-count=8000 --max-concurrent-threads=1200 --request-model=bench --max-completion-tokens=40 --request-timeout=2m --seed=2 --format=json


2025/02/25 06:18:17 Shuffling dataset threads
2025/02/25 06:18:17 Trimming dataset threads (9204) to specified thread count (8000)
2025/02/25 06:18:17 Starting run...
2025/02/25 06:29:37 Run completed, starting summarization...


{'input_thread_count': 8000, 'input_messages_per_thread': {'mean': 7.378875}, 'duration': '11m20.468260857s', 'request_count': 59031, 'request_duration': {'mean': '11.755163546s'}, 'chunks_per_request': {'mean': 38.86051396723755}, 'failed_threads': 0, 'run_output_throughput': 3371.235856189458, 'run_total_throughput': 36866.40721259429, 'ttft': {'mean': '4.486482254s'}, 'itl': {'mean': '186.749008ms'}, 'prompt_tokens': 22792401, 'cached_prompt_tokens': 0, 'completion_tokens': 2294019, 'total_tokens': 25086420}
pod "bench" deleted


[{'spec': {'minReplicas': 8,
   'maxReplicas': 8,
   'loadBalancing': {'strategy': 'PrefixHash',
    'prefixHash': {'meanLoadFactor': 125,
     'prefixCharLength': 100,
     'replication': 256}}},
  'bench': {'thread_count': 8000, 'max_concurrent_threads': 1200},
  'result': {'input_thread_count': 8000,
   'input_messages_per_thread': {'mean': 7.378875},
   'duration': '5m55.179100635s',
   'request_count': 59031,
   'request_duration': {'mean': '6.374937267s'},
   'chunks_per_request': {'mean': 38.83833917772018},
   'failed_threads': 0,
   'run_output_throughput': 6454.957501443925,
   'run_total_throughput': 70589.45178693143,
   'ttft': {'mean': '512.621158ms'},
   'itl': {'mean': '150.591253ms'},
   'prompt_tokens': 22779232,
   'cached_prompt_tokens': 0,
   'completion_tokens': 2292666,
   'total_tokens': 25071898}},
 {'spec': {'minReplicas': 8,
   'maxReplicas': 8,
   'loadBalancing': {'strategy': 'LeastLoad',
    'prefixHash': {'meanLoadFactor': 125,
     'prefixCharLength': 10

In [67]:
# One summary line per run: load-balancing strategy ("k8s-native" when the
# spec has no loadBalancing section), mean TTFT, mean ITL and total throughput.
for entry in all_results:
    strategy = entry["spec"].get("loadBalancing", {}).get("strategy", "k8s-native")
    metrics = entry["result"]
    ttft_mean = metrics["ttft"]["mean"]
    itl_mean = metrics["itl"]["mean"]
    total_tps = metrics["run_total_throughput"]
    print(f'Strategy {strategy}: TTFT={ttft_mean} ITL={itl_mean} TPS(total)={total_tps}')

Strategy PrefixHash: TTFT=512.621158ms ITL=150.591253ms TPS(total)=70589.45178693143
Strategy LeastLoad: TTFT=1.430461576s ITL=167.165802ms TPS(total)=57474.964341256156
Strategy k8s-native: TTFT=4.486482254s ITL=186.749008ms TPS(total)=36866.40721259429
