# Multi-Turn Chat Benchmark

```bash
# Create a GKE Autopilot cluster (create-auto) in us-central1.
gcloud container clusters create-auto cluster-1 \
    --location=us-central1

# Install KubeAI from its Helm repository using the upstream GKE values file.
# The --set flags below pass the Hugging Face token (HUGGING_FACE_HUB_TOKEN
# must be set in this shell), enable the vLLM PodMonitor for the Prometheus
# Operator, and disable the bundled open-webui chart. --wait blocks until the
# release's resources are ready.
helm repo add kubeai https://www.kubeai.org
helm repo update
curl -L -O https://raw.githubusercontent.com/substratusai/kubeai/refs/heads/main/charts/kubeai/values-gke.yaml
helm upgrade --install kubeai kubeai/kubeai \
    -f values-gke.yaml \
    --set secrets.huggingface.token=$HUGGING_FACE_HUB_TOKEN \
    --set metrics.prometheusOperator.vLLMPodMonitor.enabled=true \
    --set open-webui.enabled=false \
    --wait
```

In [None]:
# Apply the benchmark manifests, then block until the resources they define
# report the Ready condition.
# NOTE(review): presumably manifests.yaml defines the `bench` Pod that the
# benchmark loop later execs into — confirm against the manifest.
!kubectl apply -f ./manifests.yaml
!kubectl wait --for=condition=Ready -f ./manifests.yaml

In [None]:
from subprocess import run, PIPE
import json
from kubernetes import config, dynamic
from kubernetes.client import api_client
from copy import deepcopy

In [None]:
# Load credentials from the local kubeconfig. The call returns None and
# installs the loaded settings as the kubernetes package's default
# configuration, which the ApiClient created below picks up.
config.load_kube_config()
k8s_client = dynamic.DynamicClient(api_client.ApiClient())

# Dynamic handle for the KubeAI `Model` custom resource (kubeai.org/v1).
models_client = k8s_client.resources.get(kind="Model", api_version="kubeai.org/v1")

In [None]:
# Alternative base model using the Ollama engine on a CPU resource profile.
# Kept for reference; swap it in for the definition below to use it.
#base_model = {
#    "apiVersion": "kubeai.org/v1",
#    "kind": "Model",
#    "metadata": {
#        "name": "bench",
#        "namespace": "default",
#    },
#    "spec": {
#        "features": ["TextGeneration"],
#        "url": "ollama://qwen2:0.5b",
#        "engine": "OLlama",
#        "resourceProfile": "cpu:2"
#    },
#}

# Base KubeAI Model manifest shared by every benchmark run. Per-run settings
# (replica counts, load-balancing strategy) are merged into "spec" by the
# benchmark loop, which works on a deep copy of this dict.
base_model = {
    "apiVersion": "kubeai.org/v1",
    "kind": "Model",
    "metadata": {
        "name": "bench",
        "namespace": "default",
    },
    "spec": {
        "features": ["TextGeneration"],
        "url": "hf://neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8",
        "engine": "VLLM",
        # Command-line arguments passed to the vLLM server.
        "args": [
            "--enable-prefix-caching",
            "--max-model-len=16384",
            # Full flag name ("...-tokens"). The previous spelling
            # "--max-num-batched-token" is not a documented vLLM option and
            # would only be accepted through argparse prefix matching.
            "--max-num-batched-tokens=16384",
            "--gpu-memory-utilization=0.6",
            "--disable-log-requests",
        ],
        "resourceProfile": "nvidia-gpu-l4:1",
    },
}

In [44]:
# Load profiles to run. Earlier sweeps (now disabled) used thread_count=800
# with max_concurrent_threads values between 80 and 300; only the profile
# below is currently active.
benches = [
    dict(thread_count=1600, max_concurrent_threads=300),
]

# One Model spec override per load-balancing strategy under comparison, each
# pinned to exactly two replicas (minReplicas == maxReplicas).
specs = [
    {
        "minReplicas": 2,
        "maxReplicas": 2,
        "loadBalancing": {"strategy": strategy_name},
    }
    for strategy_name in ("PrefixHash", "LeastLoad")
]


In [45]:
# Run every load profile in `benches` against every Model override in `specs`
# and collect one {"spec", "bench", "result"} record per combination.
all_results = []

# Count of completed runs. The loop itself does not use it; it is left defined
# in case another cell reads it.
i = 0
for spec in specs:
    for bench in benches:
        print(bench)

        # Work on a deep copy so base_model is never mutated between runs.
        model = deepcopy(base_model)
        model["spec"].update(spec)
        # Direct indexing (rather than .get) so a missing key fails here with
        # a KeyError instead of leaking "None" into a kubectl command line.
        model_name = model["metadata"]["name"]
        model_replicas = model["spec"]["minReplicas"]

        # Server-side apply of the Model manifest under the "benchmark"
        # field manager.
        # NOTE(review): every spec is applied to the same Model name, so
        # replicas from the previous spec may be reused rather than restarted;
        # confirm that state carried over between strategies (e.g. warm
        # caches) does not skew the comparison.
        models_client.patch(
            body=model,
            content_type="application/apply-patch+yaml",
            field_manager="benchmark",
        )

        # Block until the expected number of replicas is ready. check=True
        # aborts the cell if the wait fails or times out, instead of
        # benchmarking a model that is not fully up.
        run(
            [
                "kubectl", "wait",
                "--timeout", "30m",
                f"--for=jsonpath=.status.replicas.ready={model_replicas}",
                f"model/{model_name}",
            ],
            check=True,
        )

        # Benchmark invocation executed inside the `bench` pod. It is built as
        # an argument list so no shell parsing is involved.
        bench_cmd = [
            "kubectl", "exec", "bench", "--", "bench",
            "--threads=./data/large-exact.json",
            f"--thread-count={bench['thread_count']}",
            f"--max-concurrent-threads={bench['max_concurrent_threads']}",
            f"--request-model={model_name}",
            "--max-completion-tokens=10",
            "--request-timeout=2m",
            "--format=json",
        ]
        print(" ".join(bench_cmd))

        # Only stdout (the JSON summary) is captured; stderr is left attached
        # so progress logs keep streaming. check=True raises
        # CalledProcessError on a non-zero exit so a failed run is reported as
        # such rather than as a JSON decoding error on empty output.
        completed = run(bench_cmd, stdout=PIPE, encoding="utf8", check=True)
        result = json.loads(completed.stdout)
        print(result)

        all_results.append({
            "spec": spec,
            "bench": bench,
            "result": result,
        })
        i += 1

all_results

{'thread_count': 1600, 'max_concurrent_threads': 300}
model.kubeai.org/bench condition met
kubectl exec bench -- bench --threads=./data/large-exact.json --thread-count=1600 --max-concurrent-threads=300 --request-model=bench --max-completion-tokens=10 --request-timeout=2m --format=json


2025/02/23 16:36:49 Shuffling dataset threads
2025/02/23 16:36:49 Trimming dataset threads (9204) to specified thread count (1600)
2025/02/23 16:36:49 Starting run...
2025/02/23 16:37:05 Thread[99/1600]: Finished
2025/02/23 16:37:05 Thread[98/1600]: Finished
2025/02/23 16:37:05 Thread[147/1600]: Finished
2025/02/23 16:37:05 Thread[293/1600]: Finished
2025/02/23 16:37:05 Thread[174/1600]: Finished
2025/02/23 16:37:05 Thread[8/1600]: Finished
2025/02/23 16:37:05 Thread[65/1600]: Finished
2025/02/23 16:37:05 Thread[184/1600]: Finished
2025/02/23 16:37:05 Thread[158/1600]: Finished
2025/02/23 16:37:05 Thread[59/1600]: Finished
2025/02/23 16:37:05 Thread[18/1600]: Finished
2025/02/23 16:37:05 Thread[248/1600]: Finished
2025/02/23 16:37:05 Thread[263/1600]: Finished
2025/02/23 16:37:05 Thread[205/1600]: Finished
2025/02/23 16:37:09 Thread[270/1600]: Finished
2025/02/23 16:37:10 Thread[71/1600]: Finished
2025/02/23 16:37:10 Thread[207/1600]: Finished
2025/02/23 16:37:10 Thread[224/1600]: Fini

{'input_thread_count': 1600, 'input_messages_per_thread': {'mean': 7.47125}, 'duration': '4m32.832534367s', 'request_count': 11954, 'request_duration': {'mean': '5.952973492s'}, 'chunks_per_request': {'mean': 9.920612347331437}, 'failed_threads': 0, 'run_output_throughput': 434.9591234613171, 'run_total_throughput': 12797.425380740498, 'ttft': {'mean': '4.428007018s'}, 'itl': {'mean': '149.843405ms'}, 'prompt_tokens': 3372883, 'cached_prompt_tokens': 0, 'completion_tokens': 118671, 'total_tokens': 3491554}
{'thread_count': 1600, 'max_concurrent_threads': 300}
model.kubeai.org/bench condition met
kubectl exec bench -- bench --threads=./data/large-exact.json --thread-count=1600 --max-concurrent-threads=300 --request-model=bench --max-completion-tokens=10 --request-timeout=2m --format=json


2025/02/23 16:41:25 Shuffling dataset threads
2025/02/23 16:41:25 Trimming dataset threads (9204) to specified thread count (1600)
2025/02/23 16:41:25 Starting run...
2025/02/23 16:41:43 Thread[269/1600]: Finished
2025/02/23 16:41:43 Thread[205/1600]: Finished
2025/02/23 16:41:43 Thread[228/1600]: Finished
2025/02/23 16:41:43 Thread[98/1600]: Finished
2025/02/23 16:41:43 Thread[232/1600]: Finished
2025/02/23 16:41:43 Thread[121/1600]: Finished
2025/02/23 16:41:43 Thread[158/1600]: Finished
2025/02/23 16:41:43 Thread[13/1600]: Finished
2025/02/23 16:41:43 Thread[144/1600]: Finished
2025/02/23 16:41:43 Thread[72/1600]: Finished
2025/02/23 16:41:43 Thread[136/1600]: Finished
2025/02/23 16:41:44 Thread[36/1600]: Finished
2025/02/23 16:41:44 Thread[278/1600]: Finished
2025/02/23 16:41:48 Thread[123/1600]: Finished
2025/02/23 16:41:48 Thread[8/1600]: Finished
2025/02/23 16:41:48 Thread[35/1600]: Finished
2025/02/23 16:41:48 Thread[285/1600]: Finished
2025/02/23 16:41:48 Thread[19/1600]: Fini

{'input_thread_count': 1600, 'input_messages_per_thread': {'mean': 7.47125}, 'duration': '5m38.930311226s', 'request_count': 11954, 'request_duration': {'mean': '8.042444807s'}, 'chunks_per_request': {'mean': 9.918019073113602}, 'failed_threads': 0, 'run_output_throughput': 350.10441990500044, 'run_total_throughput': 10302.828883521015, 'ttft': {'mean': '6.000048281s'}, 'itl': {'mean': '190.192171ms'}, 'prompt_tokens': 3373280, 'cached_prompt_tokens': 0, 'completion_tokens': 118661, 'total_tokens': 3491941}


2025/02/23 16:47:04 Thread[1421/1600]: Finished
2025/02/23 16:47:04 Run completed, starting summarization...


[{'spec': {'minReplicas': 2,
   'maxReplicas': 2,
   'loadBalancing': {'strategy': 'PrefixHash'}},
  'bench': {'thread_count': 1600, 'max_concurrent_threads': 300},
  'result': {'input_thread_count': 1600,
   'input_messages_per_thread': {'mean': 7.47125},
   'duration': '4m32.832534367s',
   'request_count': 11954,
   'request_duration': {'mean': '5.952973492s'},
   'chunks_per_request': {'mean': 9.920612347331437},
   'failed_threads': 0,
   'run_output_throughput': 434.9591234613171,
   'run_total_throughput': 12797.425380740498,
   'ttft': {'mean': '4.428007018s'},
   'itl': {'mean': '149.843405ms'},
   'prompt_tokens': 3372883,
   'cached_prompt_tokens': 0,
   'completion_tokens': 118671,
   'total_tokens': 3491554}},
 {'spec': {'minReplicas': 2,
   'maxReplicas': 2,
   'loadBalancing': {'strategy': 'LeastLoad'}},
  'bench': {'thread_count': 1600, 'max_concurrent_threads': 300},
  'result': {'input_thread_count': 1600,
   'input_messages_per_thread': {'mean': 7.47125},
   'duratio

In [47]:
# One summary line per run: load-balancing strategy, mean TTFT, mean ITL and
# the run's total throughput.
for entry in all_results:
    strategy = entry["spec"]["loadBalancing"]["strategy"]
    metrics = entry["result"]
    print(
        f'Strategy {strategy}: '
        f'TTFT={metrics["ttft"]["mean"]} '
        f'ITL={metrics["itl"]["mean"]} '
        f'TPS(total)={metrics["run_total_throughput"]}'
    )

Strategy PrefixHash: TTFT=4.428007018s ITL=149.843405ms TPS(total)=12797.425380740498
Strategy LeastLoad: TTFT=6.000048281s ITL=190.192171ms TPS(total)=10302.828883521015


![Grafana](./screenshots/graphana.png)