# Multi-Turn Chat Benchmark

```bash
# Create a GKE Autopilot cluster to host the benchmark.
gcloud container clusters create-auto cluster-1 \
    --location=us-central1

# Install KubeAI from its Helm repository using the GKE values file.
helm repo add kubeai https://www.kubeai.org
helm repo update
# --fail: abort on an HTTP error instead of saving the error page as values-gke.yaml.
curl --fail -L -O https://raw.githubusercontent.com/substratusai/kubeai/refs/heads/main/charts/kubeai/values-gke.yaml
# The ${VAR:?} expansion aborts the command when the token is unset or empty,
# rather than silently installing KubeAI with an empty Hugging Face secret.
helm upgrade --install kubeai kubeai/kubeai \
    -f values-gke.yaml \
    --set secrets.huggingface.token="${HUGGING_FACE_HUB_TOKEN:?HUGGING_FACE_HUB_TOKEN must be set}" \
    --set metrics.prometheusOperator.vLLMPodMonitor.enabled=true \
    --set open-webui.enabled=false \
    --wait
```

In [None]:
# Apply the resources declared in ./manifests.yaml, then block until every
# resource in that file reports the Ready condition.
# NOTE(review): presumably this creates the `bench` pod that later cells
# `kubectl exec` into — confirm against manifests.yaml.
!kubectl apply -f ./manifests.yaml
!kubectl wait --for=condition=Ready -f ./manifests.yaml

In [None]:
from subprocess import run, PIPE
import json
from kubernetes import config, dynamic
from kubernetes.client import api_client
from copy import deepcopy

In [None]:
# Build a dynamic Kubernetes client; the return value of load_kube_config()
# is handed to ApiClient as its configuration.
_kube_api = api_client.ApiClient(configuration=config.load_kube_config())
k8s_client = dynamic.DynamicClient(_kube_api)

# Resource handle for the kubeai.org/v1 Model kind, used to apply Model
# objects in the benchmark loop.
models_client = k8s_client.resources.get(
    api_version="kubeai.org/v1",
    kind="Model",
)

In [None]:
# Alternative CPU-only base model (Ollama engine), kept for reference.
#base_model = {
#    "apiVersion": "kubeai.org/v1",
#    "kind": "Model",
#    "metadata": {
#        "name": "bench",
#        "namespace": "default",
#    },
#    "spec": {
#        "features": ["TextGeneration"],
#        "url": "ollama://qwen2:0.5b",
#        "engine": "OLlama",
#        "resourceProfile": "cpu:2"
#    },
#}

# Base KubeAI Model manifest for the benchmark. Each run deep-copies this dict
# and overlays one entry from `specs` onto its "spec" section.
base_model = {
    "apiVersion": "kubeai.org/v1",
    "kind": "Model",
    "metadata": {
        "name": "bench",
        "namespace": "default",
    },
    "spec": {
        "features": ["TextGeneration"],
        "url": "hf://neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8",
        "engine": "VLLM",
        # Command-line flags for the vLLM server.
        "args": [
            "--enable-prefix-caching",
            "--max-model-len=16384",
            # Full flag name: the previous "--max-num-batched-token" spelling
            # only worked through argparse prefix abbreviation.
            "--max-num-batched-tokens=16384",
            "--gpu-memory-utilization=0.6",
            "--disable-log-requests"
        ],
        "resourceProfile": "nvidia-gpu-l4:1"
    },
}

In [None]:
# Benchmark load levels: 400 threads per run, with the concurrency cap
# stepped from 120 to 180 in increments of 20.
benches = [
    {"thread_count": 400, "max_concurrent_threads": concurrency}
    for concurrency in range(120, 181, 20)
]
# Model spec overlays to benchmark. Each entry is merged into the base
# model's "spec" before a run.
specs = [
    dict(
        minReplicas=2,
        maxReplicas=2,
        loadBalancing=dict(strategy="LeastLoad"),
    ),
    # PrefixHash variant, currently disabled:
    #{
    #    "minReplicas": 2,
    #    "maxReplicas": 2,
    #    "loadBalancing": {
    #        "strategy": "PrefixHash",
    #        "prefixHash": {
    #           "meanLoadPercentage": 125,
    #           "replication": 100,
    #           "prefixCharLength": 10,
    #        },
    #    },
    #},
]


In [27]:
results = []

i = 0
for spec in specs:
    for  bench in benches:
        print(bench)
        try:
            model = deepcopy(base_model)
            model["spec"].update(spec)
            #model["metadata"]["name"] = model["metadata"]["name"] + f'-{i}'
            #models_client.create(body=model)
            models_client.patch(
                body=model,
                content_type="application/apply-patch+yaml",
                field_manager="benchmark",
            )


            model_name = model.get("metadata").get("name")
            model_replicas = model.get("spec").get("minReplicas")

            !kubectl wait --timeout 30m --for=jsonpath='.status.replicas.ready'={model_replicas} model/{model_name}

            thread_count = bench.get("thread_count")
            max_concurrent_threads = bench.get("max_concurrent_threads")
            cmd = f'kubectl exec bench -- bench --threads=./data/large-exact.json --thread-count={thread_count} --max-concurrent-threads={max_concurrent_threads} --request-model={model_name} --max-completion-tokens=10 --request-timeout=2m --format=json'
            print(cmd)

            output = run(cmd, shell=True, stdout=PIPE, encoding='utf8')
            result = json.loads(output.stdout)
            print(result)
            results.append({
                "spec": spec,
                "bench": bench,
                "results": results
            }) 
        finally:
            #models_client.delete(name=model_name, namespace="default")
            i+=1

results

{'thread_count': 400, 'max_concurrent_threads': 100}
model.kubeai.org/bench condition met
kubectl exec bench -- bench --threads=./data/large-exact.json --thread-count=400 --max-concurrent-threads=100 --request-model=bench --max-completion-tokens=10 --request-timeout=2m --format=json


2025/02/23 15:39:17 Shuffling dataset threads
2025/02/23 15:39:17 Trimming dataset threads (9204) to specified thread count (400)
2025/02/23 15:39:17 Starting run...
2025/02/23 15:39:23 Thread[4/400]: Finished
2025/02/23 15:39:23 Thread[40/400]: Finished
2025/02/23 15:39:23 Thread[43/400]: Finished
2025/02/23 15:39:23 Thread[54/400]: Finished
2025/02/23 15:39:23 Thread[71/400]: Finished
2025/02/23 15:39:23 Thread[88/400]: Finished
2025/02/23 15:39:23 Thread[36/400]: Finished
2025/02/23 15:39:23 Thread[19/400]: Finished
2025/02/23 15:39:23 Thread[67/400]: Finished
2025/02/23 15:39:23 Thread[35/400]: Finished
2025/02/23 15:39:23 Thread[72/400]: Finished
2025/02/23 15:39:23 Thread[37/400]: Finished
2025/02/23 15:39:23 Thread[24/400]: Finished
2025/02/23 15:39:23 Thread[38/400]: Finished
2025/02/23 15:39:23 Thread[45/400]: Finished
2025/02/23 15:39:24 Thread[59/400]: Finished
2025/02/23 15:39:24 Thread[26/400]: Finished
2025/02/23 15:39:24 Thread[51/400]: Finished
2025/02/23 15:39:24 Threa

{'input_thread_count': 400, 'input_messages_per_thread': {'mean': 7.3975}, 'duration': '40.828928239s', 'request_count': 2959, 'request_duration': {'mean': '1.210520826s'}, 'chunks_per_request': {'mean': 9.969246367015884}, 'failed_threads': 0, 'run_output_throughput': 722.5024332581526, 'run_total_throughput': 21763.44171462296, 'ttft': {'mean': '505.434254ms'}, 'itl': {'mean': '69.287217ms'}, 'prompt_tokens': 859079, 'cached_prompt_tokens': 0, 'completion_tokens': 29499, 'total_tokens': 888578}
{'thread_count': 400, 'max_concurrent_threads': 120}
model.kubeai.org/bench condition met
kubectl exec bench -- bench --threads=./data/large-exact.json --thread-count=400 --max-concurrent-threads=120 --request-model=bench --max-completion-tokens=10 --request-timeout=2m --format=json


2025/02/23 15:40:01 Shuffling dataset threads
2025/02/23 15:40:01 Trimming dataset threads (9204) to specified thread count (400)
2025/02/23 15:40:01 Starting run...
2025/02/23 15:40:08 Thread[65/400]: Finished
2025/02/23 15:40:08 Thread[8/400]: Finished
2025/02/23 15:40:08 Thread[51/400]: Finished
2025/02/23 15:40:08 Thread[13/400]: Finished
2025/02/23 15:40:08 Thread[35/400]: Finished
2025/02/23 15:40:08 Thread[58/400]: Finished
2025/02/23 15:40:08 Thread[38/400]: Finished
2025/02/23 15:40:08 Thread[45/400]: Finished
2025/02/23 15:40:08 Thread[37/400]: Finished
2025/02/23 15:40:08 Thread[19/400]: Finished
2025/02/23 15:40:08 Thread[98/400]: Finished
2025/02/23 15:40:08 Thread[36/400]: Finished
2025/02/23 15:40:08 Thread[71/400]: Finished
2025/02/23 15:40:08 Thread[18/400]: Finished
2025/02/23 15:40:08 Thread[72/400]: Finished
2025/02/23 15:40:08 Thread[118/400]: Finished
2025/02/23 15:40:08 Thread[120/400]: Finished
2025/02/23 15:40:08 Thread[99/400]: Finished
2025/02/23 15:40:08 Thr

{'input_thread_count': 400, 'input_messages_per_thread': {'mean': 7.3975}, 'duration': '37.863872691s', 'request_count': 2959, 'request_duration': {'mean': '1.301200044s'}, 'chunks_per_request': {'mean': 9.959445758702264}, 'failed_threads': 0, 'run_output_throughput': 778.3144698509625, 'run_total_throughput': 23470.235262311988, 'ttft': {'mean': '587.607401ms'}, 'itl': {'mean': '70.213843ms'}, 'prompt_tokens': 859204, 'cached_prompt_tokens': 0, 'completion_tokens': 29470, 'total_tokens': 888674}
{'thread_count': 400, 'max_concurrent_threads': 140}
model.kubeai.org/bench condition met
kubectl exec bench -- bench --threads=./data/large-exact.json --thread-count=400 --max-concurrent-threads=140 --request-model=bench --max-completion-tokens=10 --request-timeout=2m --format=json


2025/02/23 15:40:43 Shuffling dataset threads
2025/02/23 15:40:43 Trimming dataset threads (9204) to specified thread count (400)
2025/02/23 15:40:43 Starting run...
2025/02/23 15:40:49 Thread[57/400]: Finished
2025/02/23 15:40:50 Thread[6/400]: Finished
2025/02/23 15:40:50 Thread[29/400]: Finished
2025/02/23 15:40:50 Thread[18/400]: Finished
2025/02/23 15:40:50 Thread[54/400]: Finished
2025/02/23 15:40:50 Thread[102/400]: Finished
2025/02/23 15:40:50 Thread[43/400]: Finished
2025/02/23 15:40:50 Thread[71/400]: Finished
2025/02/23 15:40:50 Thread[59/400]: Finished
2025/02/23 15:40:50 Thread[67/400]: Finished
2025/02/23 15:40:50 Thread[133/400]: Finished
2025/02/23 15:40:50 Thread[65/400]: Finished
2025/02/23 15:40:50 Thread[26/400]: Finished
2025/02/23 15:40:50 Thread[16/400]: Finished
2025/02/23 15:40:50 Thread[130/400]: Finished
2025/02/23 15:40:50 Thread[24/400]: Finished
2025/02/23 15:40:50 Thread[120/400]: Finished
2025/02/23 15:40:50 Thread[45/400]: Finished
2025/02/23 15:40:50 T

{'input_thread_count': 400, 'input_messages_per_thread': {'mean': 7.3975}, 'duration': '37.721088577s', 'request_count': 2959, 'request_duration': {'mean': '1.479939865s'}, 'chunks_per_request': {'mean': 9.957755998648192}, 'failed_threads': 0, 'run_output_throughput': 781.1280403494491, 'run_total_throughput': 23556.610732114503, 'ttft': {'mean': '668.413864ms'}, 'itl': {'mean': '79.495333ms'}, 'prompt_tokens': 859116, 'cached_prompt_tokens': 0, 'completion_tokens': 29465, 'total_tokens': 888581}


2025/02/23 15:41:20 Thread[253/400]: Finished
2025/02/23 15:41:20 Run completed, starting summarization...


[{'spec': {'minReplicas': 2,
   'maxReplicas': 2,
   'loadBalancing': {'strategy': 'LeastLoad'}},
  'bench': {'thread_count': 400, 'max_concurrent_threads': 100},
  'results': [...]},
 {'spec': {'minReplicas': 2,
   'maxReplicas': 2,
   'loadBalancing': {'strategy': 'LeastLoad'}},
  'bench': {'thread_count': 400, 'max_concurrent_threads': 120},
  'results': [...]},
 {'spec': {'minReplicas': 2,
   'maxReplicas': 2,
   'loadBalancing': {'strategy': 'LeastLoad'}},
  'bench': {'thread_count': 400, 'max_concurrent_threads': 140},
  'results': [...]}]