# Multi-Turn Chat Benchmark

```bash
# Create the GKE cluster that will host KubeAI and the benchmark pod.
gcloud container clusters create-auto cluster-1 \
    --location=us-central1

# Install KubeAI from its Helm repo using the GKE values file from the repo.
helm repo add kubeai https://www.kubeai.org
helm repo update
curl -L -O https://raw.githubusercontent.com/substratusai/kubeai/refs/heads/main/charts/kubeai/values-gke.yaml
# The token is quoted so it cannot be word-split or globbed, and `:?` aborts
# the command if the variable is unset instead of installing an empty token.
helm upgrade --install kubeai kubeai/kubeai \
    -f values-gke.yaml \
    --set secrets.huggingface.token="${HUGGING_FACE_HUB_TOKEN:?HUGGING_FACE_HUB_TOKEN must be set}" \
    --set metrics.prometheusOperator.vLLMPodMonitor.enabled=true \
    --set open-webui.enabled=false \
    --wait
```

In [136]:
!kubectl apply -f ./manifests.yaml
!kubectl wait --for=condition=Ready -f ./manifests.yaml

pod/bench created
pod/bench condition met


In [None]:
from subprocess import run, PIPE
import json
from kubernetes import config, dynamic
from kubernetes.client import api_client
from copy import deepcopy

In [None]:
# Build a dynamic Kubernetes client from the local kubeconfig, then look up the
# KubeAI `Model` custom resource so Models can be patched and deleted later.
kube_configuration = config.load_kube_config()
base_api_client = api_client.ApiClient(configuration=kube_configuration)
k8s_client = dynamic.DynamicClient(base_api_client)
models_client = k8s_client.resources.get(
    api_version="kubeai.org/v1",
    kind="Model",
)

In [114]:
# Alternative CPU-only base model (Ollama engine), kept for quick local runs:
#base_model = {
#    "apiVersion": "kubeai.org/v1",
#    "kind": "Model",
#    "metadata": {
#        "name": "bench",
#        "namespace": "default",
#    },
#    "spec": {
#        "features": ["TextGeneration"],
#        "url": "ollama://qwen2:0.5b",
#        "engine": "OLlama",
#        "resourceProfile": "cpu:2"
#    },
#}

# Base KubeAI Model manifest used for every benchmark run. Each entry in
# `specs` is merged over a deep copy of this dict's "spec" before applying.
base_model = {
    "apiVersion": "kubeai.org/v1",
    "kind": "Model",
    "metadata": {
        "name": "bench",
        "namespace": "default",
    },
    "spec": {
        "features": ["TextGeneration"],
        "url": "hf://neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8",
        "engine": "VLLM",
        "args": [
            "--enable-prefix-caching",
            "--max-model-len=16384",
            # Spelled out in full: the flag is `--max-num-batched-tokens`
            # (plural); the shortened form relies on prefix matching.
            "--max-num-batched-tokens=16384",
            "--gpu-memory-utilization=0.8",
            "--disable-log-requests",
        ],
        "resourceProfile": "nvidia-gpu-l4:1",
    },
}

In [137]:
# Benchmark workloads. Each entry supplies the `--thread-count` and
# `--max-concurrent-threads` values passed to the bench tool.
#
# Previously tried settings (currently disabled), as
# (thread_count, max_concurrent_threads):
#   sweep 1: (800, 80), (800, 100), (800, 120), (800, 140), (800, 160),
#            (800, 180), (800, 200)
#   sweep 2: (800, 220), (800, 240), (800, 260), (800, 280), (800, 300)
#   sweep 3: (800, 180), (800, 220), (800, 260), (800, 300)
#   single:  (900, 280)
benches = [
    dict(thread_count=3000, max_concurrent_threads=300),
]
# Model spec variants under test. Each entry is merged into the base model's
# "spec" before it is applied to the cluster.
specs = [
    dict(
        minReplicas=2,
        maxReplicas=2,
        loadBalancing=dict(
            strategy="PrefixHash",
            prefixHash=dict(
                meanLoadFactor=120,
                prefixCharLength=100,
                replication=256,
            ),
        ),
    ),
    # Disabled comparison variant using the LeastLoad strategy:
    #dict(
    #    minReplicas=2,
    #    maxReplicas=2,
    #    loadBalancing=dict(
    #        strategy="LeastLoad",
    #        prefixHash=dict(
    #            meanLoadFactor=120,
    #            prefixCharLength=100,
    #            replication=40,
    #        ),
    #    ),
    #),
]


In [138]:
# Run every workload in `benches` against every Model variant in `specs`.
# For each combination: apply the Model, wait for its replicas to be ready,
# run the bench tool inside the `bench` pod, parse its JSON report, and
# finally delete the Model so the next run starts from a clean state.
# Each report is collected in `all_results` as {"spec", "bench", "result"}.
all_results = []

i = 0  # number of attempted runs; only used by the disabled naming scheme below
for spec in specs:
    for bench in benches:
        print(bench)

        model = deepcopy(base_model)
        model["spec"].update(spec)
        #model["metadata"]["name"] = model["metadata"]["name"] + f'-{i}'
        #models_client.create(body=model)

        # Resolve these before entering the try block so the cleanup in
        # `finally` can never hit an unbound name and mask the real error.
        model_name = model["metadata"]["name"]
        model_namespace = model["metadata"]["namespace"]
        model_replicas = model["spec"]["minReplicas"]

        try:
            # Server-side apply: creates the Model or updates it in place.
            models_client.patch(
                body=model,
                content_type="application/apply-patch+yaml",
                field_manager="benchmark",
            )

            # check=True aborts this run if the replicas never become ready,
            # instead of benchmarking a model that is not serving yet.
            run(
                [
                    "kubectl", "wait",
                    f"--namespace={model_namespace}",
                    "--timeout", "30m",
                    f"--for=jsonpath=.status.replicas.ready={model_replicas}",
                    f"model/{model_name}",
                ],
                check=True,
            )

            thread_count = bench.get("thread_count")
            max_concurrent_threads = bench.get("max_concurrent_threads")
            # Argument list (no shell) so values are never re-parsed by a shell.
            cmd = [
                "kubectl", "exec", "bench", "--", "bench",
                "--threads=./data/large-exact.json",
                f"--thread-count={thread_count}",
                f"--max-concurrent-threads={max_concurrent_threads}",
                f"--request-model={model_name}",
                "--max-completion-tokens=40",
                "--request-timeout=2m",
                "--seed=2",
                "--format=json",
            ]
            print(" ".join(cmd))

            # check=True surfaces a failed bench run as CalledProcessError
            # rather than as a confusing JSON decode error on empty stdout.
            output = run(cmd, stdout=PIPE, encoding='utf8', check=True)
            result = json.loads(output.stdout)
            print(result)
            all_results.append({
                "spec": spec,
                "bench": bench,
                "result": result,
            })
        finally:
            # Always remove the Model, even when the run failed part-way.
            models_client.delete(name=model_name, namespace=model_namespace)
            i += 1

all_results

{'thread_count': 3000, 'max_concurrent_threads': 300}
model.kubeai.org/bench condition met
kubectl exec bench -- bench --threads=./data/large-exact.json --thread-count=3000 --max-concurrent-threads=300 --request-model=bench --max-completion-tokens=40 --request-timeout=2m --seed=2 --format=json


2025/02/23 22:48:17 Shuffling dataset threads
2025/02/23 22:48:17 Trimming dataset threads (9204) to specified thread count (3000)
2025/02/23 22:48:17 Starting run...
2025/02/23 22:58:59 Run completed, starting summarization...


{'input_thread_count': 3000, 'input_messages_per_thread': {'mean': 7.406333333333333}, 'duration': '10m41.722326003s', 'request_count': 22219, 'request_duration': {'mean': '8.1615521s'}, 'chunks_per_request': {'mean': 38.85624915612764}, 'failed_threads': 0, 'run_output_throughput': 1345.51341758361, 'run_total_throughput': 14708.897629900868, 'ttft': {'mean': '2.244834737s'}, 'itl': {'mean': '151.25755ms'}, 'prompt_tokens': 8575582, 'cached_prompt_tokens': 0, 'completion_tokens': 863446, 'total_tokens': 9439028}


[{'spec': {'minReplicas': 2,
   'maxReplicas': 2,
   'loadBalancing': {'strategy': 'PrefixHash',
    'prefixHash': {'meanLoadFactor': 120,
     'prefixCharLength': 100,
     'replication': 256}}},
  'bench': {'thread_count': 3000, 'max_concurrent_threads': 300},
  'result': {'input_thread_count': 3000,
   'input_messages_per_thread': {'mean': 7.406333333333333},
   'duration': '10m41.722326003s',
   'request_count': 22219,
   'request_duration': {'mean': '8.1615521s'},
   'chunks_per_request': {'mean': 38.85624915612764},
   'failed_threads': 0,
   'run_output_throughput': 1345.51341758361,
   'run_total_throughput': 14708.897629900868,
   'ttft': {'mean': '2.244834737s'},
   'itl': {'mean': '151.25755ms'},
   'prompt_tokens': 8575582,
   'cached_prompt_tokens': 0,
   'completion_tokens': 863446,
   'total_tokens': 9439028}}]

In [None]:
for r in all_results:
    print(f'Strategy {r["spec"]["loadBalancing"]["strategy"]}: TTFT={r["result"]["ttft"]["mean"]} ITL={r["result"]["itl"]["mean"]} TPS(total)={r["result"]["run_total_throughput"]}')

![Grafana](./screenshots/graphana.png)