In [18]:
import json
import statistics
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

import numpy as np
import pandas as pd
import requests

In [19]:
# Chat-completions endpoint that every request in this notebook is POSTed to.
INFERENCE_URL = "http://qwen3-vl-4b-instruct-predictor.qwen3.svc.cluster.local:8080/v1/chat/completions"

# Value sent in the "model" field of each request payload.
MODEL_NAME = "qwen3-vl-4b-instruct"
# Value sent in the "max_tokens" field of each request payload.
MAX_TOKENS = 200
# Passed as the `timeout` argument of requests.post.
TIMEOUT = 120  # seconds

In [20]:
# Text file with one image URL per line.
IMAGE_FILE = "images.txt"

# Read with an explicit encoding so the result does not depend on the locale,
# strip surrounding whitespace and skip blank lines.
image_urls = [
    line.strip()
    for line in Path(IMAGE_FILE).read_text(encoding="utf-8").splitlines()
    if line.strip()
]

# Fail fast here: load_test picks URLs with `i % len(image_urls)`, which would
# otherwise blow up later with an unrelated-looking ZeroDivisionError.
assert image_urls, f"{IMAGE_FILE} contains no image URLs"

len(image_urls), image_urls[:2]

(10,
 ['https://img.odcdn.com.br/wp-content/uploads/2021/12/placa-mercosul-o-que-e.jpg',
  'https://quatrorodas.abril.com.br/wp-content/uploads/2017/08/aaa0001.jpg?quality=70&strip=info&w=720&crop=1'])

In [21]:
def run_inference(image_url):
    """Send one chat-completion request for ``image_url`` and time it.

    The request asks the model (``MODEL_NAME``) to identify the licence plate
    and vehicle brand in the image and is POSTed to ``INFERENCE_URL`` with a
    ``TIMEOUT``-second timeout.

    Args:
        image_url: URL of the image, forwarded to the server as an
            ``image_url`` content part.

    Returns:
        A dict that always contains ``latency`` (seconds, float) and
        ``success`` (bool). On success it also contains ``response`` (the
        decoded JSON body) and ``prompt_tokens`` / ``completion_tokens`` /
        ``total_tokens`` taken from the body's ``usage`` object (0 when
        absent). On failure it contains ``error`` (the exception message).

    This function never raises: any exception is reported through the
    returned dict so that a load test keeps running when requests fail.
    """
    payload = {
        "model": MODEL_NAME,
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Identifique a placa e a marca do veiculo na imagem."},
                    {
                        "type": "image_url",
                        "image_url": {"url": image_url}
                    }
                ]
            }
        ],
        "max_tokens": MAX_TOKENS
    }

    # perf_counter is monotonic, so the measurement cannot be skewed by
    # system clock adjustments while the request is in flight.
    start = time.perf_counter()
    try:
        resp = requests.post(
            INFERENCE_URL,
            headers={"Content-Type": "application/json"},
            json=payload,
            timeout=TIMEOUT
        )
        # Stop the clock as soon as the response is back, before any
        # client-side JSON parsing.
        latency = time.perf_counter() - start

        # Check the HTTP status before touching the body: error responses are
        # not guaranteed to be JSON, and decoding first would replace the real
        # HTTP error with a misleading JSON decode error.
        resp.raise_for_status()

        data = resp.json()  # parse once and reuse
        usage = data.get("usage") or {}  # tolerate a missing or null "usage"
        return {
            "latency": latency,
            "success": True,
            "response": data,
            "prompt_tokens": usage.get("prompt_tokens", 0),
            "completion_tokens": usage.get("completion_tokens", 0),
            "total_tokens": usage.get("total_tokens", 0),
        }
    except Exception as e:
        # Deliberately broad: timeouts, connection errors, HTTP errors and
        # malformed bodies are all recorded as failed requests.
        latency = time.perf_counter() - start
        return {
            "latency": latency,
            "success": False,
            "error": str(e)
        }


In [22]:
def load_test(concurrency, total_requests):
    """Fire ``total_requests`` inference calls using ``concurrency`` threads.

    Image URLs are taken from the module-level ``image_urls`` list in
    round-robin order and each request is executed by ``run_inference``.

    Args:
        concurrency: Number of worker threads in the pool.
        total_requests: Number of requests to submit (must be >= 1).

    Returns:
        A dict of summary metrics: ``concurrency``, ``requests``,
        ``total_time_s``, ``throughput_req_s``, ``latency_avg_s``,
        ``latency_p95_s``, ``latency_max_s``, ``errors`` and the summed
        ``prompt_tokens`` / ``completion_tokens`` / ``total_tokens`` of the
        successful requests, plus ``throughput_tokens_s`` (completion tokens
        per second). Latency figures include failed requests.

    Raises:
        ValueError: If ``total_requests`` is below 1 or ``image_urls`` is
            empty.
    """
    if total_requests < 1:
        raise ValueError("total_requests must be at least 1")
    if not image_urls:
        raise ValueError("image_urls is empty; there is nothing to send")

    latencies = []
    errors = 0

    total_prompt_tokens = 0
    total_completion_tokens = 0
    total_tokens = 0

    # Monotonic clock: immune to system clock adjustments during the run.
    start_time = time.perf_counter()

    with ThreadPoolExecutor(max_workers=concurrency) as executor:
        # Cycle through the available images so any request count works.
        futures = [
            executor.submit(run_inference, image_urls[i % len(image_urls)])
            for i in range(total_requests)
        ]

        for future in as_completed(futures):
            result = future.result()
            latencies.append(result["latency"])
            if not result["success"]:
                errors += 1
            else:
                total_prompt_tokens += result["prompt_tokens"]
                total_completion_tokens += result["completion_tokens"]
                total_tokens += result["total_tokens"]

    total_time = time.perf_counter() - start_time

    # statistics.quantiles needs at least two data points; with a single
    # request the only observed latency is the best available p95 estimate.
    if len(latencies) >= 2:
        latency_p95 = statistics.quantiles(latencies, n=20)[18]
    else:
        latency_p95 = latencies[0]

    def _per_second(count):
        # Guard against a zero elapsed time on coarse-resolution clocks.
        return round(count / total_time, 2) if total_time > 0 else 0.0

    return {
        "concurrency": concurrency,
        "requests": total_requests,
        "total_time_s": round(total_time, 2),
        "throughput_req_s": _per_second(total_requests),
        "latency_avg_s": round(statistics.mean(latencies), 2),
        "latency_p95_s": round(latency_p95, 2),
        "latency_max_s": round(max(latencies), 2),
        "errors": errors,
        "prompt_tokens": total_prompt_tokens,
        "completion_tokens": total_completion_tokens,
        "total_tokens": total_tokens,
        "throughput_tokens_s": _per_second(total_completion_tokens),
    }


In [23]:
# Benchmark sweep parameters, named so they can be tuned in one place.
CONCURRENCY_LEVELS = [1, 2, 4, 8, 16, 32]  # thread-pool sizes to test
REQUESTS_PER_LEVEL = 50  # requests sent at each concurrency level

# One summary dict (as returned by load_test) per concurrency level.
results = []

for concurrency in CONCURRENCY_LEVELS:
    print(f"Executando teste com concorrência = {concurrency}")
    res = load_test(concurrency=concurrency, total_requests=REQUESTS_PER_LEVEL)
    results.append(res)

Executando teste com concorrência = 1
Executando teste com concorrência = 2
Executando teste com concorrência = 4
Executando teste com concorrência = 8
Executando teste com concorrência = 16
Executando teste com concorrência = 32


In [24]:
# Tabulate the sweep: one row per concurrency level, one column per metric.
# (pandas is imported in the notebook's top import cell.)
df = pd.DataFrame(results)
df

Unnamed: 0,concurrency,requests,total_time_s,throughput_req_s,latency_avg_s,latency_p95_s,latency_max_s,errors,prompt_tokens,completion_tokens,total_tokens,throughput_tokens_s
0,1,50,242.19,0.21,4.84,7.25,7.33,1,30356,6696,37052,27.65
1,2,50,132.74,0.38,5.24,7.54,7.55,0,30665,6941,37606,52.29
2,4,50,66.24,0.75,5.16,7.65,7.8,0,30665,6644,37309,100.3
3,8,50,38.78,1.29,5.91,8.24,8.27,0,30665,7168,37833,184.82
4,16,50,24.54,2.04,6.38,9.21,9.26,0,30665,7099,37764,289.3
5,32,50,17.66,2.83,8.21,11.56,11.58,0,30665,7452,38117,421.98


In [25]:
# Run a single request outside the load test to inspect the model's answer.
sample = run_inference(image_urls[6])

# A failed call returns no "response" key; surface the recorded error instead
# of an opaque KeyError.
if not sample["success"]:
    raise RuntimeError(f"Sample inference failed: {sample['error']}")

print(json.dumps(sample["response"]["choices"][0]["message"]["content"], indent=2, ensure_ascii=False))

"Com base na imagem fornecida, os detalhes do veículo são os seguintes:\n\n- **Marca**: Nissan  \n- **Placa**: SSE-3651  \n- **Estado**: São Paulo (SP), conforme o formato da placa brasileira e a sigla \"SP\" visível no canto superior esquerdo da placa.\n\nO modelo do carro é um **Nissan Kicks**, que é um SUV compacto da marca Nissan, com design moderno e característico da linha da marca, incluindo a grade frontal com o emblema \"Nissan\" e faróis de LED.\n\n✅ **Resposta final:**\n- **Marca**: Nissan  \n- **Placa**: SSE-3651"
