In [1]:
import json
import statistics
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

import pandas as pd
import requests

In [2]:
# Chat-completions endpoint that run_inference() POSTs to.
# NOTE(review): the hostname looks like an in-cluster service address, so this
# presumably only resolves from inside the cluster — confirm before running elsewhere.
INFERENCE_URL = "http://qwen3-vl-4b-instruct-predictor.qwen3.svc.cluster.local:8080/v1/chat/completions"

# Sent as the "model" field of every request payload.
MODEL_NAME = "qwen3-vl-4b-instruct"
# Sent as the "max_tokens" field of every request payload.
MAX_TOKENS = 200
# Passed as the `timeout` argument of requests.post.
TIMEOUT = 120  # seconds

In [3]:
IMAGE_FILE = "images.txt"

# One image URL per line; surrounding whitespace is stripped and blank lines
# are skipped. The encoding is explicit so the result does not depend on the
# platform's locale default.
image_urls = [
    line.strip()
    for line in Path(IMAGE_FILE).read_text(encoding="utf-8").splitlines()
    if line.strip()
]
# Fail fast: load_test picks URLs with `i % len(image_urls)`, which cannot work
# on an empty list.
assert image_urls, f"{IMAGE_FILE} contains no image URLs"

len(image_urls), image_urls[:2]

(10,
 ['https://img.odcdn.com.br/wp-content/uploads/2021/12/placa-mercosul-o-que-e.jpg',
  'https://quatrorodas.abril.com.br/wp-content/uploads/2017/08/aaa0001.jpg?quality=70&strip=info&w=720&crop=1'])

In [4]:
def run_inference(image_url):
    """Send one image + prompt to the inference endpoint and time the call.

    Builds a single-user-message chat payload (a fixed Portuguese prompt plus
    ``image_url`` as an ``image_url`` content part) and POSTs it to
    ``INFERENCE_URL`` with ``MODEL_NAME``, ``MAX_TOKENS`` and ``TIMEOUT``.

    Args:
        image_url: URL of the image to include in the request.

    Returns:
        dict: always has ``"latency"`` (elapsed seconds as a float) and
        ``"success"`` (bool). On success it also has ``"response"`` (the
        decoded JSON body); on failure it has ``"error"`` (``str`` of the
        exception) instead.

    Exceptions raised by the HTTP call, by a non-2xx status, or by JSON
    decoding are caught and reported in the returned dict rather than
    propagated, so callers always receive a result.
    """
    payload = {
        "model": MODEL_NAME,
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Identifique a placa e a marca do veiculo na imagem."},
                    {
                        "type": "image_url",
                        "image_url": {"url": image_url}
                    }
                ]
            }
        ],
        "max_tokens": MAX_TOKENS
    }

    # perf_counter is monotonic, so the measured interval cannot be distorted
    # by system clock adjustments the way time.time() differences can.
    start = time.perf_counter()
    try:
        resp = requests.post(
            INFERENCE_URL,
            headers={"Content-Type": "application/json"},
            json=payload,
            timeout=TIMEOUT
        )
        # Latency is taken as soon as the response is back, before status
        # validation and JSON decoding.
        latency = time.perf_counter() - start
        resp.raise_for_status()
        return {
            "latency": latency,
            "success": True,
            "response": resp.json()
        }
    except Exception as e:
        # Deliberately broad: any failure is turned into a result record.
        latency = time.perf_counter() - start
        return {
            "latency": latency,
            "success": False,
            "error": str(e)
        }


In [5]:
def load_test(concurrency, total_requests, urls=None, inference_fn=None):
    """Fire ``total_requests`` inference calls through a thread pool and summarize them.

    Requests are submitted to a ``ThreadPoolExecutor`` with ``concurrency``
    workers; image URLs are taken round-robin from ``urls``.

    Args:
        concurrency: number of worker threads (``max_workers`` of the pool).
        total_requests: how many requests to issue in total; must be >= 1.
        urls: sequence of image URLs to cycle through. Defaults to the
            notebook-level ``image_urls`` list.
        inference_fn: callable taking one URL and returning a dict with
            ``"latency"`` (seconds) and ``"success"`` (bool). Defaults to
            ``run_inference``.

    Returns:
        dict with keys ``concurrency``, ``requests``, ``total_time_s``,
        ``throughput_req_s``, ``latency_avg_s``, ``latency_p95_s``,
        ``latency_max_s`` (all rounded to 2 decimals) and ``errors``.
        Latency statistics cover every request, including failed ones.

    Raises:
        ValueError: if ``total_requests`` is less than 1 or ``urls`` is empty.
    """
    # Resolve defaults at call time so the notebook globals are only needed
    # when the caller does not supply their own.
    if urls is None:
        urls = image_urls
    if inference_fn is None:
        inference_fn = run_inference

    if total_requests < 1:
        raise ValueError("total_requests must be at least 1")
    if not urls:
        raise ValueError("urls must contain at least one image URL")

    latencies = []
    errors = 0
    # Monotonic clock: immune to system clock adjustments during the run.
    start_time = time.perf_counter()

    with ThreadPoolExecutor(max_workers=concurrency) as executor:
        futures = [
            executor.submit(inference_fn, urls[i % len(urls)])
            for i in range(total_requests)
        ]

        for future in as_completed(futures):
            result = future.result()
            latencies.append(result["latency"])
            if not result["success"]:
                errors += 1

    total_time = time.perf_counter() - start_time

    # statistics.quantiles needs at least two samples on older Python
    # versions; with a single request the only latency is the p95.
    if len(latencies) >= 2:
        latency_p95 = statistics.quantiles(latencies, n=20)[18]
    else:
        latency_p95 = latencies[0]

    return {
        "concurrency": concurrency,
        "requests": total_requests,
        "total_time_s": round(total_time, 2),
        "throughput_req_s": round(total_requests / total_time, 2),
        "latency_avg_s": round(statistics.mean(latencies), 2),
        "latency_p95_s": round(latency_p95, 2),
        "latency_max_s": round(max(latencies), 2),
        "errors": errors
    }


In [6]:
# Sweep over doubling concurrency levels, 50 requests each, collecting one
# load_test summary dict per level.
results = []

for concurrency in (1, 2, 4, 8, 16, 32):
    print(f"Executando teste com concorrência = {concurrency}")
    results.append(load_test(concurrency=concurrency, total_requests=50))

Executando teste com concorrência = 1
Executando teste com concorrência = 2
Executando teste com concorrência = 4
Executando teste com concorrência = 8
Executando teste com concorrência = 16
Executando teste com concorrência = 32


In [7]:
# One row per concurrency level; columns are the keys of the load_test summaries.
# (pandas is imported with the other imports in the first cell.)
df = pd.DataFrame(results)
df

Unnamed: 0,concurrency,requests,total_time_s,throughput_req_s,latency_avg_s,latency_p95_s,latency_max_s,errors
0,1,50,278.33,0.18,5.57,8.18,9.08,0
1,2,50,138.1,0.36,5.49,7.59,7.66,0
2,4,50,71.22,0.7,5.56,7.78,7.94,0
3,8,50,38.48,1.3,5.84,8.31,8.33,0
4,16,50,23.94,2.09,6.42,9.16,9.26,0
5,32,50,16.81,2.97,7.54,11.49,11.52,0


In [8]:
# Sanity check: run one request and show the model's answer text.
sample = run_inference(image_urls[0])
if sample["success"]:
    print(json.dumps(sample["response"]["choices"][0]["message"]["content"], indent=2, ensure_ascii=False))
else:
    # A failed call has no "response" key, only "error"; report it instead of
    # letting the cell die with a KeyError.
    print(f"Falha na inferência: {sample['error']}")

"Com base na imagem fornecida, podemos identificar:\n\n**Marca do veículo:** Fiat  \n*(O logotipo da marca Fiat está claramente visível no centro do capô, acima da grade frontal.)*\n\n**Placa do veículo:**  \n- **Número da placa:** OZL-7H33  \n- **Estado:** São Paulo (indicado pela letra \"S\" na placa, que é a sigla do estado de São Paulo, conforme o sistema de placas brasileiro)  \n- **País:** Brasil (indicado pela palavra \"BRASIL\" e pela bandeira do Brasil na placa)\n\n**Observação adicional:**  \nA placa possui o formato padrão do Brasil, com a bandeira e a palavra \"BRASIL\" no topo, e o código do estado (S) na esquerda. O número da placa é \"OZL-7H33\".\n\n---\n\n✅ **"
