# Speculative decoding metrics using vLLM

This notebook walks through the code for obtaining the speculative decoding draft acceptance rate metric using vLLM.

### First, we need to serve the verifier and draft models

In [None]:
# Launch vLLM's OpenAI API server entrypoint on 0.0.0.0:8000, serving
# Qwen/Qwen2.5-1.5B-Instruct as the main (verifier) model and
# Qwen/Qwen2.5-0.5B-Instruct as the speculative (draft) model, with
# --num_speculative_tokens set to 5.
# NOTE(review): this is a shell command, not Python -- presumably it is meant
# to be run in a terminal before executing the cells below; confirm.
python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model "Qwen/Qwen2.5-1.5B-Instruct" \
    --seed 42 -tp 1 --gpu_memory_utilization 0.8 \
    --speculative_model "Qwen/Qwen2.5-0.5B-Instruct" \
    --num_speculative_tokens 5

### Then we use the OpenAI client for generation and fetch the corresponding metrics

In [1]:
from openai import OpenAI

def generate_from_serve(prompt):
    """Generate a completion for ``prompt`` from the locally served model.

    Builds an OpenAI client pointed at ``http://localhost:8000/v1``, picks the
    first model the server lists, requests a single non-streamed completion,
    prints the generated text and returns it.

    Args:
        prompt: Prompt text sent to the completions endpoint.

    Returns:
        The text of the first completion choice, or ``None`` when the request
        failed or the server listed no models (a message is printed instead
        of raising).
    """
    # Placeholder API key and the base URL of the vLLM API server.
    openai_api_key = "EMPTY"
    openai_api_base = "http://localhost:8000/v1"

    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    try:
        # Listing models is a network call too, so it lives inside the try:
        # an unreachable server is reported the same way as a failed completion.
        models = client.models.list()
        if not models.data:
            print("An error occurred: the server did not list any models")
            return None
        model = models.data[0].id

        completion = client.completions.create(
            model=model,
            prompt=prompt,
            echo=False,
            n=1,
            stream=False,
        )
        text = completion.choices[0].text
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

    print("Completion results:")
    print(text)
    return text

### Functions to get and parse metrics

In [2]:
import requests 

def get_metrics(url="http://0.0.0.0:8000/metrics", timeout=10.0):
    """Fetch the raw metrics text from the server's metrics endpoint.

    Args:
        url: Address of the metrics endpoint.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely on an
            unresponsive server.

    Returns:
        The response body as text, or ``None`` if the request failed, timed
        out, or returned an HTTP error status (the error is printed).
    """
    # NOTE(review): 0.0.0.0 as a destination address is platform-dependent;
    # generate_from_serve targets localhost -- consider aligning the two.
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching metrics: {e}")
        return None
    return response.text

def parse_draft_acceptance_rate(metrics):
    """Extract the draft acceptance rate from a metrics text dump.

    Scans ``metrics`` line by line for samples of
    ``vllm:spec_decode_draft_acceptance_rate`` and prints each matching line
    between banner lines. When several samples match, the last one wins.

    Args:
        metrics: Metrics text as returned by the metrics endpoint, or
            ``None``/empty when nothing was fetched.

    Returns:
        The sample value as a float, or ``-1`` when ``metrics`` is empty or
        contains no parsable sample for the metric.
    """
    metric_name = "vllm:spec_decode_draft_acceptance_rate"
    metric = -1

    if not metrics:
        print("No metrics to parse")
        return metric

    for line in metrics.splitlines():
        if not line.startswith(metric_name):
            continue

        remainder = line[len(metric_name):]
        # The name must end here: the next character is either the start of
        # the label set or whitespace. This rejects longer metric names that
        # merely share the prefix.
        if remainder[:1] not in ("{", " ", "\t"):
            continue

        # Drop the label set. Label values may contain spaces, so split on
        # the closing brace rather than on whitespace.
        if remainder.startswith("{"):
            remainder = remainder.rpartition("}")[2]

        # What is left is "value [timestamp]"; the value is the FIRST field.
        # Taking the last field would return the timestamp when one is present.
        fields = remainder.split()
        if not fields:
            continue
        try:
            value = float(fields[0])
        except ValueError:
            # Malformed sample: skip it rather than abort the whole parse.
            continue

        print("------------------------SPEC DECODE DRAFT ACCEPTANCE RATE--------------------")
        print(line)
        print("-----------------------------------------------------------------------------")
        metric = value

    return metric

### Test

In [4]:
# Prompt the served model is asked to complete
prompt = "The future of AI is very "

# Request a completion for the prompt; generate_from_serve prints the generated text
generate_from_serve(prompt)

# Fetch the server's metrics text (get_metrics returns None on a request error),
# then extract the draft acceptance rate (-1 when no value could be parsed)
metrics = get_metrics()
draft_acceptance_rate = parse_draft_acceptance_rate(metrics)

print(f"Draft acceptance rate is: {draft_acceptance_rate}")

Completion results:
 uncertain, and there are many different views on the topic. Some people
------------------------SPEC DECODE DRAFT ACCEPTANCE RATE--------------------
vllm:spec_decode_draft_acceptance_rate{model_name="Qwen/Qwen2.5-1.5B-Instruct"} 0.7142857142857143
-----------------------------------------------------------------------------
Draft acceptance rate is: 0.7142857142857143
