
Commit 58095cb

cicirori and scottjlee authored
Add timing metrics for requests (#12646)
Co-authored-by: Scott Lee <scottjlee@users.noreply.github.com>
1 parent fd3034d commit 58095cb

File tree: 9 files changed, +334 −52 lines changed

9 files changed

+334
-52
lines changed

python/sglang/srt/entrypoints/openai/serving_base.py
Lines changed: 9 additions & 1 deletion

@@ -2,6 +2,7 @@
 
 import json
 import logging
+import time
 import uuid
 from abc import ABC, abstractmethod
 from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Union
@@ -84,17 +85,23 @@ def _validate_lora_enabled(self, adapter_name: str) -> None:
     async def handle_request(
         self, request: OpenAIServingRequest, raw_request: Request
     ) -> Union[Any, StreamingResponse, ErrorResponse]:
-        """Handle the specific request type with common pattern"""
+        """Handle the specific request type with common pattern
+        If you want to override this method, you should be careful to record the validation time.
+        """
         try:
             # Validate request
+            validation_start = time.perf_counter()
             error_msg = self._validate_request(request)
+            validation_time = time.perf_counter() - validation_start
             if error_msg:
                 return self.create_error_response(error_msg)
 
             # Convert to internal format
             adapted_request, processed_request = self._convert_to_internal_request(
                 request, raw_request
             )
+            if hasattr(adapted_request, "validation_time"):
+                adapted_request.validation_time = validation_time
 
             # Note(Xinyuan): raw_request below is only used for detecting the connection of the client
             if hasattr(request, "stream") and request.stream:
@@ -157,6 +164,7 @@ def _convert_to_internal_request(
         self,
         request: OpenAIServingRequest,
         raw_request: Request = None,
+        validation_time: float = None,
     ) -> tuple[GenerateReqInput, OpenAIServingRequest]:
         """Convert OpenAI request to internal format"""
         pass
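
The pattern above brackets the validation stage with time.perf_counter() and attaches the elapsed duration to the converted request only when the target type declares the field. A minimal standalone sketch of the same pattern (the class and toy validator here are illustrative stand-ins, not SGLang's API):

```python
import time
from dataclasses import dataclass
from typing import Optional


@dataclass
class AdaptedRequest:
    # Stand-in for GenerateReqInput: any type that declares
    # `validation_time` can opt in to receiving the measurement.
    prompt: str
    validation_time: Optional[float] = None


def handle_request(prompt: str) -> AdaptedRequest:
    # Bracket validation with a monotonic clock.
    validation_start = time.perf_counter()
    error_msg = None if prompt.strip() else "empty prompt"  # toy validator
    validation_time = time.perf_counter() - validation_start

    if error_msg:
        raise ValueError(error_msg)

    adapted = AdaptedRequest(prompt=prompt)
    # Duck-typed hand-off: only set the field if the type defines it.
    if hasattr(adapted, "validation_time"):
        adapted.validation_time = validation_time
    return adapted
```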

python/sglang/srt/grpc/grpc_request_manager.py
Lines changed: 9 additions & 0 deletions

@@ -80,6 +80,10 @@ class GrpcReqState:
     last_time: float = 0.0
     last_completion_tokens: int = 1
 
+    # perf_counter equivalents for accurate time calculations
+    finished_time_perf: float = 0.0
+    first_token_time_perf: float = 0.0
+
     # Streaming state
     stream_finished: bool = False
     input_logprobs_sent: bool = False  # Track if input logprobs were sent in streaming
@@ -536,6 +540,7 @@ async def _handle_batch_output(self, batch_out: BatchTokenIDOutput):
         put_tasks = []
         cleanup_tasks = []
         now = time.time()
+        now_perf_counter = time.perf_counter()
 
         # Process each request in the batch
         for i, rid in enumerate(batch_out.rids):
@@ -552,6 +557,7 @@ async def _handle_batch_output(self, batch_out: BatchTokenIDOutput):
             # Update metrics
             if state.first_token_time == 0.0:
                 state.first_token_time = now
+                state.first_token_time_perf = now_perf_counter
             state.last_time = now
 
             # Extract output for this request
@@ -650,6 +656,7 @@ def get_part(attr_name):
             if output_data["finished"]:
                 state.finished = True
                 state.finished_time = now
+                state.finished_time_perf = now_perf_counter
                 state.stream_finished = True
                 state.event.set()
 
@@ -691,6 +698,7 @@ async def _handle_embedding_output(self, batch_out: BatchEmbeddingOutput):
         # Mark as finished
         state.finished = True
         state.finished_time = time.time()
+        state.finished_time_perf = time.perf_counter()
         state.event.set()
 
     async def _handle_health_check_output(self, health_out: HealthCheckOutput):
@@ -723,6 +731,7 @@ async def _handle_health_check_output(self, health_out: HealthCheckOutput):
         # Mark as finished
         state.finished = True
         state.finished_time = time.time()
+        state.finished_time_perf = time.perf_counter()
         state.event.set()
 
     async def _handle_abort_req(self, recv_obj: AbortReq):
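
Keeping a perf_counter twin next to each wall-clock timestamp is deliberate: time.time() values are loggable and comparable across processes, but they can jump when the system clock is adjusted, while time.perf_counter() is monotonic and therefore safe for duration arithmetic. A minimal sketch of the dual-clock pattern (the class below is an illustrative stand-in, not GrpcReqState itself):

```python
import time
from dataclasses import dataclass


@dataclass
class ReqState:
    # Wall-clock timestamps: loggable, cross-process comparable.
    first_token_time: float = 0.0
    finished_time: float = 0.0
    # Monotonic twins: used only for computing durations.
    first_token_time_perf: float = 0.0
    finished_time_perf: float = 0.0


def on_token(state: ReqState) -> None:
    if state.first_token_time == 0.0:
        state.first_token_time = time.time()
        state.first_token_time_perf = time.perf_counter()


def on_finish(state: ReqState) -> float:
    state.finished_time = time.time()
    state.finished_time_perf = time.perf_counter()
    # Derive the decode duration from the monotonic clock,
    # immune to NTP steps and daylight-saving jumps.
    return state.finished_time_perf - state.first_token_time_perf
```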

python/sglang/srt/managers/detokenizer_manager.py
Lines changed: 8 additions & 0 deletions

@@ -277,6 +277,10 @@ def handle_batch_token_id_out(self, recv_obj: BatchTokenIDOutput):
             placeholder_tokens_val=None,
             retraction_counts=recv_obj.retraction_counts,
             token_steps=recv_obj.token_steps,
+            queue_time=recv_obj.queue_time,
+            forward_entry_time=recv_obj.forward_entry_time,
+            prefill_delay=recv_obj.prefill_delay,
+            prefill_latency=recv_obj.prefill_latency,
         )
 
     def handle_multimodal_decode_req(self, recv_obj: BatchMultimodalDecodeReq):
@@ -291,6 +295,10 @@ def handle_multimodal_decode_req(self, recv_obj: BatchMultimodalDecodeReq):
             cached_tokens=recv_obj.cached_tokens,
             placeholder_tokens_idx=None,
             placeholder_tokens_val=None,
+            queue_time=recv_obj.queue_time,
+            forward_entry_time=recv_obj.forward_entry_time,
+            prefill_delay=recv_obj.prefill_delay,
+            prefill_latency=recv_obj.prefill_latency,
         )
 
     def handle_freeze_gc_req(self, recv_req: FreezeGCReq):
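
The detokenizer forwards all four timing fields verbatim because the mixin (introduced in io_struct.py below) declares them without defaults: once an output type inherits RequestTimingMetricsMixin, every construction site must supply them or fail at call time. A toy reproduction of that constraint, with illustrative class names:

```python
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class TimingMixin:
    queue_time: Optional[List[Optional[float]]]  # no default on purpose


@dataclass
class StrOutput(TimingMixin):
    texts: List[str]


StrOutput(queue_time=None, texts=["ok"])  # fine: field supplied explicitly
try:
    StrOutput(texts=["ok"])  # forgetting the mixin field is a hard error
except TypeError as e:
    print(e)  # missing 1 required positional argument: 'queue_time'
```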

python/sglang/srt/managers/io_struct.py
Lines changed: 65 additions & 9 deletions

@@ -61,6 +61,55 @@ def regenerate_rids(self):
         return self.rids
 
 
+@dataclass
+class RequestTimingMetricsMixin:
+    """
+    Mixin class containing common request-level timing metrics.
+
+    This class consolidates the timing metrics that are shared across all batch output types
+    to avoid code duplication and ensure consistency.
+    """
+
+    # Queue duration: time spent waiting in queue before request is scheduled.
+    queue_time: Optional[List[Optional[float]]]
+
+    # Forward entry time: timestamp when the request enters the forward pass stage.
+    # This corresponds to `forward_entry_time` in TimeStats.
+    # In different modes:
+    # - Unified/PD-colocate: timestamp when forward computation begins (covers prefill + decode)
+    # - Prefill instance (P): timestamp when prefill forward pass begins
+    # - Decode instance (D): timestamp when decode forward pass begins
+    # Note: This is NOT the same as prefill_start_time. There may be a delay between
+    # forward_entry_time and prefill_start_time (see prefill_delay).
+    forward_entry_time: Optional[List[Optional[float]]]
+
+    # Prefill delay: time spent waiting between forward entry and prefill start.
+    # Calculated as: prefill_start_time - forward_entry_time
+    # This represents the delay between when the request enters the forward stage
+    # and when prefill computation actually begins.
+    prefill_delay: Optional[List[Optional[float]]]
+
+    # Prefill latency: time spent during prefill computation.
+    # Calculated as: prefill_end_time - prefill_start_time
+    prefill_latency: Optional[List[Optional[float]]]
+
+
+@dataclass
+class SpeculativeDecodingMetricsMixin:
+    """
+    Mixin class containing speculative decoding metrics.
+
+    This class consolidates speculative decoding metrics that are shared across
+    batch output types that support speculative decoding to avoid code duplication.
+    """
+
+    # Verify count: number of verification forward passes
+    spec_verify_ct: List[int]
+
+    # Accepted tokens: number of accepted tokens during speculative decoding
+    spec_accepted_tokens: List[int]
+
+
 # Parameters for a session
 @dataclass
 class SessionParams:
@@ -148,6 +197,9 @@ class GenerateReqInput(BaseReq):
     bootstrap_room: Optional[Union[List[int], int]] = None
     bootstrap_pair_key: Optional[Union[List[str], str]] = None
 
+    # Validation step duration
+    validation_time: Optional[float] = None
+
     # For data parallel rank routing
     data_parallel_rank: Optional[int] = None
 
@@ -564,6 +616,7 @@ def __getitem__(self, i):
                 if self.bootstrap_pair_key is not None
                 else None
             ),
+            validation_time=self.validation_time,
            data_parallel_rank=(
                 self.data_parallel_rank if self.data_parallel_rank is not None else None
             ),
@@ -684,6 +737,8 @@ class EmbeddingReqInput(BaseReq):
     log_metrics: bool = True
     # The modalities of the image data [image, multi-images, video]
     modalities: Optional[List[str]] = None
+    # Validation step duration
+    validation_time: Optional[float] = None
     # For cross-encoder requests
     is_cross_encoder_request: bool = False
     # Priority for the request
@@ -774,6 +829,7 @@ def __getitem__(self, i):
             video_data=self.video_data[i] if self.video_data is not None else None,
             sampling_params=self.sampling_params[i],
             rid=self.rid[i],
+            validation_time=self.validation_time,
             dimensions=self.dimensions,
             http_worker_ipc=self.http_worker_ipc,
         )
@@ -815,7 +871,9 @@ def __iter__(self):
 
 
 @dataclass
-class BatchTokenIDOutput(BaseBatchReq):
+class BatchTokenIDOutput(
+    BaseBatchReq, RequestTimingMetricsMixin, SpeculativeDecodingMetricsMixin
+):
     # The finish reason
     finished_reasons: List[BaseFinishReason]
     # For incremental decoding
@@ -833,8 +891,6 @@ class BatchTokenIDOutput(BaseBatchReq):
     prompt_tokens: List[int]
     completion_tokens: List[int]
     cached_tokens: List[int]
-    spec_verify_ct: List[int]
-    spec_accepted_tokens: List[int]
 
     # Logprobs
     input_token_logprobs_val: List[float]
@@ -868,7 +924,7 @@ class BatchTokenIDOutput(BaseBatchReq):
 
 
 @dataclass
-class BatchMultimodalDecodeReq(BaseBatchReq):
+class BatchMultimodalDecodeReq(BaseBatchReq, RequestTimingMetricsMixin):
     decoded_ids: List[int]
     input_token_logprobs_val: List[float]
     input_token_logprobs_idx: List[int]
@@ -900,7 +956,9 @@ class BatchMultimodalDecodeReq(BaseBatchReq):
 
 
 @dataclass
-class BatchStrOutput(BaseBatchReq):
+class BatchStrOutput(
+    BaseBatchReq, RequestTimingMetricsMixin, SpeculativeDecodingMetricsMixin
+):
     # The finish reason
     finished_reasons: List[dict]
     # The output decoded strings
@@ -912,8 +970,6 @@ class BatchStrOutput(BaseBatchReq):
     prompt_tokens: List[int]
     completion_tokens: List[int]
     cached_tokens: List[int]
-    spec_verify_ct: List[int]
-    spec_accepted_tokens: List[int]
 
     # Logprobs
     input_token_logprobs_val: List[float]
@@ -947,7 +1003,7 @@ class BatchStrOutput(BaseBatchReq):
 
 
 @dataclass
-class BatchMultimodalOutput(BaseBatchReq):
+class BatchMultimodalOutput(BaseBatchReq, RequestTimingMetricsMixin):
     # The finish reason
     finished_reasons: List[dict]
     decoded_ids: List[List[int]]
@@ -972,7 +1028,7 @@ class BatchMultimodalOutput(BaseBatchReq):
 
 
 @dataclass
-class BatchEmbeddingOutput(BaseBatchReq):
+class BatchEmbeddingOutput(BaseBatchReq, RequestTimingMetricsMixin):
     # The finish reason
     finished_reasons: List[BaseFinishReason]
     # The output embedding
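
One subtlety with these dataclass mixins: fields in the generated __init__ are collected in reverse MRO order, so the last-listed mixin's fields come first and the subclass's own fields last. That makes keyword construction (as seen in multi_tokenizer_mixin.py below) much safer than positional. A minimal sketch with illustrative stand-in classes, not the actual sglang types:

```python
from dataclasses import dataclass, fields
from typing import List, Optional


@dataclass
class TimingMixin:
    queue_time: Optional[List[Optional[float]]]
    prefill_latency: Optional[List[Optional[float]]]


@dataclass
class SpecMixin:
    spec_verify_ct: List[int]


@dataclass
class BaseBatch:
    rids: List[str]


@dataclass
class TokenOutput(BaseBatch, TimingMixin, SpecMixin):
    finished_reasons: List[dict]


# Reverse MRO ordering: SpecMixin's fields first, own fields last.
print([f.name for f in fields(TokenOutput)])
# ['spec_verify_ct', 'queue_time', 'prefill_latency', 'rids', 'finished_reasons']
```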

python/sglang/srt/managers/multi_tokenizer_mixin.py
Lines changed: 40 additions & 11 deletions

@@ -91,6 +91,26 @@ def _handle_output_by_index(output, i):
     if isinstance(output, BatchTokenIDOutput):
         new_output = BatchTokenIDOutput(
             rids=[output.rids[i]],
+            spec_verify_ct=(
+                [output.spec_verify_ct[i]] if len(output.spec_verify_ct) > i else None
+            ),
+            spec_accepted_tokens=(
+                [output.spec_accepted_tokens[i]]
+                if len(output.spec_accepted_tokens) > i
+                else None
+            ),
+            queue_time=[output.queue_time[i]] if len(output.queue_time) > i else None,
+            forward_entry_time=(
+                [output.forward_entry_time[i]]
+                if len(output.forward_entry_time) > i
+                else None
+            ),
+            prefill_delay=(
+                [output.prefill_delay[i]] if len(output.prefill_delay) > i else None
+            ),
+            prefill_latency=(
+                [output.prefill_latency[i]] if len(output.prefill_latency) > i else None
+            ),
             finished_reasons=(
                 [output.finished_reasons[i]]
                 if len(output.finished_reasons) > i
@@ -132,9 +152,6 @@ def _handle_output_by_index(output, i):
             cached_tokens=(
                 [output.cached_tokens[i]] if len(output.cached_tokens) > i else None
             ),
-            spec_verify_ct=(
-                [output.spec_verify_ct[i]] if len(output.spec_verify_ct) > i else None
-            ),
             input_token_logprobs_val=(
                 [output.input_token_logprobs_val[i]]
                 if output.input_token_logprobs_val
@@ -230,6 +247,26 @@ def _handle_output_by_index(output, i):
     elif isinstance(output, BatchStrOutput):
         new_output = BatchStrOutput(
             rids=[output.rids[i]],
+            spec_verify_ct=(
+                [output.spec_verify_ct[i]] if len(output.spec_verify_ct) > i else None
+            ),
+            spec_accepted_tokens=(
+                [output.spec_accepted_tokens[i]]
+                if len(output.spec_accepted_tokens) > i
+                else None
+            ),
+            queue_time=[output.queue_time[i]] if len(output.queue_time) > i else None,
+            forward_entry_time=(
+                [output.forward_entry_time[i]]
+                if len(output.forward_entry_time) > i
+                else None
+            ),
+            prefill_delay=(
+                [output.prefill_delay[i]] if len(output.prefill_delay) > i else None
+            ),
+            prefill_latency=(
+                [output.prefill_latency[i]] if len(output.prefill_latency) > i else None
+            ),
             finished_reasons=(
                 [output.finished_reasons[i]]
                 if len(output.finished_reasons) > i
@@ -254,14 +291,6 @@ def _handle_output_by_index(output, i):
             cached_tokens=(
                 [output.cached_tokens[i]] if len(output.cached_tokens) > i else None
             ),
-            spec_verify_ct=(
-                [output.spec_verify_ct[i]] if len(output.spec_verify_ct) > i else None
-            ),
-            spec_accepted_tokens=(
-                [output.spec_accepted_tokens[i]]
-                if len(output.spec_accepted_tokens) > i
-                else None
-            ),
             input_token_logprobs_val=(
                 [output.input_token_logprobs_val[i]]
                 if output.input_token_logprobs_val
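
Every new field above repeats the same `[xs[i]] if len(xs) > i else None` guard: it re-wraps the i-th element as a single-item batch, and degrades to None when the source list was never populated (for example, the speculative-decoding counters when spec decoding is disabled). A hypothetical helper that captures the pattern the diff inlines per field:

```python
from typing import List, Optional, TypeVar

T = TypeVar("T")


def slice_or_none(values: List[T], i: int) -> Optional[List[T]]:
    # Keep the batched shape (a one-element list) for request i,
    # or return None when the metric list is empty or too short.
    return [values[i]] if len(values) > i else None


assert slice_or_none([0.12, 0.34], 1) == [0.34]
assert slice_or_none([], 1) is None
```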

python/sglang/srt/managers/scheduler.py
Lines changed: 15 additions & 1 deletion

@@ -152,6 +152,7 @@
 from sglang.srt.mem_cache.mamba_radix_cache import MambaRadixCache
 from sglang.srt.mem_cache.radix_cache import RadixCache
 from sglang.srt.mem_cache.swa_radix_cache import SWARadixCache
+from sglang.srt.model_executor.forward_batch_info import ForwardMode
 from sglang.srt.multiplex.multiplexing_mixin import SchedulerMultiplexMixin
 from sglang.srt.parser.reasoning_parser import ReasoningParser
 from sglang.srt.server_args import PortArgs, ServerArgs, get_global_server_args
@@ -1952,6 +1953,12 @@ def run_batch(
             logger.info(f"Scheduler.run_batch sleep {self.forward_sleep_time}s")
             time.sleep(self.forward_sleep_time)
 
+        # Capture prefill start time for EXTEND mode
+        if batch.forward_mode == ForwardMode.EXTEND:
+            current_time = time.perf_counter()
+            for req in batch.reqs:
+                req.time_stats.prefill_start_time = current_time
+
         # Run forward
         if self.is_generation:
             batch_or_worker_batch = batch
@@ -2045,11 +2052,18 @@ def run_batch(
                 batch_result.extend_logprob_start_len_per_req = (
                     extend_logprob_start_len_per_req
                 )
-            return batch_result
+            ret = batch_result
         else:  # embedding or reward model
             model_worker_batch = batch.get_model_worker_batch()
             embeddings = self.tp_worker.forward_batch_embedding(model_worker_batch)
             ret = EmbeddingBatchResult(embeddings=embeddings)
+
+        # Capture prefill end time for EXTEND mode
+        if batch.forward_mode == ForwardMode.EXTEND:
+            current_time = time.perf_counter()
+            for req in batch.reqs:
+                req.time_stats.prefill_end_time = current_time
+
         return ret
 
     def launch_batch_sample_if_needed(
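
With prefill_start_time and prefill_end_time now captured around the EXTEND forward pass, the derived metrics documented in io_struct.py fall out as simple differences between perf_counter timestamps. A sketch under the assumption that TimeStats holds the timestamps named in this PR (the queue-entry field name below is an assumption for illustration, not taken from the diff):

```python
from dataclasses import dataclass


@dataclass
class TimeStats:
    # perf_counter timestamps along the request lifecycle
    queue_entry_time: float = 0.0    # assumed name: when the request was queued
    forward_entry_time: float = 0.0  # entered the forward stage
    prefill_start_time: float = 0.0  # prefill computation began
    prefill_end_time: float = 0.0    # prefill computation finished


def derived_metrics(ts: TimeStats) -> dict:
    return {
        # Waiting in the scheduler queue before being scheduled
        "queue_time": ts.forward_entry_time - ts.queue_entry_time,
        # Gap between entering the forward stage and prefill actually starting
        "prefill_delay": ts.prefill_start_time - ts.forward_entry_time,
        # Pure prefill computation time
        "prefill_latency": ts.prefill_end_time - ts.prefill_start_time,
    }
```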
