From 9614d5c899bf33fa09b4b0e65f16a0070cc86638 Mon Sep 17 00:00:00 2001 From: Mergen Nachin Date: Wed, 4 Mar 2026 06:18:26 -0800 Subject: [PATCH] Voxtral Realtime: enable streaming mode in CUDA CI Remove the vr-offline override so the CUDA CI runs Voxtral Realtime in streaming mode (the default). The streaming encoder path exercises the full pipeline including ring buffer KV cache and incremental mel processing. --- .github/workflows/cuda.yml | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index 48a851338af..71f98c03196 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -159,7 +159,7 @@ jobs: repo: "google" name: "gemma-3-4b-it" quant: "quantized-int4-weight-only" - # Voxtral Realtime only supports int4-tile-packed on CUDA (offline mode) + # Voxtral Realtime only supports int4-tile-packed on CUDA - model: repo: "mistralai" name: "Voxtral-Mini-4B-Realtime-2602" @@ -197,12 +197,7 @@ jobs: echo "::endgroup::" fi - # Voxtral Realtime uses offline mode for CUDA CI (not streaming) - VR_MODE="" - if [ "${{ matrix.model.name }}" = "Voxtral-Mini-4B-Realtime-2602" ]; then - VR_MODE="vr-offline" - fi - source .ci/scripts/export_model_artifact.sh cuda "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}" "$VR_MODE" + source .ci/scripts/export_model_artifact.sh cuda "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}" test-model-cuda-e2e: name: test-model-cuda-e2e @@ -237,7 +232,7 @@ jobs: repo: "google" name: "gemma-3-4b-it" quant: "quantized-int4-weight-only" - # Voxtral Realtime only supports int4-tile-packed on CUDA (offline mode) + # Voxtral Realtime only supports int4-tile-packed on CUDA - model: repo: "mistralai" name: "Voxtral-Mini-4B-Realtime-2602" @@ -256,12 +251,7 @@ jobs: download-artifact: ${{ matrix.model.repo }}-${{ matrix.model.name }}-cuda-${{ matrix.quant }} ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} script: | - # Voxtral Realtime uses offline mode for CUDA CI (not streaming) - VR_MODE="" - if [ "${{ matrix.model.name }}" = "Voxtral-Mini-4B-Realtime-2602" ]; then - VR_MODE="vr-offline" - fi - source .ci/scripts/test_model_e2e.sh cuda "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}" "$VR_MODE" + source .ci/scripts/test_model_e2e.sh cuda "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}" test-cuda-pybind: name: test-cuda-pybind