4 changes: 2 additions & 2 deletions .ci/scripts/export_model_artifact.sh
@@ -184,7 +184,7 @@ case "$HF_MODEL" in
PREPROCESSOR_FEATURE_SIZE=""
PREPROCESSOR_OUTPUT=""
;;
-SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4)
+SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4)
MODEL_NAME="qwen3_5_moe"
TASK=""
MAX_SEQ_LEN=""
@@ -194,7 +194,7 @@ case "$HF_MODEL" in
;;
*)
echo "Error: Unsupported model '$HF_MODEL'"
-echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4"
+echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4"
exit 1
;;
esac
4 changes: 2 additions & 2 deletions .ci/scripts/test_model_e2e.sh
@@ -216,7 +216,7 @@ case "$HF_MODEL" in
AUDIO_FILE="test_audio.wav"
IMAGE_PATH=""
;;
-SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4)
+SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4)
MODEL_NAME="qwen3_5_moe"
RUNNER_TARGET="qwen3_5_moe_runner"
RUNNER_PATH="qwen3_5_moe"
@@ -230,7 +230,7 @@ case "$HF_MODEL" in
;;
*)
echo "Error: Unsupported model '$HF_MODEL'"
-echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/diar_streaming_sortformer_4spk-v2, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4"
+echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/diar_streaming_sortformer_4spk-v2, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4"
exit 1
;;
esac
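The rename in these two scripts can be sanity-checked with a minimal reproduction of the updated `case` arm. This is an illustrative sketch, not the full script; the model ID and the `MODEL_NAME` value are taken from the diff.

```shell
# Reproduce the renamed case arm and confirm the new model ID is accepted.
HF_MODEL="SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4"

case "$HF_MODEL" in
  SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4)
    MODEL_NAME="qwen3_5_moe"   # same value both CI scripts set for this arm
    ;;
  *)
    echo "Error: Unsupported model '$HF_MODEL'"
    exit 1
    ;;
esac

echo "$MODEL_NAME"
```

With this change, the old ID `SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4` falls through to the error branch, which is the point of the rename.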
16 changes: 8 additions & 8 deletions .github/workflows/cuda.yml
@@ -180,7 +180,7 @@ jobs:
- repo: "facebook"
name: "dinov2-small-imagenet1k-1-layer"
- repo: "SocialLocalMobile"
-name: "Qwen3.6-35B-A3B-HQQ-INT4"
+name: "Qwen3.5-35B-A3B-HQQ-INT4"
quant:
- "non-quantized"
- "quantized-int4-tile-packed"
@@ -194,11 +194,11 @@
# Qwen3.5 MoE uses a prequantized checkpoint, only tile-packed
- model:
repo: "SocialLocalMobile"
-name: "Qwen3.6-35B-A3B-HQQ-INT4"
+name: "Qwen3.5-35B-A3B-HQQ-INT4"
quant: "non-quantized"
- model:
repo: "SocialLocalMobile"
-name: "Qwen3.6-35B-A3B-HQQ-INT4"
+name: "Qwen3.5-35B-A3B-HQQ-INT4"
quant: "quantized-int4-weight-only"
# Voxtral Realtime only supports int4-tile-packed on CUDA
- model:
@@ -254,7 +254,7 @@ jobs:
with:
timeout: 90
secrets-env: EXECUTORCH_HF_TOKEN
-runner: ${{ matrix.model.name == 'Qwen3.6-35B-A3B-HQQ-INT4' && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }}
+runner: ${{ matrix.model.name == 'Qwen3.5-35B-A3B-HQQ-INT4' && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }}
gpu-arch-type: cuda
gpu-arch-version: 12.6
use-custom-docker-registry: false
@@ -310,7 +310,7 @@ jobs:
- repo: "facebook"
name: "dinov2-small-imagenet1k-1-layer"
- repo: "SocialLocalMobile"
-name: "Qwen3.6-35B-A3B-HQQ-INT4"
+name: "Qwen3.5-35B-A3B-HQQ-INT4"
quant:
- "non-quantized"
- "quantized-int4-tile-packed"
@@ -324,11 +324,11 @@
# Qwen3.5 MoE uses a prequantized checkpoint, only tile-packed
- model:
repo: "SocialLocalMobile"
-name: "Qwen3.6-35B-A3B-HQQ-INT4"
+name: "Qwen3.5-35B-A3B-HQQ-INT4"
quant: "non-quantized"
- model:
repo: "SocialLocalMobile"
-name: "Qwen3.6-35B-A3B-HQQ-INT4"
+name: "Qwen3.5-35B-A3B-HQQ-INT4"
quant: "quantized-int4-weight-only"
# Voxtral Realtime only supports int4-tile-packed on CUDA
- model:
@@ -378,7 +378,7 @@ jobs:
quant: "non-quantized"
with:
timeout: 90
-runner: ${{ matrix.model.name == 'Qwen3.6-35B-A3B-HQQ-INT4' && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }}
+runner: ${{ matrix.model.name == 'Qwen3.5-35B-A3B-HQQ-INT4' && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }}
gpu-arch-type: cuda
gpu-arch-version: 12.6
use-custom-docker-registry: false
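The workflow's runner-selection line uses GitHub Actions' `&&`/`||` ternary idiom. Its effect can be sketched in plain shell; the runner labels are copied from the diff, while the variable name is illustrative.

```shell
# Mirrors: ${{ matrix.model.name == 'Qwen3.5-35B-A3B-HQQ-INT4' && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }}
MODEL_NAME="Qwen3.5-35B-A3B-HQQ-INT4"

if [ "$MODEL_NAME" = "Qwen3.5-35B-A3B-HQQ-INT4" ]; then
  RUNNER="linux.aws.a100"              # the 35B MoE model is routed to an A100 runner
else
  RUNNER="linux.g5.4xlarge.nvidia.gpu" # default GPU runner for all other models
fi

echo "$RUNNER"
```

Because the comparison is against the new model name, the rename had to be applied in both the matrix entries and the runner expression, or the large model would silently fall back to the smaller default runner.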
32 changes: 7 additions & 25 deletions examples/models/qwen3_5_moe/README.md
@@ -30,24 +30,6 @@
Export produces a `model.pte` and `aoti_cuda_blob.ptd` containing the
compiled CUDA kernels and quantized weights. Int4 quantization is
recommended — the model is too large to fit in VRAM at bf16.

-### Quick start: prequantized weights
-
-The fastest path is to export from prequantized weights, which skips
-the slow quantization step entirely.
-
-Prequantized checkpoints are available for download:
-- [SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4](https://huggingface.co/SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4)
-- [SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4](https://huggingface.co/SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4)
-
-```bash
-python export.py --prequantized <path-to-bundle>
-```
-
-See [Generating Prequantized Weights](#generating-prequantized-weights)
-to create your own.
-
### Quantize and Export

```bash
python export.py \
--model-id Qwen/Qwen3.5-35B-A3B \
@@ -78,7 +60,7 @@ python export.py \
| `--qlinear-group-size` | `32` | Group size for linear quantization |
| `--qembedding` | (none) | Embedding quantization: `8w` |
| `--hqq` | off | Use HQQ scale-only optimization for expert quantization (slower, better accuracy) |
-| `--prequantized` | (none) | Path to prequantized checkpoint directory (skips quantization) |
+| `--prequantized` | (none) | Path to prequantized bundle directory (skips quantization) |
| `--turboquant` | off | Enable TurboQuant TQ4 KV cache compression (3.8x cache savings) |

### TurboQuant KV Cache Compression
@@ -90,11 +72,11 @@
KV cache compression (3.8x savings) on the 10 full-attention layers.
python export.py --prequantized qwen35_moe_int4_hqq --turboquant
```

-### Generating Prequantized Weights
+### Prequantized Export

Quantization is slow (~30 min with HQQ). To avoid re-quantizing on every
-export, use `quantize_and_save.py` to create a prequantized checkpoint
-directory, then export from it:
+export, use `quantize_and_save.py` to create a self-contained bundle, then
+export from it:

```bash
# Step 1: Quantize once (slow)
@@ -106,13 +88,13 @@
python quantize_and_save.py \
--hqq \
--output qwen35_moe_int4_hqq

-# Step 2: Export from prequantized checkpoint (fast, no --model-dir needed)
+# Step 2: Export from bundle (fast, no --model-dir needed)
python export.py \
--prequantized qwen35_moe_int4_hqq
```

-The output directory contains `model.safetensors`, `config.json`, and
-tokenizer files. It can be uploaded to HuggingFace Hub for easy sharing.
+The bundle contains `model.safetensors`, `config.json`, and tokenizer files.
+It can be uploaded to HuggingFace Hub for easy sharing.

## Build

11 changes: 0 additions & 11 deletions examples/models/qwen3_6_moe/README.md

This file was deleted.
