From 9948c9c1183476b432807fa923fa526a0ed0cef2 Mon Sep 17 00:00:00 2001 From: Mergen Nachin Date: Thu, 16 Apr 2026 22:02:36 -0400 Subject: [PATCH] =?UTF-8?q?Revert=20"Add=20Qwen=203.6=20MoE=20model=20and?= =?UTF-8?q?=20switch=20CI=20to=20Qwen3.6-35B-A3B-HQQ-INT4=20(#18=E2=80=A6"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 7fdd30613ac02521e408d81fcdf8f9892faf9071. --- .ci/scripts/export_model_artifact.sh | 4 ++-- .ci/scripts/test_model_e2e.sh | 4 ++-- .github/workflows/cuda.yml | 16 +++++++------- examples/models/qwen3_5_moe/README.md | 32 ++++++--------------------- examples/models/qwen3_6_moe/README.md | 11 --------- 5 files changed, 19 insertions(+), 48 deletions(-) delete mode 100644 examples/models/qwen3_6_moe/README.md diff --git a/.ci/scripts/export_model_artifact.sh b/.ci/scripts/export_model_artifact.sh index e25dc509020..f19df233628 100755 --- a/.ci/scripts/export_model_artifact.sh +++ b/.ci/scripts/export_model_artifact.sh @@ -184,7 +184,7 @@ case "$HF_MODEL" in PREPROCESSOR_FEATURE_SIZE="" PREPROCESSOR_OUTPUT="" ;; - SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4) + SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4) MODEL_NAME="qwen3_5_moe" TASK="" MAX_SEQ_LEN="" @@ -194,7 +194,7 @@ case "$HF_MODEL" in ;; *) echo "Error: Unsupported model '$HF_MODEL'" - echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4" + echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4" exit 1 ;; esac diff --git a/.ci/scripts/test_model_e2e.sh b/.ci/scripts/test_model_e2e.sh index 646fe4b80c9..5cee37b19cf 100755 --- a/.ci/scripts/test_model_e2e.sh +++ b/.ci/scripts/test_model_e2e.sh @@ -216,7 +216,7 @@ case "$HF_MODEL" in AUDIO_FILE="test_audio.wav" IMAGE_PATH="" ;; - SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4) + SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4) MODEL_NAME="qwen3_5_moe" RUNNER_TARGET="qwen3_5_moe_runner" RUNNER_PATH="qwen3_5_moe" @@ -230,7 +230,7 @@ case "$HF_MODEL" in ;; *) echo "Error: Unsupported model '$HF_MODEL'" - echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/diar_streaming_sortformer_4spk-v2, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4" + echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/diar_streaming_sortformer_4spk-v2, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4" exit 1 ;; esac diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index 21e4c155e8e..68ded356b99 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -180,7 +180,7 @@ jobs: - repo: "facebook" name: "dinov2-small-imagenet1k-1-layer" - repo: "SocialLocalMobile" - name: "Qwen3.6-35B-A3B-HQQ-INT4" + name: "Qwen3.5-35B-A3B-HQQ-INT4" quant: - "non-quantized" - "quantized-int4-tile-packed" @@ -194,11 +194,11 @@ jobs: # Qwen3.5 MoE uses a prequantized checkpoint, only tile-packed - model: repo: "SocialLocalMobile" - name: "Qwen3.6-35B-A3B-HQQ-INT4" + name: "Qwen3.5-35B-A3B-HQQ-INT4" quant: "non-quantized" - model: repo: "SocialLocalMobile" - name: "Qwen3.6-35B-A3B-HQQ-INT4" + name: "Qwen3.5-35B-A3B-HQQ-INT4" quant: "quantized-int4-weight-only" # Voxtral Realtime only supports int4-tile-packed on CUDA - model: @@ -254,7 +254,7 @@ jobs: with: timeout: 90 secrets-env: EXECUTORCH_HF_TOKEN - runner: ${{ matrix.model.name == 'Qwen3.6-35B-A3B-HQQ-INT4' && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }} + runner: ${{ matrix.model.name == 'Qwen3.5-35B-A3B-HQQ-INT4' && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }} gpu-arch-type: cuda gpu-arch-version: 12.6 use-custom-docker-registry: false @@ -310,7 +310,7 @@ jobs: - repo: "facebook" name: "dinov2-small-imagenet1k-1-layer" - repo: "SocialLocalMobile" - name: "Qwen3.6-35B-A3B-HQQ-INT4" + name: "Qwen3.5-35B-A3B-HQQ-INT4" quant: - "non-quantized" - "quantized-int4-tile-packed" @@ -324,11 +324,11 @@ jobs: # Qwen3.5 MoE uses a prequantized checkpoint, only tile-packed - model: repo: "SocialLocalMobile" - name: "Qwen3.6-35B-A3B-HQQ-INT4" + name: "Qwen3.5-35B-A3B-HQQ-INT4" quant: "non-quantized" - model: repo: "SocialLocalMobile" - name: "Qwen3.6-35B-A3B-HQQ-INT4" + name: "Qwen3.5-35B-A3B-HQQ-INT4" quant: "quantized-int4-weight-only" # Voxtral Realtime only supports int4-tile-packed on CUDA - model: @@ -378,7 +378,7 @@ jobs: quant: "non-quantized" with: timeout: 90 - runner: ${{ matrix.model.name == 'Qwen3.6-35B-A3B-HQQ-INT4' && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }} + runner: ${{ matrix.model.name == 'Qwen3.5-35B-A3B-HQQ-INT4' && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }} gpu-arch-type: cuda gpu-arch-version: 12.6 use-custom-docker-registry: false diff --git a/examples/models/qwen3_5_moe/README.md b/examples/models/qwen3_5_moe/README.md index 4296cf9f122..83373a804f4 100644 --- a/examples/models/qwen3_5_moe/README.md +++ b/examples/models/qwen3_5_moe/README.md @@ -30,24 +30,6 @@ Export produces a `model.pte` and `aoti_cuda_blob.ptd` containing the compiled CUDA kernels and quantized weights. Int4 quantization is recommended — the model is too large to fit in VRAM at bf16. -### Quick start: prequantized weights - -The fastest path is to export from prequantized weights, which skips -the slow quantization step entirely. - -Prequantized checkpoints are available for download: -- [SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4](https://huggingface.co/SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4) -- [SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4](https://huggingface.co/SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4) - -```bash -python export.py --prequantized -``` - -See [Generating Prequantized Weights](#generating-prequantized-weights) -to create your own. - -### Quantize and Export - ```bash python export.py \ --model-id Qwen/Qwen3.5-35B-A3B \ @@ -78,7 +60,7 @@ python export.py \ | `--qlinear-group-size` | `32` | Group size for linear quantization | | `--qembedding` | (none) | Embedding quantization: `8w` | | `--hqq` | off | Use HQQ scale-only optimization for expert quantization (slower, better accuracy) | -| `--prequantized` | (none) | Path to prequantized checkpoint directory (skips quantization) | +| `--prequantized` | (none) | Path to prequantized bundle directory (skips quantization) | | `--turboquant` | off | Enable TurboQuant TQ4 KV cache compression (3.8x cache savings) | ### TurboQuant KV Cache Compression @@ -90,11 +72,11 @@ KV cache compression (3.8x savings) on the 10 full-attention layers. python export.py --prequantized qwen35_moe_int4_hqq --turboquant ``` -### Generating Prequantized Weights +### Prequantized Export Quantization is slow (~30 min with HQQ). To avoid re-quantizing on every -export, use `quantize_and_save.py` to create a prequantized checkpoint -directory, then export from it: +export, use `quantize_and_save.py` to create a self-contained bundle, then +export from it: ```bash # Step 1: Quantize once (slow) @@ -106,13 +88,13 @@ python quantize_and_save.py \ --hqq \ --output qwen35_moe_int4_hqq -# Step 2: Export from prequantized checkpoint (fast, no --model-dir needed) +# Step 2: Export from bundle (fast, no --model-dir needed) python export.py \ --prequantized qwen35_moe_int4_hqq ``` -The output directory contains `model.safetensors`, `config.json`, and -tokenizer files. It can be uploaded to HuggingFace Hub for easy sharing. +The bundle contains `model.safetensors`, `config.json`, and tokenizer files. +It can be uploaded to HuggingFace Hub for easy sharing. ## Build diff --git a/examples/models/qwen3_6_moe/README.md b/examples/models/qwen3_6_moe/README.md deleted file mode 100644 index 70d38298f83..00000000000 --- a/examples/models/qwen3_6_moe/README.md +++ /dev/null @@ -1,11 +0,0 @@ -# Qwen 3.6 MoE - -Qwen 3.6 MoE uses the same architecture and runner as Qwen 3.5 MoE. -See [examples/models/qwen3_5_moe](../qwen3_5_moe/) for export, build, -and inference instructions. - -Prequantized INT4 weights are available at -[SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4](https://huggingface.co/SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4). - -**Note:** This model has not been tested or evaluated. It is provided -mainly for development purposes.