From 9948c9c1183476b432807fa923fa526a0ed0cef2 Mon Sep 17 00:00:00 2001
From: Mergen Nachin <mnachin@meta.com>
Date: Thu, 16 Apr 2026 22:02:36 -0400
Subject: [PATCH] =?UTF-8?q?Revert=20"Add=20Qwen=203.6=20MoE=20model=20and?=
 =?UTF-8?q?=20switch=20CI=20to=20Qwen3.6-35B-A3B-HQQ-INT4=20(#18=E2=80=A6"?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

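This reverts commit 7fdd30613ac02521e408d81fcdf8f9892faf9071.

The revert switches the CI scripts and the CUDA workflow back to the
SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4 checkpoint, restores the
previous wording of examples/models/qwen3_5_moe/README.md, and deletes
examples/models/qwen3_6_moe/README.md.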
---
 .ci/scripts/export_model_artifact.sh  |  4 ++--
 .ci/scripts/test_model_e2e.sh         |  4 ++--
 .github/workflows/cuda.yml            | 16 +++++++-------
 examples/models/qwen3_5_moe/README.md | 32 ++++++---------------------
 examples/models/qwen3_6_moe/README.md | 11 ---------
 5 files changed, 19 insertions(+), 48 deletions(-)
 delete mode 100644 examples/models/qwen3_6_moe/README.md

diff --git a/.ci/scripts/export_model_artifact.sh b/.ci/scripts/export_model_artifact.sh
index e25dc509020..f19df233628 100755
--- a/.ci/scripts/export_model_artifact.sh
+++ b/.ci/scripts/export_model_artifact.sh
@@ -184,7 +184,7 @@ case "$HF_MODEL" in
     PREPROCESSOR_FEATURE_SIZE=""
     PREPROCESSOR_OUTPUT=""
     ;;
-  SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4)
+  SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4)
     MODEL_NAME="qwen3_5_moe"
     TASK=""
     MAX_SEQ_LEN=""
@@ -194,7 +194,7 @@ case "$HF_MODEL" in
     ;;
   *)
     echo "Error: Unsupported model '$HF_MODEL'"
-    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4"
+    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4"
     exit 1
     ;;
 esac
diff --git a/.ci/scripts/test_model_e2e.sh b/.ci/scripts/test_model_e2e.sh
index 646fe4b80c9..5cee37b19cf 100755
--- a/.ci/scripts/test_model_e2e.sh
+++ b/.ci/scripts/test_model_e2e.sh
@@ -216,7 +216,7 @@ case "$HF_MODEL" in
     AUDIO_FILE="test_audio.wav"
     IMAGE_PATH=""
     ;;
-  SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4)
+  SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4)
     MODEL_NAME="qwen3_5_moe"
     RUNNER_TARGET="qwen3_5_moe_runner"
     RUNNER_PATH="qwen3_5_moe"
@@ -230,7 +230,7 @@ case "$HF_MODEL" in
     ;;
   *)
     echo "Error: Unsupported model '$HF_MODEL'"
-    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/diar_streaming_sortformer_4spk-v2, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4"
+    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/diar_streaming_sortformer_4spk-v2, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4"
     exit 1
     ;;
 esac
diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
index 21e4c155e8e..68ded356b99 100644
--- a/.github/workflows/cuda.yml
+++ b/.github/workflows/cuda.yml
@@ -180,7 +180,7 @@ jobs:
           - repo: "facebook"
             name: "dinov2-small-imagenet1k-1-layer"
           - repo: "SocialLocalMobile"
-            name: "Qwen3.6-35B-A3B-HQQ-INT4"
+            name: "Qwen3.5-35B-A3B-HQQ-INT4"
         quant:
           - "non-quantized"
           - "quantized-int4-tile-packed"
@@ -194,11 +194,11 @@ jobs:
           # Qwen3.5 MoE uses a prequantized checkpoint, only tile-packed
           - model:
               repo: "SocialLocalMobile"
-              name: "Qwen3.6-35B-A3B-HQQ-INT4"
+              name: "Qwen3.5-35B-A3B-HQQ-INT4"
             quant: "non-quantized"
           - model:
               repo: "SocialLocalMobile"
-              name: "Qwen3.6-35B-A3B-HQQ-INT4"
+              name: "Qwen3.5-35B-A3B-HQQ-INT4"
             quant: "quantized-int4-weight-only"
           # Voxtral Realtime only supports int4-tile-packed on CUDA
           - model:
@@ -254,7 +254,7 @@ jobs:
     with:
       timeout: 90
       secrets-env: EXECUTORCH_HF_TOKEN
-      runner: ${{ matrix.model.name == 'Qwen3.6-35B-A3B-HQQ-INT4' && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }}
+      runner: ${{ matrix.model.name == 'Qwen3.5-35B-A3B-HQQ-INT4' && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }}
       gpu-arch-type: cuda
       gpu-arch-version: 12.6
       use-custom-docker-registry: false
@@ -310,7 +310,7 @@ jobs:
           - repo: "facebook"
             name: "dinov2-small-imagenet1k-1-layer"
           - repo: "SocialLocalMobile"
-            name: "Qwen3.6-35B-A3B-HQQ-INT4"
+            name: "Qwen3.5-35B-A3B-HQQ-INT4"
         quant:
           - "non-quantized"
           - "quantized-int4-tile-packed"
@@ -324,11 +324,11 @@ jobs:
           # Qwen3.5 MoE uses a prequantized checkpoint, only tile-packed
           - model:
               repo: "SocialLocalMobile"
-              name: "Qwen3.6-35B-A3B-HQQ-INT4"
+              name: "Qwen3.5-35B-A3B-HQQ-INT4"
             quant: "non-quantized"
           - model:
               repo: "SocialLocalMobile"
-              name: "Qwen3.6-35B-A3B-HQQ-INT4"
+              name: "Qwen3.5-35B-A3B-HQQ-INT4"
             quant: "quantized-int4-weight-only"
           # Voxtral Realtime only supports int4-tile-packed on CUDA
           - model:
@@ -378,7 +378,7 @@ jobs:
             quant: "non-quantized"
     with:
       timeout: 90
-      runner: ${{ matrix.model.name == 'Qwen3.6-35B-A3B-HQQ-INT4' && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }}
+      runner: ${{ matrix.model.name == 'Qwen3.5-35B-A3B-HQQ-INT4' && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }}
       gpu-arch-type: cuda
       gpu-arch-version: 12.6
       use-custom-docker-registry: false
diff --git a/examples/models/qwen3_5_moe/README.md b/examples/models/qwen3_5_moe/README.md
index 4296cf9f122..83373a804f4 100644
--- a/examples/models/qwen3_5_moe/README.md
+++ b/examples/models/qwen3_5_moe/README.md
@@ -30,24 +30,6 @@ Export produces a `model.pte` and `aoti_cuda_blob.ptd` containing the
 compiled CUDA kernels and quantized weights. Int4 quantization is
 recommended — the model is too large to fit in VRAM at bf16.
 
-### Quick start: prequantized weights
-
-The fastest path is to export from prequantized weights, which skips
-the slow quantization step entirely.
-
-Prequantized checkpoints are available for download:
-- [SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4](https://huggingface.co/SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4)
-- [SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4](https://huggingface.co/SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4)
-
-```bash
-python export.py --prequantized <path-to-bundle>
-```
-
-See [Generating Prequantized Weights](#generating-prequantized-weights)
-to create your own.
-
-### Quantize and Export
-
 ```bash
 python export.py \
     --model-id Qwen/Qwen3.5-35B-A3B \
@@ -78,7 +60,7 @@ python export.py \
 | `--qlinear-group-size` | `32` | Group size for linear quantization |
 | `--qembedding` | (none) | Embedding quantization: `8w` |
 | `--hqq` | off | Use HQQ scale-only optimization for expert quantization (slower, better accuracy) |
-| `--prequantized` | (none) | Path to prequantized checkpoint directory (skips quantization) |
+| `--prequantized` | (none) | Path to prequantized bundle directory (skips quantization) |
 | `--turboquant` | off | Enable TurboQuant TQ4 KV cache compression (3.8x cache savings) |
 
 ### TurboQuant KV Cache Compression
@@ -90,11 +72,11 @@ KV cache compression (3.8x savings) on the 10 full-attention layers.
 python export.py --prequantized qwen35_moe_int4_hqq --turboquant
 ```
 
-### Generating Prequantized Weights
+### Prequantized Export
 
 Quantization is slow (~30 min with HQQ). To avoid re-quantizing on every
-export, use `quantize_and_save.py` to create a prequantized checkpoint
-directory, then export from it:
+export, use `quantize_and_save.py` to create a self-contained bundle, then
+export from it:
 
 ```bash
 # Step 1: Quantize once (slow)
@@ -106,13 +88,13 @@ python quantize_and_save.py \
     --hqq \
     --output qwen35_moe_int4_hqq
 
-# Step 2: Export from prequantized checkpoint (fast, no --model-dir needed)
+# Step 2: Export from bundle (fast, no --model-dir needed)
 python export.py \
     --prequantized qwen35_moe_int4_hqq
 ```
 
-The output directory contains `model.safetensors`, `config.json`, and
-tokenizer files. It can be uploaded to HuggingFace Hub for easy sharing.
+The bundle contains `model.safetensors`, `config.json`, and tokenizer files.
+It can be uploaded to HuggingFace Hub for easy sharing.
 
 ## Build
 
diff --git a/examples/models/qwen3_6_moe/README.md b/examples/models/qwen3_6_moe/README.md
deleted file mode 100644
index 70d38298f83..00000000000
--- a/examples/models/qwen3_6_moe/README.md
+++ /dev/null
@@ -1,11 +0,0 @@
-# Qwen 3.6 MoE
-
-Qwen 3.6 MoE uses the same architecture and runner as Qwen 3.5 MoE.
-See [examples/models/qwen3_5_moe](../qwen3_5_moe/) for export, build,
-and inference instructions.
-
-Prequantized INT4 weights are available at
-[SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4](https://huggingface.co/SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4).
-
-**Note:** This model has not been tested or evaluated. It is provided
-mainly for development purposes.