From dd34d3586b3fb00c85691419d9192dcdf3fcc81a Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Fri, 17 Oct 2025 17:19:20 -0700 Subject: [PATCH 1/8] Bump transformers pin to 4.56.1 --- .ci/docker/requirements-ci.txt | 2 +- requirements-examples.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt index 4d2fb63122f..d16b91cc7a3 100644 --- a/.ci/docker/requirements-ci.txt +++ b/.ci/docker/requirements-ci.txt @@ -6,7 +6,7 @@ sympy==1.12 timm==0.6.13 tomli==2.0.1 torchsr==1.0.4 -transformers==4.47.1 +transformers==4.56.1 zstd==1.5.5.1 pandas>=2.2.2; python_version >= '3.10' pytest==7.2.0 diff --git a/requirements-examples.txt b/requirements-examples.txt index 0923cf8fefc..368159f96e9 100644 --- a/requirements-examples.txt +++ b/requirements-examples.txt @@ -4,4 +4,4 @@ datasets == 3.6.0 # 4.0.0 deprecates trust_remote_code and load scripts. For now timm == 1.0.7 torchsr == 1.0.4 torchtune >= 0.6.1 -transformers == 4.53.1 +transformers == 4.56.1 From 496de652691bd6d72c550025bd0a84c51aa6406f Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Sun, 19 Oct 2025 11:59:09 -0700 Subject: [PATCH 2/8] Remove unnecessary phi-3-mini export code --- .ci/scripts/test_phi_3_mini.sh | 23 ++- .github/workflows/pull.yml | 7 +- examples/models/phi-3-mini/README.md | 28 +-- examples/models/phi-3-mini/eager.py | 118 ------------ .../models/phi-3-mini/export_phi-3-mini.py | 168 ------------------ .../models/phi-3-mini/install_requirements.sh | 5 + examples/models/phi-3-mini/main.cpp | 5 +- examples/models/phi-3-mini/phi_3_mini.py | 41 ----- examples/models/phi-3-mini/static_cache.py | 43 ----- 9 files changed, 41 insertions(+), 397 deletions(-) delete mode 100644 examples/models/phi-3-mini/eager.py delete mode 100644 examples/models/phi-3-mini/export_phi-3-mini.py delete mode 100644 examples/models/phi-3-mini/phi_3_mini.py delete mode 100644 examples/models/phi-3-mini/static_cache.py diff --git a/.ci/scripts/test_phi_3_mini.sh b/.ci/scripts/test_phi_3_mini.sh index 289263ace37..24ba4e0a1b5 100644 --- a/.ci/scripts/test_phi_3_mini.sh +++ b/.ci/scripts/test_phi_3_mini.sh @@ -36,34 +36,33 @@ cmake_build_phi_3_mini() { cmake --build ${BUILD_DIR}/${MODEL_DIR} -j${NPROC} --config ${BUILD_TYPE} } -# Download and convert tokenizer.model +# Download tokenizer.model prepare_tokenizer() { - echo "Downloading and converting tokenizer.model" - wget -O tokenizer.model "https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/resolve/main/tokenizer.model?download=true" - $PYTHON_EXECUTABLE -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin + echo "Downloading tokenizer.model" + wget -O tokenizer.model "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/resolve/main/tokenizer.model?download=true" } # Export phi-3-mini model to pte export_phi_3_mini () { echo "Exporting phi-3-mini. This will take a few minutes" - $PYTHON_EXECUTABLE -m executorch.examples.models.phi-3-mini.export_phi-3-mini -c "4k" -s 128 -o phi-3-mini.pte + optimum-cli export executorch --model microsoft/Phi-3-mini-4k-instruct --task text-generation --recipe xnnpack --output_dir ./ } run_and_verify() { NOW=$(date +"%H:%M:%S") echo "Starting to run phi-3-mini runner at ${NOW}" - if [[ ! -f "phi-3-mini.pte" ]]; then - echo "Export failed. Abort" + if [[ ! -f "model.pte" ]]; then + echo "Missing model artifact. Abort" exit 1 fi - if [[ ! -f "tokenizer.bin" ]]; then - echo "tokenizer.bin is missing." + if [[ ! 
-f "tokenizer.model" ]]; then + echo "tokenizer.model is missing." exit 1 fi ${BUILD_DIR}/${MODEL_DIR}/phi_3_mini_runner \ - --model_path=phi-3-mini.pte \ - --tokenizer_path=tokenizer.bin \ + --model_path=model.pte \ + --tokenizer_path=tokenizer.model \ --seq_len=60 \ --temperature=0 \ --prompt="<|system|> @@ -92,7 +91,7 @@ What is the capital of France?<|end|> cmake_install_executorch_libraries cmake_build_phi_3_mini -# Step 2. Export the tokenizer and model +# Step 2. Export the model prepare_tokenizer export_phi_3_mini diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index c96b85740bc..6f4afe3e52b 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -632,11 +632,14 @@ jobs: # The generic Linux job chooses to use base env, not the one setup by the image CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" - + echo "::group::Setup ExecuTorch" PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "cmake" - + echo "::endgroup::" + + echo "::group::Setup requirements" # install phi-3-mini requirements bash examples/models/phi-3-mini/install_requirements.sh + echo "::endgroup::" # run e2e (export, tokenizer and runner) PYTHON_EXECUTABLE=python bash .ci/scripts/test_phi_3_mini.sh Release diff --git a/examples/models/phi-3-mini/README.md b/examples/models/phi-3-mini/README.md index b1a78f26954..86160e0b39a 100644 --- a/examples/models/phi-3-mini/README.md +++ b/examples/models/phi-3-mini/README.md @@ -1,24 +1,32 @@ # Summary -This example demonstrates how to run a [Phi-3-mini](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct) 3.8B model via ExecuTorch. We use XNNPACK to accelarate the performance and XNNPACK symmetric per channel quantization. +This example demonstrates how to run a [Phi-3-mini](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) 3.8B model via ExecuTorch. We use XNNPACK to accelarate the performance and XNNPACK symmetric per channel quantization. # Instructions ## Step 1: Setup 1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. For installation run `./install_executorch.sh` -2. Currently, we support transformers v4.53.1. Install transformers with the following command: +2. Currently, we support transformers v4.56.1. Install transformers with the following command: ``` -pip uninstall -y transformers ; pip install transformers==4.53.1 +pip uninstall -y transformers ; pip install transformers==4.56.1 ``` +3. Install `optimum-executorch`: + +``` +OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt) +pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION} +``` + ## Step 2: Prepare and run the model -1. Download the `tokenizer.model` from HuggingFace and create `tokenizer.bin`. +1. Download the `tokenizer.model` from HuggingFace. ``` cd executorch -wget -O tokenizer.model "https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/resolve/main/tokenizer.model?download=true" -python -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin +wget -O tokenizer.model "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/resolve/main/tokenizer.model?download=true" ``` 2. Export the model. This step will take a few minutes to finish. 
``` -python -m executorch.examples.models.phi-3-mini.export_phi-3-mini -c "4k" -s 128 -o phi-3-mini.pte +optimum-cli export executorch --model microsoft/Phi-3-mini-4k-instruct --task text-generation --recipe xnnpack --qlinear 8da4w --qembedding 8w --output_dir ./ ``` +The model artifact `model.pte` size is about 2.0GB. + 3. Build and run the model. - Build executorch with LLM preset: ``` @@ -38,9 +46,9 @@ cmake --build cmake-out/examples/models/phi-3-mini -j16 --config Release - Run model. Options available [here](https://github.com/pytorch/executorch/blob/main/examples/models/phi-3-mini/main.cpp#L16-L33) ``` cmake-out/examples/models/phi-3-mini/phi_3_mini_runner \ - --model_path=phi-3-mini.pte \ - --tokenizer_path=tokenizer.bin \ - --seq_len=128 \ + --model_path=model.pte \ + --tokenizer_path=tokenizer.model \ + --seq_len=60 \ --temperature=0 \ --prompt="<|system|> You are a helpful assistant.<|end|> diff --git a/examples/models/phi-3-mini/eager.py b/examples/models/phi-3-mini/eager.py deleted file mode 100644 index 8b57b5a24c9..00000000000 --- a/examples/models/phi-3-mini/eager.py +++ /dev/null @@ -1,118 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - - -# Script to run phi-3-mini model in eager mode. - -import argparse -import time - -import torch - -from transformers import AutoTokenizer, Phi3ForCausalLM - -from .phi_3_mini import Phi3Mini - -end_of_text_token = 32000 - - -def _generate_token(args, model, prompt_tokens): - current_token = 0 - generated_tokens = [] - - print("Generating tokens:", end="", flush=True) - - while current_token != end_of_text_token and len(generated_tokens) < args.seq_len: - outputs = model.forward(input_ids=prompt_tokens) - current_token = torch.argmax(outputs.logits[:, -1, :], dim=-1).item() - print(f" {current_token}", end="", flush=True) - generated_tokens.append(current_token) - prompt_tokens = torch.cat( - [prompt_tokens, torch.tensor([[current_token]], dtype=torch.long)], dim=-1 - ) - - print("", flush=True) - - return generated_tokens - - -def _generate_token_with_kv_cache(args, model, prompt_tokens): - print("Generating tokens:", end="", flush=True) - - model = Phi3Mini(model, 1, args.seq_len + prompt_tokens.shape[-1]) - result = model.forward(input_ids=prompt_tokens) - - current_token = torch.argmax(result, dim=-1).item() - print(f" {current_token}", end="", flush=True) - generated_tokens = [current_token] - - while current_token != end_of_text_token and len(generated_tokens) < args.seq_len: - result = model.forward( - input_ids=torch.tensor([[current_token]], dtype=torch.long), - ) - current_token = torch.argmax(result, dim=-1).item() - print(f" {current_token}", end="", flush=True) - generated_tokens.append(current_token) - - print("", flush=True) - - return generated_tokens - - -def main(args): - seed = 42 - torch.manual_seed(seed) - model_name = "microsoft/Phi-3-mini-4k-instruct" - model = Phi3ForCausalLM.from_pretrained(model_name) - tokenizer = AutoTokenizer.from_pretrained(model_name) - - tokens = tokenizer.encode(args.prompt, return_tensors="pt") - - start = time.time() - generated_tokens = ( - _generate_token_with_kv_cache(args, model, tokens) - if args.use_kv_cache - else _generate_token(args, model, tokens) - ) - end = time.time() - - print( - "Generated response: \n {}".format( - tokenizer.decode( - generated_tokens, - skip_special_tokens=True, - 
clean_up_tokenization_spaces=False, - ) - ), - flush=True, - ) - print(f"Time spent: {end - start}", flush=True) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "-s", - "--seq_len", - type=int, - default=128, - help="Maximum number of tokens to generate", - ) - parser.add_argument( - "-kv", - "--use_kv_cache", - default=False, - action="store_true", - help="Whether or not to use KV cache", - ) - parser.add_argument( - "-p", - "--prompt", - type=str, - default="Tell me a story", - help="Prompt as input for the model", - ) - main(parser.parse_args()) diff --git a/examples/models/phi-3-mini/export_phi-3-mini.py b/examples/models/phi-3-mini/export_phi-3-mini.py deleted file mode 100644 index 017c15f783e..00000000000 --- a/examples/models/phi-3-mini/export_phi-3-mini.py +++ /dev/null @@ -1,168 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - - -import argparse - -import torch - -from executorch.backends.transforms.duplicate_dynamic_quant_chain import ( - DuplicateDynamicQuantChainPass, -) -from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner - -from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import ( - get_symmetric_quantization_config, - XNNPACKQuantizer, -) -from executorch.backends.xnnpack.utils.configs import get_xnnpack_edge_compile_config -from executorch.exir import to_edge_transform_and_lower -from executorch.exir.capture._config import ExecutorchBackendConfig -from executorch.exir.passes import MemoryPlanningPass -from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass -from torch.export import export as torch_export -from torch.nn.attention import SDPBackend -from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e - -from transformers import Phi3ForCausalLM -from transformers.cache_utils import StaticCacheConfig - -from transformers.integrations.executorch import TorchExportableModuleForDecoderOnlyLM - - -def _prepare_export_inputs(max_seq_len: int, sliding_window: int): - """ - Prepare example inputs and configurations for export. - - Returns: - example_input_ids (torch.Tensor): Example input IDs tensor. - example_cache_position (torch.Tensor): Example cache position tensor. - dynamic_shapes (dict or None): Dynamic shape specifications for export. - strict (bool): Whether to use strict export mode. - """ - # Prepare inputs with dynamic shapes - seq_length = 3 # Sequence length > 1 to avoid specialization issues - example_input_ids = torch.zeros((1, seq_length), dtype=torch.long) - example_cache_position = torch.arange(seq_length, dtype=torch.long) - max_dim = min(max_seq_len, sliding_window) - 1 - seq_len_dim = torch.export.Dim("seq_length_dim", max=max_dim) - dynamic_shapes = { - "input_ids": {1: seq_len_dim}, - "cache_position": {0: seq_len_dim}, - } - - return example_input_ids, example_cache_position, dynamic_shapes - - -def export(args) -> None: - torch.manual_seed(0) - - if args.context_length == "4k": - model_name = "microsoft/Phi-3-mini-4k-instruct" - elif args.context_length == "128k": - model_name = "microsoft/Phi-3-mini-128k-instruct" - else: - raise Exception( - f"Invalid context length {args.context_length}. 
Should be either 4k or 128k" - ) - - with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad(): - model = Phi3ForCausalLM.from_pretrained(model_name) - model.generation_config.cache_implementation = "static" - model.generation_config.cache_config = StaticCacheConfig( - batch_size=1, max_cache_len=model.config.max_position_embeddings - ) - - exportable_module = TorchExportableModuleForDecoderOnlyLM( - model, - max_batch_size=1, - max_cache_len=model.config.max_position_embeddings, - ) - input_ids, cache_position, dynamic_shapes = _prepare_export_inputs( - model.config.max_position_embeddings, model.config.sliding_window - ) - example_inputs = (input_ids, cache_position) - exported_program = exportable_module.export( - input_ids, cache_position, dynamic_shapes, strict=False - ) - # Apply RemoveTransposes pass to remove - # any back-to-back transpose ops that are not needed - # e.g. output of update_cache is transposed and - # input to custom_sdpa is transposed. - from executorch.extension.llm.export.export_passes import ( - RemoveRedundantTransposes, - ) - - mutated_gm = RemoveRedundantTransposes()(exported_program.module())[0] - - xnnpack_quant_config = get_symmetric_quantization_config( - is_per_channel=True, is_dynamic=True - ) - xnnpack_quantizer = XNNPACKQuantizer() - xnnpack_quantizer.set_global(xnnpack_quant_config) - - gm = prepare_pt2e(mutated_gm, xnnpack_quantizer) # pyre-fixme[6] - gm(*example_inputs) - gm = convert_pt2e(gm) - DuplicateDynamicQuantChainPass()(gm) - exported_program = torch_export( - gm, example_inputs, dynamic_shapes=dynamic_shapes, strict=False - ) - - edge_config = get_xnnpack_edge_compile_config() - edge_manager = to_edge_transform_and_lower( - exported_program, - partitioner=[XnnpackPartitioner()], - compile_config=edge_config, - constant_methods={ - "get_eos_ids": [32000], - "use_kv_cache": True, - "enable_dynamic_shape": True, - "get_max_seq_len": model.config.max_position_embeddings - 1, - }, - ) - edge_manager = edge_manager.to_backend(XnnpackPartitioner()) - et_program = edge_manager.to_executorch( - ExecutorchBackendConfig( - extract_delegate_segments=True, - do_quant_fusion_and_const_prop=True, - memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False), - sym_shape_eval_pass=ConstraintBasedSymShapeEvalPass(), - ) - ) - - with open(args.output_name, "wb") as file: - file.write(et_program.buffer) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "-c", - "--context_length", - type=str, - default="4k", - choices=["4k", "128k"], - help="Phi-3-mini provides two context length variants: 4k and 128k", - ) - parser.add_argument( - "-s", - "--seq_len", - type=int, - default=128, - help="Maximum number of tokens including prompt to generate", - ) - parser.add_argument( - "-o", - "--output_name", - default="phi-3-mini.pte", - help="Override the output filename of the saved pte model file.", - ) - export(parser.parse_args()) - - -if __name__ == "__main__": - main() diff --git a/examples/models/phi-3-mini/install_requirements.sh b/examples/models/phi-3-mini/install_requirements.sh index dabeab2ba66..92f91e8a58d 100644 --- a/examples/models/phi-3-mini/install_requirements.sh +++ b/examples/models/phi-3-mini/install_requirements.sh @@ -9,4 +9,9 @@ set -x pip install sentencepiece +EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." 
&& pwd)" + +OPTIMUM_ET_VERSION=$(cat "${EXECUTORCH_ROOT}/.ci/docker/ci_commit_pins/optimum-executorch.txt") +pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION} + pip list diff --git a/examples/models/phi-3-mini/main.cpp b/examples/models/phi-3-mini/main.cpp index cc500511624..a644fe2f81b 100644 --- a/examples/models/phi-3-mini/main.cpp +++ b/examples/models/phi-3-mini/main.cpp @@ -6,9 +6,9 @@ * LICENSE file in the root directory of this source tree. */ +#include #include #include -#include #include using executorch::extension::llm::TextLLMRunner; @@ -46,8 +46,7 @@ int main(int32_t argc, char** argv) { int32_t seq_len = FLAGS_seq_len; std::unique_ptr tokenizer = - std::make_unique(); - tokenizer->load(tokenizer_path); + executorch::extension::llm::load_tokenizer(tokenizer_path); auto runner = executorch::extension::llm::create_text_llm_runner( model_path, std::move(tokenizer)); diff --git a/examples/models/phi-3-mini/phi_3_mini.py b/examples/models/phi-3-mini/phi_3_mini.py deleted file mode 100644 index f355beb882a..00000000000 --- a/examples/models/phi-3-mini/phi_3_mini.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - - -import torch.nn -from transformers import Phi3ForCausalLM - -from .static_cache import ETStaticCache - - -class Phi3Mini(torch.nn.Module): - - def __init__(self, model: Phi3ForCausalLM, max_batch_size: int, max_seq_len: int): - super().__init__() - self.model = model - self.cache = ETStaticCache( - # pyre-fixme[16]: `Phi3ForCausalLM` has no attribute `config`. - config=model.config, - max_batch_size=max_batch_size, - max_cache_len=max_seq_len, - # pyre-fixme[16]: `Phi3ForCausalLM` has no attribute `device`. - device=self.model.device, - # pyre-fixme[16]: `Phi3ForCausalLM` has no attribute `dtype`. - dtype=self.model.dtype, - ) - - def forward( - self, - # pyre-fixme[9]: input_ids has type `LongTensor`; used as `None`. - input_ids: torch.LongTensor, - ) -> torch.FloatTensor: - # pyre-fixme[16]: `Phi3ForCausalLM` has no attribute `forward`. - return self.model.forward( - input_ids=input_ids, - use_cache=True, - return_dict=True, - past_key_values=self.cache, - ).logits[:, -1, :] diff --git a/examples/models/phi-3-mini/static_cache.py b/examples/models/phi-3-mini/static_cache.py deleted file mode 100644 index baf66ac2d17..00000000000 --- a/examples/models/phi-3-mini/static_cache.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - - -from typing import Optional - -import torch -from transformers import PretrainedConfig, StaticCache - - -class ETStaticCache(StaticCache): - """ - A customized static cache implementation, which overrides a few methods to make it exportable to ExecuTorch. - This can be removed once transformers supports static cache for Phi3 properly. 
- """ - - def __init__( - self, - config: PretrainedConfig, - max_batch_size: int, - max_cache_len: int, - device, - dtype=torch.float32, - ) -> None: - super().__init__( - config=config, - max_batch_size=max_batch_size, - max_cache_len=max_cache_len, - device=device, - dtype=dtype, - ) - - def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: - # pyre-fixme[16]: `ETStaticCache` has no attribute `key_cache`. - return (self.key_cache[layer_idx][0, 0].any(dim=-1)).sum().item() - - def get_usable_length( - self, new_seq_length: int, layer_idx: Optional[int] = 0 - ) -> int: - return self.get_seq_length(layer_idx) From bbd12e6590e60a73e3708eb365d6c227fc85d479 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Sun, 19 Oct 2025 13:43:53 -0700 Subject: [PATCH 3/8] Fix --- examples/models/phi-3-mini/install_requirements.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/models/phi-3-mini/install_requirements.sh b/examples/models/phi-3-mini/install_requirements.sh index 92f91e8a58d..696b21a247f 100644 --- a/examples/models/phi-3-mini/install_requirements.sh +++ b/examples/models/phi-3-mini/install_requirements.sh @@ -9,7 +9,7 @@ set -x pip install sentencepiece -EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." && pwd)" +EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../../.." && pwd)" OPTIMUM_ET_VERSION=$(cat "${EXECUTORCH_ROOT}/.ci/docker/ci_commit_pins/optimum-executorch.txt") pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION} From f1edb82a69c4327196a96cd25484d11eaddeb8c7 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Sun, 19 Oct 2025 14:22:42 -0700 Subject: [PATCH 4/8] Bump optimum-executorch --- .ci/docker/ci_commit_pins/optimum-executorch.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/docker/ci_commit_pins/optimum-executorch.txt b/.ci/docker/ci_commit_pins/optimum-executorch.txt index 3c085a7ef3a..574ccb745d0 100644 --- a/.ci/docker/ci_commit_pins/optimum-executorch.txt +++ b/.ci/docker/ci_commit_pins/optimum-executorch.txt @@ -1 +1 @@ -09fdbd0a0639b128f712a4f5202ed42ca4c60957 +467660923a5a25e4718e1d6697b93ff1bab4e807 From fefd006c9bf5f8b07e0b8f63a5cc17b8ae3b9e55 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Sun, 19 Oct 2025 15:33:50 -0700 Subject: [PATCH 5/8] Bump optimum-executorch --- .ci/docker/ci_commit_pins/optimum-executorch.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/docker/ci_commit_pins/optimum-executorch.txt b/.ci/docker/ci_commit_pins/optimum-executorch.txt index 574ccb745d0..e42ee83cab3 100644 --- a/.ci/docker/ci_commit_pins/optimum-executorch.txt +++ b/.ci/docker/ci_commit_pins/optimum-executorch.txt @@ -1 +1 @@ -467660923a5a25e4718e1d6697b93ff1bab4e807 +e8f76b4295584c4328e7fd7971c131cb341c7438 From fe82a7a2022794fd8064a917806f5b2bdef6c0f3 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Sun, 19 Oct 2025 16:16:58 -0700 Subject: [PATCH 6/8] install accelerate --- examples/models/phi-3-mini/install_requirements.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/models/phi-3-mini/install_requirements.sh b/examples/models/phi-3-mini/install_requirements.sh index 696b21a247f..731a71a75d4 100644 --- a/examples/models/phi-3-mini/install_requirements.sh +++ b/examples/models/phi-3-mini/install_requirements.sh @@ -7,7 +7,7 @@ set -x -pip install sentencepiece +pip install sentencepiece accelerate EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../../.." 
&& pwd)" From 4169cd2a5d2ea4bc8aaaff02f47d4e37ac45ab97 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Sun, 19 Oct 2025 16:47:37 -0700 Subject: [PATCH 7/8] Fix NXP test --- backends/nxp/tests/test_batch_norm_fusion.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backends/nxp/tests/test_batch_norm_fusion.py b/backends/nxp/tests/test_batch_norm_fusion.py index 788d04c6dad..21e70fdbfbe 100644 --- a/backends/nxp/tests/test_batch_norm_fusion.py +++ b/backends/nxp/tests/test_batch_norm_fusion.py @@ -105,7 +105,7 @@ def test_batch_norm_conv_fusing(bias: bool, input_shape: list[int]): og_nodes = list(program.graph.nodes) transformed_nodes = list(graph_module_out.graph.nodes) - assert len(og_nodes) == (11 if bias else 10) + assert len(og_nodes) == (10 if bias else 9) assert og_nodes[9 if bias else 8].target.__name__ == "batch_norm.default" assert len(transformed_nodes) == 5 @@ -139,7 +139,7 @@ def test_batch_norm_linear_fusing(bias: bool): og_nodes = list(og_module.graph.nodes) transformed_nodes = list(graph_module_out.graph.nodes) - assert len(og_nodes) == (11 if bias else 10) + assert len(og_nodes) == (10 if bias else 9) assert og_nodes[8 if bias else 7].target.__name__ == "linear.default" assert len(transformed_nodes) == 5 From 963ee234ab2d4af1491c280e90b555d01e200610 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Sun, 19 Oct 2025 17:16:20 -0700 Subject: [PATCH 8/8] Revert nxp changes --- backends/nxp/tests/test_batch_norm_fusion.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backends/nxp/tests/test_batch_norm_fusion.py b/backends/nxp/tests/test_batch_norm_fusion.py index 21e70fdbfbe..788d04c6dad 100644 --- a/backends/nxp/tests/test_batch_norm_fusion.py +++ b/backends/nxp/tests/test_batch_norm_fusion.py @@ -105,7 +105,7 @@ def test_batch_norm_conv_fusing(bias: bool, input_shape: list[int]): og_nodes = list(program.graph.nodes) transformed_nodes = list(graph_module_out.graph.nodes) - assert len(og_nodes) == (10 if bias else 9) + assert len(og_nodes) == (11 if bias else 10) assert og_nodes[9 if bias else 8].target.__name__ == "batch_norm.default" assert len(transformed_nodes) == 5 @@ -139,7 +139,7 @@ def test_batch_norm_linear_fusing(bias: bool): og_nodes = list(og_module.graph.nodes) transformed_nodes = list(graph_module_out.graph.nodes) - assert len(og_nodes) == (10 if bias else 9) + assert len(og_nodes) == (11 if bias else 10) assert og_nodes[8 if bias else 7].target.__name__ == "linear.default" assert len(transformed_nodes) == 5