diff --git a/.buckconfig b/.buckconfig index 8ccb8ef4aae..6aaf7221b3e 100644 --- a/.buckconfig +++ b/.buckconfig @@ -23,3 +23,14 @@ [parser] target_platform_detector_spec = target:root//...->prelude//platforms:default target:shim//...->prelude//platforms:default + +# Limit the number of files that the buck daemon needs to monitor. If every +# submodule is cloned recursively, some systems can fail to build with "OS file +# watch limit reached". +[project] + ignore = \ + .git, \ + **/.git, \ + third-party/pytorch/third_party, \ + cmake-out, \ + pip-out diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt index 44632703e32..1fcaede5ad1 100644 --- a/.ci/docker/ci_commit_pins/pytorch.txt +++ b/.ci/docker/ci_commit_pins/pytorch.txt @@ -1 +1 @@ -0a038cf0cff2d071b7359ac0491fd2ba7798a438 +release/2.3 diff --git a/.ci/scripts/gather_test_models.py b/.ci/scripts/gather_test_models.py index ef65b6f9b42..717eff6157d 100755 --- a/.ci/scripts/gather_test_models.py +++ b/.ci/scripts/gather_test_models.py @@ -23,7 +23,7 @@ "w2l": "linux.12xlarge", "ic4": "linux.12xlarge", "resnet50": "linux.12xlarge", - "llava_encoder": "linux.4xlarge", + "llava_encoder": "linux.12xlarge", # This one causes timeout on smaller runner, the root cause is unclear (T161064121) "dl3": "linux.12xlarge", "emformer_join": "linux.12xlarge", diff --git a/.ci/scripts/utils.sh b/.ci/scripts/utils.sh index c7c00be2574..6b543c15267 100644 --- a/.ci/scripts/utils.sh +++ b/.ci/scripts/utils.sh @@ -18,8 +18,11 @@ retry () { install_executorch() { which pip # Install executorch, this assumes that Executorch is checked out in the - # current directory - pip install . --no-build-isolation -v + # current directory. The --extra-index-url options tell pip to look on the + # pytorch servers for nightly and pre-release versions of torch packages. + pip install . 
--no-build-isolation -v \ + --extra-index-url https://download.pytorch.org/whl/test/cpu \ + --extra-index-url https://download.pytorch.org/whl/nightly/cpu # Just print out the list of packages for debugging pip list } diff --git a/.github/workflows/_unittest.yml b/.github/workflows/_unittest.yml index c36c5861168..81a4bd60e9e 100644 --- a/.github/workflows/_unittest.yml +++ b/.github/workflows/_unittest.yml @@ -14,7 +14,7 @@ on: jobs: linux: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.3 strategy: matrix: include: @@ -44,7 +44,7 @@ jobs: pytest -n auto --cov=./ --cov-report=xml macos: - uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + uses: pytorch/test-infra/.github/workflows/macos_job.yml@release/2.3 strategy: matrix: include: diff --git a/.github/workflows/android.yml b/.github/workflows/android.yml index 0d8931cf102..f08aeede385 100644 --- a/.github/workflows/android.yml +++ b/.github/workflows/android.yml @@ -10,7 +10,8 @@ on: - .ci/docker/** - .github/workflows/android.yml - install_requirements.sh - - examples/demo-apps/** + - examples/demo-apps/android/** + - extension/android/** - extension/module/** workflow_dispatch: @@ -33,6 +34,7 @@ jobs: submodules: 'true' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout: 90 + upload-artifact: android-apps script: | set -eux @@ -45,3 +47,44 @@ jobs: PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" # Build Android demo app bash build/test_android_ci.sh + + mkdir -p artifacts-to-be-uploaded + mkdir -p artifacts-to-be-uploaded/arm64-v8a/ + mkdir -p artifacts-to-be-uploaded/x86_64/ + # Copy the jar to S3 + cp extension/android/build/libs/executorch.jar artifacts-to-be-uploaded/ + # Copy the app to S3 + cp examples/demo-apps/android/LlamaDemo/app/build/outputs/apk/debug/*.apk artifacts-to-be-uploaded/ + # Also copy the libraries + cp cmake-out-android-arm64-v8a/lib/*.a artifacts-to-be-uploaded/arm64-v8a/ + cp cmake-out-android-arm64-v8a/extension/android/*.so artifacts-to-be-uploaded/arm64-v8a/ + cp cmake-out-android-x86_64/lib/*.a artifacts-to-be-uploaded/x86_64/ + cp cmake-out-android-x86_64/extension/android/*.so artifacts-to-be-uploaded/x86_64/ + + # Upload the app and its test suite to S3 so that they can be downloaded by the test job + upload-artifacts: + needs: test-demo-android + runs-on: linux.2xlarge + steps: + - name: Download the artifacts + uses: actions/download-artifact@v3 + with: + # The name here needs to match the name of the upload-artifact parameter + name: android-apps + path: ${{ runner.temp }}/artifacts/ + + - name: Verify the artifacts + shell: bash + working-directory: ${{ runner.temp }}/artifacts/ + run: | + ls -lah ./ + + - name: Upload the artifacts to S3 + uses: seemethere/upload-artifact-s3@v5 + with: + s3-bucket: gha-artifacts + s3-prefix: | + ${{ github.repository }}/${{ github.run_id }}/artifact + retention-days: 14 + if-no-files-found: ignore + path: ${{ runner.temp }}/artifacts/ diff --git a/.github/workflows/apple.yml b/.github/workflows/apple.yml index 06aa6a66e98..667ddb500d3 100644 --- a/.github/workflows/apple.yml +++ b/.github/workflows/apple.yml @@ -26,7 +26,7 @@ concurrency: jobs: test-demo-ios: name: test-demo-ios - uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + uses: pytorch/test-infra/.github/workflows/macos_job.yml@release/2.3 with: runner: macos-latest-xlarge python-version: '3.11' @@ -52,7 +52,7 @@ 
jobs: build-frameworks-ios: name: build-frameworks-ios - uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + uses: pytorch/test-infra/.github/workflows/macos_job.yml@release/2.3 with: runner: macos-latest-xlarge python-version: '3.11' @@ -64,7 +64,7 @@ jobs: WORKSPACE=$(pwd) pushd "${WORKSPACE}/pytorch/executorch" BUILD_TOOL=cmake - VERSION="0.1.0" + VERSION="0.2.0" FRAMEWORKS=( "executorch" "coreml_backend" @@ -137,8 +137,8 @@ jobs: # NB: The name here needs to match the upload-artifact name from build-frameworks-ios job name: executorch-frameworks-ios path: ${{ runner.temp }}/frameworks-ios/ - - name: Only push to S3 when running the workflow manually from main branch - if: ${{ github.event_name == 'workflow_dispatch' && github.ref == 'refs/heads/main' }} + - name: Only push to S3 when running the workflow manually from release/0.2 branch + if: ${{ github.event_name == 'workflow_dispatch' && github.ref == 'refs/heads/release/0.2' }} shell: bash run: | set -eux diff --git a/.github/workflows/build-wheels-linux.yml b/.github/workflows/build-wheels-linux.yml index a2f86b219f8..abe680f946e 100644 --- a/.github/workflows/build-wheels-linux.yml +++ b/.github/workflows/build-wheels-linux.yml @@ -19,12 +19,12 @@ on: jobs: generate-matrix: - uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main + uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@release/2.3 with: package-type: wheel os: linux test-infra-repository: pytorch/test-infra - test-infra-ref: main + test-infra-ref: release/2.3 with-cuda: disabled with-rocm: disabled @@ -43,13 +43,18 @@ jobs: smoke-test-script: build/packaging/smoke_test.py package-name: executorch name: ${{ matrix.repository }} - uses: pytorch/test-infra/.github/workflows/build_wheels_linux.yml@main + uses: pytorch/test-infra/.github/workflows/build_wheels_linux.yml@release/2.3 with: repository: ${{ matrix.repository }} ref: "" test-infra-repository: pytorch/test-infra - test-infra-ref: main + test-infra-ref: release/2.3 build-matrix: ${{ needs.generate-matrix.outputs.matrix }} + # ExecuTorch only needs the first layer of submodules; override the + # "recursive" default to do less work, and to give the buck daemon fewer + # files to look at. 
+ submodules: true + env-var-script: build/packaging/env_var_script_linux.sh pre-script: ${{ matrix.pre-script }} post-script: ${{ matrix.post-script }} package-name: ${{ matrix.package-name }} diff --git a/.github/workflows/build-wheels-m1.yml b/.github/workflows/build-wheels-m1.yml index dbc74433ff8..0fa451c378e 100644 --- a/.github/workflows/build-wheels-m1.yml +++ b/.github/workflows/build-wheels-m1.yml @@ -19,12 +19,12 @@ on: jobs: generate-matrix: - uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main + uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@release/2.3 with: package-type: wheel os: macos-arm64 test-infra-repository: pytorch/test-infra - test-infra-ref: main + test-infra-ref: release/2.3 with-cuda: disabled with-rocm: disabled @@ -43,13 +43,19 @@ jobs: smoke-test-script: build/packaging/smoke_test.py package-name: executorch name: ${{ matrix.repository }} - uses: pytorch/test-infra/.github/workflows/build_wheels_macos.yml@main + uses: pytorch/test-infra/.github/workflows/build_wheels_macos.yml@release/2.3 with: repository: ${{ matrix.repository }} ref: "" test-infra-repository: pytorch/test-infra - test-infra-ref: main + test-infra-ref: release/2.3 build-matrix: ${{ needs.generate-matrix.outputs.matrix }} + # ExecuTorch only needs the first layer of submodules; override the + # "recursive" default to do less work, and to give the buck daemon fewer + # files to look at. + submodules: true + delocate-wheel: false + env-var-script: build/packaging/env_var_script_m1.sh pre-script: ${{ matrix.pre-script }} post-script: ${{ matrix.post-script }} package-name: ${{ matrix.package-name }} diff --git a/.github/workflows/doc-build.yml b/.github/workflows/doc-build.yml index ee5cfb859b3..b243d4ffa02 100644 --- a/.github/workflows/doc-build.yml +++ b/.github/workflows/doc-build.yml @@ -14,7 +14,7 @@ on: jobs: build: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.3 strategy: matrix: include: @@ -46,13 +46,9 @@ jobs: # ET_VERSION_DOCS will be pulled during the doc build to add to the version dropdown # on the website. See docs/source/conf.py for details - REF_TYPE=${{ github.ref_type }} - REF_NAME=${{ github.ref_name }} - - echo "$REF_TYPE" - echo "$REF_NAME" - - ET_VERSION_DOCS="${REF_NAME}" + GITHUB_REF=${{ github.ref }} + echo "$GITHUB_REF" + ET_VERSION_DOCS="${GITHUB_REF}" echo "$ET_VERSION_DOCS" set -eux @@ -68,26 +64,24 @@ jobs: make html cd .. + # If it's main branch, add noindex tag to all .html files to exclude from Google Search indexing. + echo "GitHub Ref: ${GITHUB_REF}" + if [[ "${{ github.ref }}" == 'refs/heads/main' ]]; then + find docs/_build/html/ -name "*.html" -print0 | xargs -0 sed -i '//a \ \ '; + fi + cp -rf docs/_build/html/* "${RUNNER_DOCS_DIR}" mv docs/_build/html "${RUNNER_ARTIFACT_DIR}" ls -R "${RUNNER_ARTIFACT_DIR}"/*/*.html -# Enable preview later. 
Previews are available publicly -# -# upload-preview: -# if: github.repository == 'pytorch/executorch' && github.event_name == 'push' && -# (github.ref_type == 'branch' && github.ref_name == 'main') -# uses: pytorch/test-infra/.github/workflows/linux_job.yml@main - upload-gh-pages: needs: build - if: github.repository == 'pytorch/executorch' && github.event_name == 'push' && - ((github.ref_type == 'branch' && github.ref_name == 'main') || github.ref_type == 'tag') + if: github.repository == 'pytorch/executorch' && github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/v')) permissions: contents: write - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.3 with: repository: pytorch/executorch download-artifact: docs @@ -96,22 +90,17 @@ jobs: script: | set -euo pipefail - REF_TYPE=${{ github.ref_type }} - REF_NAME=${{ github.ref_name }} - - # If building for a release tag, branch, set the branch/tag name - # as the target folder in the gh-pages branch. The artifacts created - # during the build will be copied over to the target dir in the - # gh-pages branch. - if [[ "${REF_TYPE}" == branch ]]; then - TARGET_FOLDER="${REF_NAME}" - elif [[ "${REF_TYPE}" == tag ]]; then - # Strip the leading "v" as well as the trailing patch version and "-rc" suffix. - # For example: 'v0.1.2' -> '0.1' and 'v0.1.2-rc1' -> 0.1. - TARGET_FOLDER=$(echo "${REF_NAME}" | sed 's/^v//i; s/-rc[0-9]*$//; s/\.[0-9]*$//') + # Get github.ref for the output doc folder. By default "main" + # If matches a tag like refs/tags/v1.12.0-rc3 or + # refs/tags/v1.12.0 convert to 1.12 + GITHUB_REF=${{ github.ref }} + + # Convert refs/tags/v1.12.0rc3 into 1.12. + # Adopted from https://github.com/pytorch/pytorch/blob/main/.github/workflows/_docs.yml#L150C11-L155C13 + if [[ "${GITHUB_REF}" =~ ^refs/tags/v([0-9]+\\.[0-9]+)\\. ]]; then + TARGET_FOLDER="${BASH_REMATCH[1]}" else - echo "ERROR: Invalid REF_TYPE: ${REF_TYPE}. Expected 'branch' or 'tag'." - exit 1 + TARGET_FOLDER="main" fi echo "Target Folder: ${TARGET_FOLDER}" @@ -122,12 +111,6 @@ jobs: mv "${RUNNER_ARTIFACT_DIR}"/html/* "${TARGET_FOLDER}" git add "${TARGET_FOLDER}" || true - # If it's main branch, add noindex tag to all .html files to exclude from Google Search indexing. 
- if [[ "${REF_NAME}" == 'main' ]]; then - find "${TARGET_FOLDER}" -type f -name "*.html" -exec sed -i '//a ' {} \; - git add "${TARGET_FOLDER}"/**/*.html || true - fi - git config user.name 'pytorchbot' git config user.email 'soumith+bot@pytorch.org' git commit -m "Auto-generating sphinx docs" || true diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index f773f3aca88..6cf6e0495b3 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -50,7 +50,7 @@ jobs: mkdir "${GITHUB_WORKSPACE}" - name: Setup SSH (Click me for login details) - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.3 with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -58,11 +58,11 @@ jobs: uses: actions/checkout@v3 - name: Setup Linux - uses: pytorch/test-infra/.github/actions/setup-linux@main + uses: pytorch/test-infra/.github/actions/setup-linux@release/2.3 - name: Build docker image id: build-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.3 with: docker-image-name: ${{ matrix.docker-image-name }} always-rebuild: true @@ -70,5 +70,5 @@ jobs: force-push: true - name: Teardown Linux - uses: pytorch/test-infra/.github/actions/teardown-linux@main + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.3 if: always() diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 7cb2cf69b8b..a47f38d1b86 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -16,7 +16,7 @@ concurrency: jobs: lintrunner: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.3 with: runner: linux.2xlarge docker-image: executorch-ubuntu-22.04-linter diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index f8ffd41d214..efa3ed6f540 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -21,12 +21,12 @@ jobs: environment: ${{ (github.event_name == 'schedule') && 'update-commit-hash' || '' }} steps: - name: update-pytorch-commit-hash - uses: pytorch/test-infra/.github/actions/update-commit-hash@main + uses: pytorch/test-infra/.github/actions/update-commit-hash@release/2.3 if: ${{ github.event_name == 'schedule' }} with: repo-name: pytorch branch: main pin-folder: .ci/docker/ci_commit_pins - test-infra-ref: main + test-infra-ref: release/2.3 updatebot-token: ${{ secrets.UPDATEBOT_TOKEN }} pytorchbot-token: ${{ secrets.GH_PYTORCHBOT_TOKEN }} diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 9751b906cd8..6b3a25d89c8 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -33,7 +33,7 @@ jobs: test-setup-linux-gcc: name: test-setup-linux-gcc - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.3 strategy: matrix: include: @@ -58,7 +58,7 @@ jobs: test-models-linux: name: test-models-linux - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.3 needs: gather-models strategy: matrix: ${{ fromJSON(needs.gather-models.outputs.models) }} @@ -85,7 +85,7 @@ jobs: test-llama-runner-linux: name: test-llama-runner-linux - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: 
pytorch/test-infra/.github/workflows/linux_job.yml@release/2.3 strategy: matrix: dtype: [fp32] @@ -139,7 +139,7 @@ jobs: test-custom-ops-linux: name: test-custom-ops-linux - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.3 strategy: matrix: include: @@ -164,7 +164,7 @@ jobs: test-selective-build-linux: name: test-selective-build-linux - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.3 strategy: matrix: include: @@ -189,7 +189,7 @@ jobs: test-quantized-aot-lib-linux: name: test-quantized-aot-lib-linux - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.3 strategy: matrix: include: @@ -212,7 +212,7 @@ jobs: test-pybind-build-linux: name: test-pybind-build-linux - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.3 strategy: matrix: include: diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 16ed6a27577..a21e02a468c 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -34,7 +34,7 @@ jobs: test-models-macos: name: test-models-macos - uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + uses: pytorch/test-infra/.github/workflows/macos_job.yml@release/2.3 needs: gather-models strategy: matrix: ${{ fromJSON(needs.gather-models.outputs.models) }} @@ -63,7 +63,7 @@ jobs: test-custom-ops-macos: name: test-custom-ops-macos - uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + uses: pytorch/test-infra/.github/workflows/macos_job.yml@release/2.3 strategy: matrix: include: @@ -89,7 +89,7 @@ jobs: test-selective-build-macos: name: test-selective-build-macos - uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + uses: pytorch/test-infra/.github/workflows/macos_job.yml@release/2.3 strategy: matrix: include: @@ -115,7 +115,7 @@ jobs: test-demo-backend-delegation: name: test-demo-backend-delegation - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.3 strategy: matrix: include: @@ -139,7 +139,7 @@ jobs: test-arm-backend-delegation: name: test-arm-backend-delegation - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.3 with: runner: linux.2xlarge docker-image: executorch-ubuntu-22.04-arm-sdk @@ -169,7 +169,7 @@ jobs: test-arm-reference-delegation: name: test-arm-reference-delegation - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.3 with: runner: linux.2xlarge docker-image: executorch-ubuntu-22.04-arm-sdk @@ -200,7 +200,7 @@ jobs: test-coreml-delegate: name: test-coreml-delegate - uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + uses: pytorch/test-infra/.github/workflows/macos_job.yml@release/2.3 with: runner: macos-13-xlarge python-version: '3.11' @@ -222,7 +222,7 @@ jobs: test-pybind-build-macos: name: test-pybind-build-macos - uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + uses: pytorch/test-infra/.github/workflows/macos_job.yml@release/2.3 strategy: matrix: include: @@ -249,7 +249,7 @@ jobs: test-llama-runner-macos: name: test-llama-runner-mac - uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + uses: 
pytorch/test-infra/.github/workflows/macos_job.yml@release/2.3 strategy: matrix: dtype: [fp32] diff --git a/.github/workflows/update-viablestrict.yml b/.github/workflows/update-viablestrict.yml index 189a5cf3aa3..9bb89aa2be3 100644 --- a/.github/workflows/update-viablestrict.yml +++ b/.github/workflows/update-viablestrict.yml @@ -16,7 +16,7 @@ jobs: environment: ${{ (github.event_name == 'schedule') && 'update-viable-strict' || '' }} steps: - name: Update viable/strict - uses: pytorch/test-infra/.github/actions/update-viablestrict@main + uses: pytorch/test-infra/.github/actions/update-viablestrict@release/2.3 with: repository: pytorch/executorch stable-branch: viable/strict diff --git a/.gitignore b/.gitignore index 6661daed13e..26a46f23f62 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ buck-out/ cmake-out/ cmake-android-out/ +cmake-out-android/ cmake-ios-out/ ethos-u-scratch/ executorch.egg-info diff --git a/.swift/coreml_backend/dummy.swift b/.swift/coreml_backend/dummy.swift deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/.swift/coreml_backend_debug/dummy.swift b/.swift/coreml_backend_debug/dummy.swift deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/.swift/custom_backend/dummy.swift b/.swift/custom_backend/dummy.swift deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/.swift/custom_backend_debug/dummy.swift b/.swift/custom_backend_debug/dummy.swift deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/.swift/executorch/dummy.swift b/.swift/executorch/dummy.swift deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/.swift/executorch_debug/dummy.swift b/.swift/executorch_debug/dummy.swift deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/.swift/mps_backend/dummy.swift b/.swift/mps_backend/dummy.swift deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/.swift/mps_backend_debug/dummy.swift b/.swift/mps_backend_debug/dummy.swift deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/.swift/optimized_backend/dummy.swift b/.swift/optimized_backend/dummy.swift deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/.swift/optimized_backend_debug/dummy.swift b/.swift/optimized_backend_debug/dummy.swift deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/.swift/portable_backend/dummy.swift b/.swift/portable_backend/dummy.swift deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/.swift/portable_backend_debug/dummy.swift b/.swift/portable_backend_debug/dummy.swift deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/.swift/quantized_backend/dummy.swift b/.swift/quantized_backend/dummy.swift deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/.swift/quantized_backend_debug/dummy.swift b/.swift/quantized_backend_debug/dummy.swift deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/.swift/xnnpack_backend/dummy.swift b/.swift/xnnpack_backend/dummy.swift deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/.swift/xnnpack_backend_debug/dummy.swift b/.swift/xnnpack_backend_debug/dummy.swift deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/CMakeLists.txt b/CMakeLists.txt index 46b73f63492..e8f4c93a808 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -352,23 +352,27 @@ add_subdirectory(schema) # Only contains primitive operators; does not contain portable kernels or other # full operators. Does not contain any backends. 
# - -add_library(executorch ${_executorch__srcs}) -target_link_libraries(executorch PRIVATE program_schema) -target_link_options_shared_lib(executorch) +add_library(executorch_no_prim_ops ${_executorch_no_prim_ops__srcs}) +target_link_libraries(executorch_no_prim_ops PRIVATE program_schema) # Check if dl exists for this toolchain and only then link it. find_library(DL_LIBRARY_EXISTS NAMES dl) # Check if the library was found if(DL_LIBRARY_EXISTS) - target_link_libraries(executorch PRIVATE dl) # For dladdr() + target_link_libraries(executorch_no_prim_ops PRIVATE dl) # For dladdr() endif() -target_include_directories(executorch PUBLIC ${_common_include_directories}) -target_compile_options(executorch PUBLIC ${_common_compile_options}) +target_include_directories(executorch_no_prim_ops PUBLIC ${_common_include_directories}) +target_compile_options(executorch_no_prim_ops PUBLIC ${_common_compile_options}) if(MAX_KERNEL_NUM) - target_compile_definitions(executorch + target_compile_definitions(executorch_no_prim_ops PRIVATE MAX_KERNEL_NUM=${MAX_KERNEL_NUM}) endif() +add_library(executorch ${_executorch__srcs}) +target_link_libraries(executorch PRIVATE executorch_no_prim_ops) +target_include_directories(executorch PUBLIC ${_common_include_directories}) +target_compile_options(executorch PUBLIC ${_common_compile_options}) +target_link_options_shared_lib(executorch) + # # portable_ops_lib: A library to register core ATen ops using portable kernels, # see kernels/portable/CMakeLists.txt. @@ -406,7 +410,7 @@ endif() # Install `executorch` library as well as `executorch-config.cmake` under # ${CMAKE_INSTALL_PREFIX}/ install( - TARGETS executorch + TARGETS executorch executorch_no_prim_ops DESTINATION lib INCLUDES DESTINATION ${_common_include_directories}) @@ -523,10 +527,19 @@ if(EXECUTORCH_BUILD_PYBIND) find_library(TORCH_PYTHON_LIBRARY torch_python PATHS "${TORCH_INSTALL_PREFIX}/lib") + # TODO(larryliu): Fix macOS 2 dylibs having 2 sets of static variables issue + if(EXECUTORCH_BUILD_CUSTOM_OPS_AOT AND NOT APPLE) + list(APPEND _dep_libs custom_ops_aot_lib) + endif() # compile options for pybind - - set(_pybind_compile_options -Wno-deprecated-declarations -fPIC -frtti - -fexceptions) + set(_pybind_compile_options + -Wno-deprecated-declarations + -fPIC + -frtti + -fexceptions + # libtorch is built with the old ABI, so we need to do the same for any + # .cpp files that include torch, c10, or ATen targets. + -D_GLIBCXX_USE_CXX11_ABI=0) # util lib add_library( util @@ -540,8 +553,11 @@ if(EXECUTORCH_BUILD_PYBIND) # pybind portable_lib pybind11_add_module(portable_lib extension/pybindings/pybindings.cpp) + # The actual output file needs a leading underscore so it can coexist with + # portable_lib.py in the same python package. + set_target_properties(portable_lib PROPERTIES OUTPUT_NAME "_portable_lib") target_compile_definitions(portable_lib - PUBLIC EXECUTORCH_PYTHON_MODULE_NAME=portable_lib) + PUBLIC EXECUTORCH_PYTHON_MODULE_NAME=_portable_lib) target_include_directories(portable_lib PRIVATE ${TORCH_INCLUDE_DIRS}) target_compile_options(portable_lib PUBLIC ${_pybind_compile_options}) target_link_libraries( @@ -557,6 +573,24 @@ if(EXECUTORCH_BUILD_PYBIND) ${PYBIND_LINK_COREML} ${PYBIND_LINK_MPS} ${PYBIND_LINK_XNNPACK}) + if(APPLE) + # pip wheels will need to be able to find the torch libraries. On Linux, the + # .so has non-absolute dependencies on libs like "libtorch.so" without + # paths; as long as we `import torch` first, those dependencies will work. 
+ # But Apple dylibs do not support non-absolute dependencies, so we need to + # tell the loader where to look for its libraries. The LC_LOAD_DYLIB entries + # for the torch libraries will look like "@rpath/libtorch.dylib", so we can + # add an LC_RPATH entry to look in a directory relative to the installed + # location of our _portable_lib.so file. To see these LC_* values, run + # `otool -l _portable_lib*.so`. + set_target_properties( + portable_lib + PROPERTIES # Assume that this library will be installed in + # `site-packages/executorch/extension/pybindings`, and that + # the torch libs are in `site-packages/torch/lib`. + BUILD_RPATH "@loader_path/../../../torch/lib" + INSTALL_RPATH "@loader_path/../../../torch/lib") + endif() install(TARGETS portable_lib LIBRARY DESTINATION executorch/extension/pybindings) diff --git a/Package.swift b/Package.swift deleted file mode 100644 index b0dfec174f2..00000000000 --- a/Package.swift +++ /dev/null @@ -1,101 +0,0 @@ -// swift-tools-version:5.9 -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -import PackageDescription - -let version = "0.1.0" -let url = "https://ossci-ios.s3.amazonaws.com/executorch/" -let debug = "_debug" -let deliverables = [ - "coreml_backend": [ - "sha256": "5bfa35cb5143b4af6840e0e5dd2d40bce93dff331b8eb5798a46274239391a5d", - "sha256" + debug: "1422019da9000f8ff7be597de9e0e3b2482f99cdaa75c2d179835778647be1a6", - "frameworks": [ - "Accelerate", - "CoreML", - ], - "libraries": [ - "sqlite3", - ], - ], - "custom_backend": [ - "sha256": "2201a61eaf7e06e1937cb73a469fb36cabc219496ba004b85feb2cc7c10f300d", - "sha256" + debug: "3eb6eb97bf0641d2305b0f50ff05a8862d7d65e2491cf4aa05ef1d108649f07c", - ], - "executorch": [ - "sha256": "2b55cbcff845ab9eaf16a21e520546b2975ef8c55b9e3fbbcc0c375334e40c6f", - "sha256" + debug: "12933cedff6cf21c9d21668779f8d8af8049646fe7d290787b12227ff7abe4a7", - ], - "mps_backend": [ - "sha256": "510d708361b6ea0692ce5aeb638725d6275824b37bbe744aa876fda24cc2bbbf", - "sha256" + debug: "6a67ba0bf8033f17bd66acb222446df51cd1304e24a4fb2c6d97e15a30fb24f0", - "frameworks": [ - "Metal", - "MetalPerformanceShaders", - "MetalPerformanceShadersGraph", - ], - ], - "optimized_backend": [ - "sha256": "50aaa54901a7cee1059e71cc623f054610406d65ba8fd6edb10b45861be67237", - "sha256" + debug: "3f43f465727c8705432f4bb69260cc9501c519e5da006fc19ee2ab2ea260d1f0", - ], - "portable_backend": [ - "sha256": "964238e92828665aa598c05b2264faab91fb13ce0f42633cc7d5653300af3e9b", - "sha256" + debug: "d6d85304a4b40f13c9b893e8c264ebdb15307cacf8997494b3818a52e4914b28", - ], - "quantized_backend": [ - "sha256": "37d31a319f92e26bab2b7ec5e783a8b14457dee0a4638dcdca1d9e17539ee3fb", - "sha256" + debug: "6b45f66f60f6106a41e191418c970bf7b0605df73b9815a06441a5f0809b54e6", - ], - "xnnpack_backend": [ - "sha256": "03d506243c392e872519ae1335a025ef202319c1db339a753f9d7d74cba226f0", - "sha256" + debug: "3341e89abc99552a6a5bad360003baed194a83e865338bc07afe9e4f171ea169", - ], -].reduce(into: [String: [String: Any]]()) { - $0[$1.key] = $1.value - $0[$1.key + debug] = $1.value -} -.reduce(into: [String: [String: Any]]()) { - var newValue = $1.value - if $1.key.hasSuffix(debug) { - $1.value.forEach { key, value in - if key.hasSuffix(debug) { - newValue[String(key.dropLast(debug.count))] = value - } - } - } - $0[$1.key] = newValue.filter { key, _ in !key.hasSuffix(debug) } -} - -let package = 
Package( - name: "executorch", - platforms: [ - .iOS(.v15), - ], - products: deliverables.keys.map { key in - .library(name: key, targets: ["\(key)_dependencies"]) - }.sorted { $0.name < $1.name }, - targets: deliverables.flatMap { key, value -> [Target] in - [ - .binaryTarget( - name: key, - url: "\(url)\(key)-\(version).zip", - checksum: value["sha256"] as? String ?? "" - ), - .target( - name: "\(key)_dependencies", - dependencies: [.target(name: key)], - path: ".swift/\(key)", - linkerSettings: - (value["frameworks"] as? [String] ?? []).map { .linkedFramework($0) } + - (value["libraries"] as? [String] ?? []).map { .linkedLibrary($0) } - ), - ] - } -) diff --git a/README-wheel.md b/README-wheel.md new file mode 100644 index 00000000000..ebbaab90a87 --- /dev/null +++ b/README-wheel.md @@ -0,0 +1,39 @@ +**ExecuTorch** is a [PyTorch](https://pytorch.org/) platform that provides +infrastructure to run PyTorch programs everywhere from AR/VR wearables to +standard on-device iOS and Android mobile deployments. One of the main goals for +ExecuTorch is to enable wider customization and deployment capabilities of the +PyTorch programs. + +The `executorch` pip package is in alpha. +* Required python version: `==3.10` +* Compatible systems: Linux x86_64, macOS aarch64 + +The prebuilt `executorch.extension.pybindings.portable_lib` module included in +this package provides a way to run ExecuTorch `.pte` files, with some +restrictions: +* Only [core ATen + operators](https://pytorch.org/executorch/stable/ir-ops-set-definition.html) + are linked into the prebuilt module +* Only the [XNNPACK backend + delegate](https://pytorch.org/executorch/main/native-delegates-executorch-xnnpack-delegate.html) + is linked into the prebuilt module + +Please visit the [ExecuTorch website](https://pytorch.org/executorch/) for +tutorials and documentation. Here are some starting points: +* [Getting + Started](https://pytorch.org/executorch/stable/getting-started-setup.html) + * Set up the ExecuTorch environment and run PyTorch models locally. +* [Working with + local LLMs](https://pytorch.org/executorch/stable/llm/getting-started.html) + * Learn how to use ExecuTorch to export and accelerate a large-language model + from scratch. +* [Exporting to + ExecuTorch](https://pytorch.org/executorch/main/tutorials/export-to-executorch-tutorial.html) + * Learn the fundamentals of exporting a PyTorch `nn.Module` to ExecuTorch, and + optimizing its performance using quantization and hardware delegation. +* Running LLaMA on + [iOS](https://pytorch.org/executorch/stable/llm/llama-demo-ios.html) and + [Android](https://pytorch.org/executorch/stable/llm/llama-demo-android.html) + devices. + * Build and run LLaMA in a demo mobile app, and learn how to integrate models + with your own apps. diff --git a/backends/apple/coreml/CMakeLists.txt b/backends/apple/coreml/CMakeLists.txt index f1c19d00ee8..b3d0182999a 100644 --- a/backends/apple/coreml/CMakeLists.txt +++ b/backends/apple/coreml/CMakeLists.txt @@ -13,6 +13,8 @@ if(NOT EXECUTORCH_ROOT) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) endif() +option(COREML_BUILD_EXECUTOR_RUNNER "Build CoreML executor runner." 
OFF) + # inmemoryfs sources set(INMEMORYFS_SOURCES runtime/inmemoryfs/inmemory_filesystem.cpp @@ -144,7 +146,7 @@ target_include_directories( ) target_link_libraries( coremldelegate PRIVATE - executorch + executorch_no_prim_ops ) if(EXECUTORCH_BUILD_SDK) @@ -174,18 +176,26 @@ find_library(SQLITE_LIBRARY sqlite3) target_link_libraries(coremldelegate PRIVATE - executorch + executorch_no_prim_ops ${ACCELERATE_FRAMEWORK} ${COREML_FRAMEWORK} ${FOUNDATION_FRAMEWORK} ${SQLITE_LIBRARY} ) +if(COREML_BUILD_EXECUTOR_RUNNER) +target_link_libraries(coremldelegate + PRIVATE + portable_ops_lib + portable_kernels +) +endif() + target_compile_options(coremldelegate PRIVATE "-fobjc-arc") target_compile_options(coremldelegate PRIVATE "-fno-exceptions") if(EXECUTORCH_BUILD_SDK) -target_compile_options(executorch PUBLIC -DET_EVENT_TRACER_ENABLED) +target_compile_options(executorch_no_prim_ops PUBLIC -DET_EVENT_TRACER_ENABLED) target_compile_options(coremldelegate PRIVATE "-frtti") target_compile_options(libprotobuf-lite PRIVATE "-frtti") else() diff --git a/backends/apple/coreml/README.md b/backends/apple/coreml/README.md index 1710860f87e..4a21d8d8ae1 100644 --- a/backends/apple/coreml/README.md +++ b/backends/apple/coreml/README.md @@ -6,54 +6,123 @@ Core ML is an optimized framework for running machine learning models on Apple d ## Layout - `compiler/` : Lowers a module to Core ML backend. +- `partition/`: Partitions a module fully or partially to Core ML backend. +- `quantizer/`: Quantizes a module in Core ML favored scheme. - `scripts/` : Scripts for installing dependencies and running tests. - `runtime/`: Core ML delegate runtime implementation. - `inmemoryfs`: InMemory filesystem implementation used to serialize/de-serialize AOT blob. - `kvstore`: Persistent Key-Value store implementation. - `delegate`: Runtime implementation. - `include` : Public headers. - - `tests` : Tests for Core ML delegate. - - `workspace` : Xcode workspace for tests. + - `sdk` : SDK implementation. + - `tests` : Unit tests. + - `workspace` : Xcode workspace for the runtime. - `third-party/`: External dependencies. -## Help & Improvements -If you have problems or questions or have suggestions for ways to make -implementation and testing better, please create an issue on [github](https://www.github.com/pytorch/executorch/issues). +## Partition and Delegation -## Delegation - -For delegating the Program to the **Core ML** backend, the client must be responsible for calling `to_backend` with the **CoreMLBackend** tag. +To delegate a Program to the **Core ML** backend, the client must call `to_backend` with the **CoreMLPartitioner**. 
```python -import executorch.exir as exir import torch - -from executorch.exir.backend.backend_api import to_backend +import executorch.exir from executorch.backends.apple.coreml.compiler import CoreMLBackend +from executorch.backends.apple.coreml.partition.coreml_partitioner import CoreMLPartitioner -class LowerableSubModel(torch.nn.Module): +class Model(torch.nn.Module): def __init__(self): super().__init__() def forward(self, x): return torch.sin(x) -# Convert the lowerable module to Edge IR Representation -to_be_lowered = LowerableSubModel() -example_input = (torch.ones(1), ) -to_be_lowered_exir_submodule = exir.capture(to_be_lowered, example_input).to_edge() +source_model = Model() +example_inputs = (torch.ones(1), ) -# Lower to Core ML backend -lowered_module = to_backend('CoreMLBackend', to_be_lowered_exir_submodule.exported_program, []) +# Export the source model to Edge IR representation +aten_program = torch.export.export(source_model, example_inputs) +edge_program_manager = executorch.exir.to_edge(aten_program) + +# Delegate to Core ML backend +delegated_program_manager = edge_program_manager.to_backend(CoreMLPartitioner()) + +# Serialize delegated program +executorch_program = delegated_program_manager.to_executorch() +with open("model.pte", "wb") as f: + f.write(executorch_program.buffer) ``` -Currently, the **Core ML** backend delegates the whole module to **Core ML**. If a specific op is not supported by the **Core ML** backend then the `to_backend` call would throw an exception. We will be adding a **Core ML Partitioner** to resolve the issue. +The module will be fully or partially delegated to **Core ML**, depending on whether all or only some of its ops are supported by the **Core ML** backend. Users may force the partitioner to skip certain ops by passing `CoreMLPartitioner(skip_ops_for_coreml_delegation=...)`. + +The `to_backend` implementation is a thin wrapper over [coremltools](https://apple.github.io/coremltools/docs-guides/); `coremltools` is responsible for converting an **ExportedProgram** to an **MLModel**. The converted **MLModel** data is saved, flattened, and returned as bytes to **ExecuTorch**. -The `to_backend` implementation is a thin wrapper over `coremltools`, `coremltools` is responsible for converting an **ExportedProgram** to a **MLModel**. The converted **MLModel** data is saved, flattened, and returned as bytes to **ExecuTorch**. +## Quantization + +To quantize a Program in a Core ML-favored way, the client may use **CoreMLQuantizer**. 
+ +```python +import torch +import executorch.exir + +from torch._export import capture_pre_autograd_graph +from torch.ao.quantization.quantize_pt2e import ( + convert_pt2e, + prepare_pt2e, + prepare_qat_pt2e, +) + +from executorch.backends.apple.coreml.quantizer.coreml_quantizer import CoreMLQuantizer +from coremltools.optimize.torch.quantization.quantization_config import ( + LinearQuantizerConfig, + QuantizationScheme, +) + +class Model(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.conv = torch.nn.Conv2d( + in_channels=3, out_channels=16, kernel_size=3, padding=1 + ) + self.relu = torch.nn.ReLU() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + a = self.conv(x) + return self.relu(a) + +source_model = Model() +example_inputs = (torch.randn((1, 3, 256, 256)), ) + +pre_autograd_aten_dialect = capture_pre_autograd_graph(source_model, example_inputs) + +quantization_config = LinearQuantizerConfig.from_dict( + { + "global_config": { + "quantization_scheme": QuantizationScheme.symmetric, + "activation_dtype": torch.uint8, + "weight_dtype": torch.int8, + "weight_per_channel": True, + } + } +) +quantizer = CoreMLQuantizer(quantization_config) + +# For post-training quantization, use `prepare_pt2e` +# For quantization-aware training, use `prepare_qat_pt2e` +prepared_graph = prepare_pt2e(pre_autograd_aten_dialect, quantizer) + +prepared_graph(*example_inputs) +converted_graph = convert_pt2e(prepared_graph) +``` + +The `converted_graph` is the quantized torch model, and can be delegated to **Core ML** in the same way through **CoreMLPartitioner**. ## Runtime -To execute a **Core ML** delegated **Program**, the client must link to the `coremldelegate` library. Once linked there are no additional steps required, **ExecuTorch** when running the **Program** would call the **Core ML** runtime to execute the **Core ML** delegated part of the **Program**. +To execute a Core ML delegated program, the application must link to the `coremldelegate` library. Once linked, no additional steps are required; when running the program, ExecuTorch calls the Core ML runtime to execute the Core ML delegated part of the program. Please follow the instructions described in the [Core ML setup](/backends/apple/coreml/setup.md) to link the `coremldelegate` library. + +## Help & Improvements +If you have problems or questions or have suggestions for ways to make +implementation and testing better, please create an issue on [github](https://www.github.com/pytorch/executorch/issues). diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLAssetManager.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLAssetManager.mm index da399e80d54..6fe37925d27 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLAssetManager.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLAssetManager.mm @@ -630,7 +630,7 @@ - (NSUInteger)_compact:(NSUInteger)sizeInBytes error:(NSError * __autoreleasing } if (_estimatedSizeInBytes <= sizeInBytes) { - return YES; + return _estimatedSizeInBytes; } std::error_code ec; diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.h b/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.h index eab239b496c..78c76fadd04 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.h +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.h @@ -27,6 +27,9 @@ __attribute__((objc_subclassing_restricted)) @interface ETCoreMLDefaultModelExec /// The model. 
@property (readonly, strong, nonatomic) ETCoreMLModel* model; +/// If set to `YES` then output backing are ignored. +@property (readwrite, atomic) BOOL ignoreOutputBackings; + @end NS_ASSUME_NONNULL_END diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.mm index 399c91bd495..57316e28015 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.mm @@ -26,6 +26,9 @@ - (instancetype)initWithModel:(ETCoreMLModel *)model { loggingOptions:(const executorchcoreml::ModelLoggingOptions& __unused)loggingOptions eventLogger:(const executorchcoreml::ModelEventLogger* _Nullable __unused)eventLogger error:(NSError * __autoreleasing *)error { + if (self.ignoreOutputBackings) { + predictionOptions.outputBackings = @{}; + } id outputs = [self.model.mlModel predictionFromFeatures:inputs options:predictionOptions error:error]; diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLLogging.h b/backends/apple/coreml/runtime/delegate/ETCoreMLLogging.h index 1a1b10848bb..d9c4d4ef638 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLLogging.h +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLLogging.h @@ -7,6 +7,7 @@ #import +#import #import NS_ASSUME_NONNULL_BEGIN @@ -48,7 +49,11 @@ typedef NS_ERROR_ENUM(ETCoreMLErrorDomain, ETCoreMLError) { /// Record the error with `os_log_error` and fills `*errorOut` with `NSError`. #define ETCoreMLLogErrorAndSetNSError(errorOut, errorCode, formatString, ...) \ - os_log_error(ETCoreMLErrorUtils.loggingChannel, formatString, ##__VA_ARGS__); \ + if (ET_LOG_ENABLED) { \ + ET_LOG(Error, "%s", [NSString stringWithFormat:@formatString, ##__VA_ARGS__].UTF8String); \ + } else { \ + os_log_error(ETCoreMLErrorUtils.loggingChannel, formatString, ##__VA_ARGS__); \ + } \ if (errorOut) { \ *errorOut = \ [NSError errorWithDomain:ETCoreMLErrorDomain \ @@ -58,24 +63,31 @@ typedef NS_ERROR_ENUM(ETCoreMLErrorDomain, ETCoreMLError) { }]; \ } -/// Record the error and its underlying error with `os_log_error` and fills -/// `*errorOut` with NSError. +/// Record the error and its underlying error with `os_log_error` and fills `*errorOut` with `NSError`. #define ETCoreMLLogUnderlyingErrorAndSetNSError(errorOut, errorCode, underlyingNSError, formatString, ...) \ - os_log_error(ETCoreMLErrorUtils.loggingChannel, \ - formatString ", with underlying error= %@.", \ - ##__VA_ARGS__, \ - (underlyingNSError).localizedDescription); \ + if (ET_LOG_ENABLED) { \ + ET_LOG(Error, "%s", [NSString stringWithFormat:@formatString, ##__VA_ARGS__].UTF8String); \ + } else { \ + os_log_error(ETCoreMLErrorUtils.loggingChannel, \ + formatString ", with underlying error= %@.", \ + ##__VA_ARGS__, \ + (underlyingNSError).localizedDescription); \ + } \ if (errorOut) { \ *errorOut = [ETCoreMLErrorUtils errorWithCode:errorCode \ underlyingError:underlyingNSError \ format:@formatString, ##__VA_ARGS__]; \ } -#define ETCoreMLLogError(error, formatString, ...) \ - os_log_error(ETCoreMLErrorUtils.loggingChannel, \ - formatString ", with error= %@.", \ - ##__VA_ARGS__, \ - (error).localizedDescription); +#define ETCoreMLLogError(error, formatString, ...) 
\ + if (ET_LOG_ENABLED) { \ + ET_LOG(Error, "%s", [NSString stringWithFormat:@formatString, ##__VA_ARGS__].UTF8String); \ + } else { \ + os_log_error(ETCoreMLErrorUtils.loggingChannel, \ + formatString ", with error= %@.", \ + ##__VA_ARGS__, \ + (error).localizedDescription); \ + } #pragma clang diagnostic pop diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModel.h b/backends/apple/coreml/runtime/delegate/ETCoreMLModel.h index 0f8a440c858..14c90694464 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLModel.h +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModel.h @@ -6,13 +6,18 @@ // Please refer to the license found in the LICENSE file in the root directory of the source tree. #import +#import NS_ASSUME_NONNULL_BEGIN @class ETCoreMLAsset; +namespace executorchcoreml { +class MultiArray; +} + /// Represents a ML model, the class is a thin wrapper over `MLModel` with additional properties. -@interface ETCoreMLModel : NSObject +__attribute__((objc_subclassing_restricted)) @interface ETCoreMLModel : NSObject - (instancetype)init NS_UNAVAILABLE; @@ -31,6 +36,12 @@ NS_ASSUME_NONNULL_BEGIN orderedOutputNames:(NSOrderedSet*)orderedOutputNames error:(NSError* __autoreleasing*)error NS_DESIGNATED_INITIALIZER; +- (nullable NSArray*)prepareInputs:(const std::vector&)inputs + error:(NSError* __autoreleasing*)error; + +- (nullable NSArray*)prepareOutputBackings:(const std::vector&)outputs + error:(NSError* __autoreleasing*)error; + /// The underlying MLModel. @property (strong, readonly, nonatomic) MLModel* mlModel; diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModel.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLModel.mm index 791fb7c03b6..ee7218bd271 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLModel.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModel.mm @@ -8,6 +8,164 @@ #import #import +#import +#import +#import +#import + +#pragma mark - ETCoreMLMultiArrayDescriptor +__attribute__((objc_subclassing_restricted)) +@interface ETCoreMLMultiArrayDescriptor: NSObject + +- (instancetype)init NS_UNAVAILABLE; + ++ (instancetype)new NS_UNAVAILABLE; + +- (instancetype)initWithShape:(NSArray *)shape + dataType:(MLMultiArrayDataType)dataType NS_DESIGNATED_INITIALIZER; + +@property (copy, readonly, nonatomic) NSArray *shape; + +@property (assign, readonly, nonatomic) MLMultiArrayDataType dataType; + +@end + +@implementation ETCoreMLMultiArrayDescriptor + +- (instancetype)initWithShape:(NSArray *)shape + dataType:(MLMultiArrayDataType)dataType { + self = [super init]; + if (self) { + _shape = shape; + _dataType = dataType; + } + + return self; +} + +- (BOOL)isEqual:(id)object { + if (object == self) { + return YES; + } + + if (![object isKindOfClass:self.class]) { + return NO; + } + + ETCoreMLMultiArrayDescriptor *other = (ETCoreMLMultiArrayDescriptor *)object; + return [self.shape isEqualToArray:other.shape] && self.dataType == other.dataType; +} + +- (NSUInteger)hash { + return [self.shape hash] ^ (NSUInteger)self.dataType; +} + +- (instancetype)copyWithZone:(NSZone *)zone { + return [[ETCoreMLMultiArrayDescriptor allocWithZone:zone] initWithShape:self.shape + dataType:self.dataType]; +} + +@end + +namespace { + +using namespace executorchcoreml; + +size_t get_number_of_bytes(MLMultiArrayDataType data_type) { + switch (data_type) { + case MLMultiArrayDataTypeFloat16: { + return 2; + } + case MLMultiArrayDataTypeFloat32: { + return 4; + } + case MLMultiArrayDataTypeInt32: { + return 4; + } + case MLMultiArrayDataTypeFloat64: { + return 8; + 
} + default: { + return 0; + } + } +} + +std::vector calculate_strides(const std::vector& shape) { + if (shape.size() == 0) { + return {}; + } + + if (shape.size() == 1) { + return {1}; + } + + std::vector strides(shape.size(), 1); + size_t product = 1; + for (size_t i = shape.size(); i > 0; i--) { + strides[i - 1] = product; + product *= shape[i - 1]; + } + + return strides; +} + +MLMultiArray * _Nullable make_ml_multi_array(const std::vector& shape, + MLMultiArrayDataType dataType, + NSCache *cache, + NSError * __autoreleasing *error) { + ETCoreMLMultiArrayDescriptor *descriptor = [[ETCoreMLMultiArrayDescriptor alloc] initWithShape:to_array(shape) + dataType:dataType]; + // Check the cache first otherwise allocate a new backing storage. + NSMutableData *backing_storage = [cache objectForKey:descriptor]; + if (backing_storage) { + [cache removeObjectForKey:descriptor]; + } else { + size_t n = std::accumulate(shape.cbegin(), shape.cend(), 1, std::multiplies{}); + backing_storage = [[NSMutableData alloc] initWithLength:n * get_number_of_bytes(dataType)]; + } + + __weak NSCache *weakCache = cache; + // Add the storage back to the cache when it gets deallocated, the next prediction would use the same storage. + MLMultiArray *result = [[MLMultiArray alloc] initWithDataPointer:backing_storage.mutableBytes + shape:descriptor.shape + dataType:descriptor.dataType + strides:to_array(calculate_strides(shape)) + deallocator:^(void * _Nonnull bytes) {[weakCache setObject:backing_storage forKey:descriptor];} + error:error]; + + return result; +} + +NSDictionary * +get_multi_array_constraints_by_name(NSDictionary *feature_descriptions) { + NSMutableDictionary *result = [NSMutableDictionary dictionaryWithCapacity:feature_descriptions.count]; + [feature_descriptions enumerateKeysAndObjectsUsingBlock:^(NSString *key, MLFeatureDescription *description, BOOL * _Nonnull stop) { + result[key] = description.multiArrayConstraint; + }]; + + return result; +} + +NSDictionary *get_multi_array_input_constraints_by_name(MLModelDescription *description) { + return get_multi_array_constraints_by_name(description.inputDescriptionsByName); +} + +NSDictionary *get_multi_array_output_constraints_by_name(MLModelDescription *description) { + return get_multi_array_constraints_by_name(description.outputDescriptionsByName); +} + +} + +#pragma mark - ETCoreMLModel +@interface ETCoreMLModel () + +@property (strong, readonly, nonatomic) NSCache *cache; +@property (copy, readonly, nonatomic) NSDictionary *inputConstraintsByName; +@property (copy, readonly, nonatomic) NSDictionary *outputConstraintsByName; + +@end + @implementation ETCoreMLModel @@ -33,8 +191,11 @@ - (nullable instancetype)initWithAsset:(ETCoreMLAsset *)asset _asset = asset; _orderedInputNames = [orderedInputNames copy]; _orderedOutputNames = [orderedOutputNames copy]; + _cache = [[NSCache alloc] init]; + _inputConstraintsByName = get_multi_array_input_constraints_by_name(mlModel.modelDescription); + _outputConstraintsByName = get_multi_array_output_constraints_by_name(mlModel.modelDescription); } - + return self; } @@ -42,4 +203,73 @@ - (NSString *)identifier { return self.asset.identifier; } +- (nullable NSArray *)prepareArgs:(const std::vector&)args + argNames:(NSOrderedSet *)argNames + argConstraintsByName:(NSDictionary *)argConstraintsByName + copyData:(const BOOL)copyData + error:(NSError * __autoreleasing *)error { + NSEnumerator *nameEnumerator = [argNames objectEnumerator]; + NSMutableArray *result = [NSMutableArray arrayWithCapacity:args.size()]; + for 
(const auto& arg : args) { + BOOL lCopyData = copyData; + NSString *argName = [nameEnumerator nextObject]; + MLMultiArrayConstraint *constraint = argConstraintsByName[argName]; + const auto& layout = arg.layout(); + auto dataType = to_ml_multiarray_data_type(layout.dataType()); + MLMultiArray *multiArrayArg = nil; + if (dataType == constraint.dataType) { + // We can use the same data storage. + multiArrayArg = [[MLMultiArray alloc] initWithDataPointer:arg.data() + shape:to_array(layout.shape()) + dataType:constraint.dataType + strides:to_array(layout.strides()) + deallocator:^(void * _Nonnull bytes) {} + error:error]; + lCopyData = NO; + } else { + // We can't use the same data storage, data types are not the same. + multiArrayArg = ::make_ml_multi_array(layout.shape(), constraint.dataType, self.cache, error); + } + + if (!multiArrayArg) { + return nil; + } + + if (multiArrayArg && lCopyData) { + [multiArrayArg getMutableBytesWithHandler:^(void *_Nonnull mutableBytes, + NSInteger __unused size, + NSArray *strides) { + MultiArray buffer(mutableBytes, MultiArray::MemoryLayout(to_multiarray_data_type(constraint.dataType).value(), + layout.shape(), + to_vector(strides))); + arg.copy(buffer); + }]; + } + + [result addObject:multiArrayArg]; + } + + return result; +} + +- (nullable NSArray *)prepareInputs:(const std::vector&)inputs + error:(NSError * __autoreleasing *)error { + return [self prepareArgs:inputs + argNames:self.orderedInputNames + argConstraintsByName:self.inputConstraintsByName + copyData:YES + error:error]; + +} + +- (nullable NSArray *)prepareOutputBackings:(const std::vector&)outputs + error:(NSError * __autoreleasing *)error { + return [self prepareArgs:outputs + argNames:self.orderedOutputNames + argConstraintsByName:self.outputConstraintsByName + copyData:NO + error:error]; + +} + @end diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModelExecutor.h b/backends/apple/coreml/runtime/delegate/ETCoreMLModelExecutor.h index e6e329c9ddd..2f1b22f456b 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLModelExecutor.h +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModelExecutor.h @@ -35,6 +35,9 @@ NS_ASSUME_NONNULL_BEGIN /// The model. @property (readonly, strong, nonatomic) ETCoreMLModel* model; +/// If set to `YES` then output backing are ignored. +@property (readwrite, atomic) BOOL ignoreOutputBackings; + @end diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.h b/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.h index fb616c71527..6bfdbade9c4 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.h +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.h @@ -7,11 +7,14 @@ #import +#import + NS_ASSUME_NONNULL_BEGIN namespace executorchcoreml { struct ModelLoggingOptions; class ModelEventLogger; +class MultiArray; }; @class ETCoreMLModel; @@ -49,7 +52,7 @@ __attribute__((objc_subclassing_restricted)) @interface ETCoreMLModelManager : N /// Executes the loaded model. /// /// @param handle The handle to the loaded model. -/// @param args The arguments to the model. +/// @param args The arguments (inputs and outputs) of the model. /// @param loggingOptions The model logging options. /// @param error On failure, error is filled with the failure information. /// @retval `YES` if the execution succeeded otherwise `NO`. 
@@ -59,6 +62,19 @@ __attribute__((objc_subclassing_restricted)) @interface ETCoreMLModelManager : N eventLogger:(const executorchcoreml::ModelEventLogger* _Nullable)eventLogger error:(NSError* __autoreleasing*)error; +/// Executes the loaded model. +/// +/// @param handle The handle to the loaded model. +/// @param argsVec The arguments (inputs and outputs) of the model. +/// @param loggingOptions The model logging options. +/// @param error On failure, error is filled with the failure information. +/// @retval `YES` if the execution succeeded otherwise `NO`. +- (BOOL)executeModelWithHandle:(ModelHandle*)handle + argsVec:(const std::vector&)argsVec + loggingOptions:(const executorchcoreml::ModelLoggingOptions&)loggingOptions + eventLogger:(const executorchcoreml::ModelEventLogger* _Nullable)eventLogger + error:(NSError* __autoreleasing*)error; + /// Unloads the loaded model. /// /// @param handle The handle to the loaded model. diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm index 1c0d2a30f97..c51de9d1e14 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm @@ -22,6 +22,8 @@ #import #import #import +#import +#import #import #import #import @@ -98,32 +100,60 @@ BOOL is_backed_by_same_buffer(MLMultiArray *array1, MLMultiArray *array2) { return options; } -BOOL copy(MLMultiArray *src, MLMultiArray *dst, NSError * __autoreleasing *error) { - if (![src.shape isEqualToArray:dst.shape]) { - ETCoreMLLogErrorAndSetNSError(error, 0, "%@: Model is broken", NSStringFromClass(ETCoreMLModelManager.class)); - return NO; - } +void copy(MLMultiArray *src, MLMultiArray *dst) { if (::is_backed_by_same_buffer(src, dst)) { - return YES; - } - @autoreleasepool { - [src copyInto:dst]; + return; } - return YES; + + [src copyInto:dst]; } -BOOL set_outputs(NSArray *outputs, - NSArray *model_outputs, - NSError * __autoreleasing *error) { +void set_outputs(NSArray *outputs, NSArray *model_outputs) { NSEnumerator *enumerator = [model_outputs objectEnumerator]; for (MLMultiArray *output in outputs) { MLMultiArray *model_output = [enumerator nextObject]; - if (!::copy(output, model_output, error)) { - return NO; + ::copy(model_output, output); + } +} + +std::optional get_data_type(MLMultiArrayDataType data_type) { + switch (data_type) { + case MLMultiArrayDataTypeFloat16: { + return MultiArray::DataType::Float16; + } + case MLMultiArrayDataTypeFloat32: { + return MultiArray::DataType::Float32; + } + case MLMultiArrayDataTypeFloat64: { + return MultiArray::DataType::Float64; + } + case MLMultiArrayDataTypeInt32: { + return MultiArray::DataType::Int32; + } + default: { + return std::nullopt; } } - - return YES; +} + +void copy(MLMultiArray *src, executorchcoreml::MultiArray& dst) { + [src getBytesWithHandler:^(const void * _Nonnull bytes, NSInteger size) { + if (bytes == dst.data()) { + return; + } + + MultiArray::MemoryLayout src_layout(get_data_type(src.dataType).value(), to_vector(src.shape), to_vector(src.strides)); + MultiArray(const_cast(bytes), std::move(src_layout)).copy(dst); + }]; +} + +void set_outputs(std::vector& outputs, + NSArray *model_outputs) { + NSEnumerator *enumerator = [model_outputs objectEnumerator]; + for (auto& output : outputs) { + MLMultiArray *model_output = [enumerator nextObject]; + ::copy(model_output, output); + } } NSData * _Nullable get_file_data(const inmemoryfs::InMemoryFileSystem *inMemoryFS, @@ -313,6 
+343,7 @@ void add_compute_unit(std::string& identifier, MLComputeUnits compute_units) { return result; } + #endif } //namespace @@ -467,7 +498,7 @@ - (nullable NSURL *)compiledModelURLWithIdentifier:(NSString *)identifier return [[ETCoreMLModelAnalyzer alloc] initWithCompiledModelAsset:compiledModelAsset modelAsset:modelAsset metadata:metadata - operationPathToDebugSymbolMap: operation_path_to_symbol_name_map + operationPathToDebugSymbolMap:operation_path_to_symbol_name_map configuration:configuration assetManager:self.assetManager error:error]; @@ -641,6 +672,48 @@ - (void)addPrewarmedAsset:(ETCoreMLAsset *)asset { os_unfair_lock_unlock(&_lock); } +- (nullable NSArray *)executeModelUsingExecutor:(id)executor + inputs:(NSArray *)inputs + outputBackings:(NSArray *)outputBackings + loggingOptions:(const executorchcoreml::ModelLoggingOptions&)loggingOptions + eventLogger:(const executorchcoreml::ModelEventLogger* _Nullable)eventLogger + error:(NSError * __autoreleasing *)error { + NSError *localError = nil; + ETCoreMLModel *model = executor.model; + MLPredictionOptions *predictionOptions = ::get_prediction_options(outputBackings, model.orderedOutputNames, error); + if (!predictionOptions) { + return nil; + } + + id inputFeatures = ::get_feature_provider(inputs, model.orderedInputNames, error); + if (!inputFeatures) { + return nil; + } + + NSArray *modelOutputs = [executor executeModelWithInputs:inputFeatures + predictionOptions:predictionOptions + loggingOptions:loggingOptions + eventLogger:eventLogger + error:&localError]; + // Try without output backings. + if (!modelOutputs && predictionOptions.outputBackings.count > 0) { + localError = nil; + executor.ignoreOutputBackings = YES; + } + + modelOutputs = [executor executeModelWithInputs:inputFeatures + predictionOptions:predictionOptions + loggingOptions:loggingOptions + eventLogger:eventLogger + error:&localError]; + + if (error) { + *error = localError; + } + + return modelOutputs; +} + - (BOOL)executeModelWithHandle:(ModelHandle *)handle args:(NSArray *)args loggingOptions:(const executorchcoreml::ModelLoggingOptions&)loggingOptions @@ -659,33 +732,91 @@ - (BOOL)executeModelWithHandle:(ModelHandle *)handle if (args.count != model.orderedInputNames.count + model.orderedOutputNames.count) { ETCoreMLLogErrorAndSetNSError(error, ETCoreMLErrorCorruptedModel, - "%@: Model is invalid.", - NSStringFromClass(self.class)); + "%@: Model is invalid, expected args count to be %lu but got %lu.", + NSStringFromClass(self.class), + static_cast(model.orderedInputNames.count + model.orderedOutputNames.count), + args.count); return NO; } - - NSArray *inputs = [args subarrayWithRange:NSMakeRange(0, model.orderedInputNames.count)]; - NSArray *outputs = [args subarrayWithRange:NSMakeRange(model.orderedInputNames.count, args.count - model.orderedInputNames.count)]; - id inputFeatures = ::get_feature_provider(inputs, model.orderedInputNames, error); - if (!inputFeatures) { - return NO; + @autoreleasepool { + NSArray *inputs = [args subarrayWithRange:NSMakeRange(0, model.orderedInputNames.count)]; + NSArray *outputs = [args subarrayWithRange:NSMakeRange(model.orderedInputNames.count, args.count - model.orderedInputNames.count)]; + NSArray *outputBackings = @[]; + if (executor.ignoreOutputBackings == NO) { + outputBackings = outputs; + } + + NSArray *modelOutputs = [self executeModelUsingExecutor:executor + inputs:inputs + outputBackings:outputBackings + loggingOptions:loggingOptions + eventLogger:eventLogger + error:error]; + if (!modelOutputs) { + return NO; + 
} + + ::set_outputs(outputs, modelOutputs); } - MLPredictionOptions *predictionOptions = ::get_prediction_options(outputs, model.orderedOutputNames, error); - if (!predictionOptions) { + return YES; +} + +- (BOOL)executeModelWithHandle:(ModelHandle *)handle + argsVec:(const std::vector&)argsVec + loggingOptions:(const executorchcoreml::ModelLoggingOptions&)loggingOptions + eventLogger:(const executorchcoreml::ModelEventLogger* _Nullable)eventLogger + error:(NSError * __autoreleasing *)error { + id executor = [self executorWithHandle:handle]; + if (!executor) { + ETCoreMLLogErrorAndSetNSError(error, + 0, + "%@: Model is already unloaded.", + NSStringFromClass(self.class)); return NO; } - NSArray *modelOutputs = [executor executeModelWithInputs:inputFeatures - predictionOptions:predictionOptions - loggingOptions:loggingOptions - eventLogger:eventLogger - error:error]; - if (!outputs) { + ETCoreMLModel *model = executor.model; + if (argsVec.size() != model.orderedInputNames.count + model.orderedOutputNames.count) { + ETCoreMLLogErrorAndSetNSError(error, + ETCoreMLErrorCorruptedModel, + "%@: Model is invalid, expected args count to be %lu but got %lu.", + NSStringFromClass(self.class), + static_cast(model.orderedInputNames.count + model.orderedOutputNames.count), + argsVec.size()); return NO; } - return ::set_outputs(outputs, modelOutputs, error); + std::vector inputArgs(argsVec.begin(), argsVec.begin() + model.orderedInputNames.count); + std::vector outputArgs(argsVec.begin() + model.orderedInputNames.count, argsVec.end()); + @autoreleasepool { + NSArray *inputs = [model prepareInputs:inputArgs error:error]; + if (!inputs) { + return NO; + } + + NSArray *outputBackings = @[]; + if (executor.ignoreOutputBackings == NO) { + outputBackings = [model prepareOutputBackings:outputArgs error:error]; + } + + if (!outputBackings) { + return NO; + } + + NSArray *modelOutputs = [self executeModelUsingExecutor:executor + inputs:inputs + outputBackings:outputBackings + loggingOptions:loggingOptions + eventLogger:eventLogger + error:error]; + if (!modelOutputs) { + return NO; + } + + ::set_outputs(outputArgs, modelOutputs); + return YES; + } } - (BOOL)unloadModelWithHandle:(ModelHandle *)handle { diff --git a/backends/apple/coreml/runtime/delegate/MLMultiArray_Copy.mm b/backends/apple/coreml/runtime/delegate/MLMultiArray_Copy.mm index 4aa5fffe94a..b8a10fcbbbc 100644 --- a/backends/apple/coreml/runtime/delegate/MLMultiArray_Copy.mm +++ b/backends/apple/coreml/runtime/delegate/MLMultiArray_Copy.mm @@ -7,55 +7,17 @@ #import +#import #import namespace { using namespace executorchcoreml; -template -T toValue(NSNumber *value); - -template<> size_t toValue(NSNumber *value) { - return value.unsignedLongValue; -} - -template<> ssize_t toValue(NSNumber *value) { - return value.longLongValue; -} - -template::value, T>::type> -std::vector to_vector(NSArray *numbers) { - std::vector result; - result.reserve(numbers.count); - for (NSNumber *number in numbers) { - result.emplace_back(toValue(number)); - } - - return result; -} - -MultiArray::DataType to_multi_array_data_type(MLMultiArrayDataType data_type) { - switch (data_type) { - case MLMultiArrayDataTypeInt32: { - return MultiArray::DataType::Int; - } - case MLMultiArrayDataTypeFloat: { - return MultiArray::DataType::Float; - } - case MLMultiArrayDataTypeFloat16: { - return MultiArray::DataType::Float16; - } - case MLMultiArrayDataTypeDouble: { - return MultiArray::DataType::Double; - } - } -} - MultiArray to_multi_array(void *data, MLMultiArrayDataType dataType, 
NSArray *shape, NSArray *strides) { - auto layout = MultiArray::MemoryLayout(to_multi_array_data_type(dataType), + auto layout = MultiArray::MemoryLayout(to_multiarray_data_type(dataType).value(), to_vector(shape), to_vector(strides)); return MultiArray(data, std::move(layout)); diff --git a/backends/apple/coreml/runtime/delegate/backend_delegate.h b/backends/apple/coreml/runtime/delegate/backend_delegate.h index d6a6016c087..ed921fb35bd 100644 --- a/backends/apple/coreml/runtime/delegate/backend_delegate.h +++ b/backends/apple/coreml/runtime/delegate/backend_delegate.h @@ -26,7 +26,7 @@ class BackendDelegate { struct Config { // Max models cache size in bytes. - size_t max_models_cache_size = 2 * size_t(1024) * size_t(1024) * size_t(1024); + size_t max_models_cache_size = 10 * size_t(1024) * size_t(1024) * size_t(1024); // If set to `true`, delegate pre-warms the most recently used asset. bool should_prewarm_asset = true; // If set to `true`, delegate pre-warms the model in `init`. diff --git a/backends/apple/coreml/runtime/delegate/backend_delegate.mm b/backends/apple/coreml/runtime/delegate/backend_delegate.mm index b91a6208b6a..1ded4a76b3b 100644 --- a/backends/apple/coreml/runtime/delegate/backend_delegate.mm +++ b/backends/apple/coreml/runtime/delegate/backend_delegate.mm @@ -44,44 +44,6 @@ MLComputeUnits get_compute_units(const Buffer& buffer) { return configuration; } -template::value, T>::type> -NSArray *to_array(const std::vector& array) { - NSMutableArray *result = [NSMutableArray arrayWithCapacity:array.size()]; - for (T value : array) { - [result addObject:@(value)]; - } - - return result; -} - -MLMultiArrayDataType get_data_type(MultiArray::DataType dataType) { - switch (dataType) { - case MultiArray::DataType::Float16: { - return MLMultiArrayDataTypeFloat16; - } - case MultiArray::DataType::Float: { - return MLMultiArrayDataTypeFloat32; - } - case MultiArray::DataType::Double: { - return MLMultiArrayDataTypeDouble; - } - case MultiArray::DataType::Int: { - return MLMultiArrayDataTypeInt32; - } - } -} - -MLMultiArray * _Nullable to_ml_multiarray(const MultiArray& array, NSError * __autoreleasing *error) { - const auto& layout = array.layout(); - MLMultiArray *result = [[MLMultiArray alloc] initWithDataPointer:array.data() - shape:to_array(layout.shape()) - dataType:get_data_type(layout.dataType()) - strides:to_array(layout.strides()) - deallocator:^(void * _Nonnull bytes) {} - error:error]; - return result; -} - NSURL * _Nullable create_directory_if_needed(NSURL *url, NSFileManager *fileManager, NSError * __autoreleasing *error) { @@ -194,17 +156,8 @@ bool execute(Handle* handle, ModelEventLogger *event_logger, std::error_code& ec) const noexcept override { NSError *error = nil; - NSMutableArray *model_args = [NSMutableArray arrayWithCapacity:args.size()]; - for (const auto& arg : args) { - MLMultiArray *multi_array = to_ml_multiarray(arg, &error); - if (!multi_array) { - return false; - } - [model_args addObject:multi_array]; - } - if (![model_manager_ executeModelWithHandle:handle - args:model_args + argsVec:args loggingOptions:logging_options eventLogger:event_logger error:&error]) { diff --git a/backends/apple/coreml/runtime/delegate/com.apple.executorchcoreml_config.plist b/backends/apple/coreml/runtime/delegate/com.apple.executorchcoreml_config.plist index 7dd12acaaf8..df37a47755f 100644 --- a/backends/apple/coreml/runtime/delegate/com.apple.executorchcoreml_config.plist +++ b/backends/apple/coreml/runtime/delegate/com.apple.executorchcoreml_config.plist @@ -7,6 +7,6 @@ 
shouldPrewarmModel maxAssetsSizeInBytes - 2147483648 + 1073741824 diff --git a/backends/apple/coreml/runtime/delegate/coreml_backend_delegate.mm b/backends/apple/coreml/runtime/delegate/coreml_backend_delegate.mm index a51e73ee68d..b672d4a08e4 100644 --- a/backends/apple/coreml/runtime/delegate/coreml_backend_delegate.mm +++ b/backends/apple/coreml/runtime/delegate/coreml_backend_delegate.mm @@ -28,16 +28,25 @@ using namespace executorchcoreml; std::optional get_data_type(ScalarType scalar_type) { - if (scalar_type == ScalarType::Float) { - return MultiArray::DataType::Float; - } else if (scalar_type == ScalarType::Double) { - return MultiArray::DataType::Double; - } else if (scalar_type == ScalarType::Half) { - return MultiArray::DataType::Float16; - } else if (scalar_type == ScalarType::Int) { - return MultiArray::DataType::Int; - } else { - return std::nullopt; + switch (scalar_type) { + case ScalarType::Bool: + return MultiArray::DataType::Bool; + case ScalarType::Byte: + return MultiArray::DataType::Byte; + case ScalarType::Short: + return MultiArray::DataType::Short; + case ScalarType::Int: + return MultiArray::DataType::Int32; + case ScalarType::Long: + return MultiArray::DataType::Int64; + case ScalarType::Half: + return MultiArray::DataType::Float16; + case ScalarType::Float: + return MultiArray::DataType::Float32; + case ScalarType::Double: + return MultiArray::DataType::Float64; + default: + return std::nullopt; } } @@ -54,6 +63,7 @@ auto tensor = eValue->toTensor(); auto dataType = get_data_type(tensor.scalar_type()); if (!dataType.has_value()) { + ET_LOG(Error, "%s: DataType=%d is not supported", ETCoreMLStrings.delegateIdentifier.UTF8String, (int)tensor.scalar_type()); return std::nullopt; } @@ -167,7 +177,7 @@ ModelLoggingOptions get_logging_options(BackendExecutionContext& context) { auto multi_array = get_multi_array(args[i], ArgType::Input); ET_CHECK_OR_RETURN_ERROR(multi_array.has_value(), Internal, - "%s: Expected tensor at args[%zu]", ETCoreMLStrings.delegateIdentifier.UTF8String, i); + "%s: Failed to create multiarray from input at args[%zu]", ETCoreMLStrings.delegateIdentifier.UTF8String, i); delegate_args.emplace_back(std::move(multi_array.value())); } @@ -176,7 +186,7 @@ ModelLoggingOptions get_logging_options(BackendExecutionContext& context) { auto multi_array = get_multi_array(args[i], ArgType::Output); ET_CHECK_OR_RETURN_ERROR(multi_array.has_value(), Internal, - "%s: Expected tensor at args[%zu]", ETCoreMLStrings.delegateIdentifier.UTF8String, i); + "%s: Failed to create multiarray from output at args[%zu]", ETCoreMLStrings.delegateIdentifier.UTF8String, i); delegate_args.emplace_back(std::move(multi_array.value())); } diff --git a/backends/apple/coreml/runtime/delegate/multiarray.h b/backends/apple/coreml/runtime/delegate/multiarray.h index cd165373dc8..70a2a08a2f7 100644 --- a/backends/apple/coreml/runtime/delegate/multiarray.h +++ b/backends/apple/coreml/runtime/delegate/multiarray.h @@ -7,6 +7,9 @@ #pragma once +#import +#import +#import #import namespace executorchcoreml { @@ -29,13 +32,33 @@ class Buffer { }; /// A class representing a MultiArray. -class MultiArray { +class MultiArray final { public: /// The MultiArray datatype. - enum class DataType : uint8_t { Int = 0, Double, Float, Float16 }; + enum class DataType : uint8_t { + Bool = 0, + Byte, + Char, + Short, + Int32, + Int64, + Float16, + Float32, + Float64, + }; + + /// Options for copying. 
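(Aside on the `get_data_type` switch added above in `coreml_backend_delegate.mm`: it widens the set of supported dtypes and returns `std::nullopt` for anything it cannot map, so the caller can log and bail out instead of silently mis-typing a tensor. A minimal standalone sketch of that pattern, using stand-in enums rather than the real `ScalarType` and `MultiArray::DataType`.)

```cpp
#include <cstdio>
#include <optional>

// Stand-in enums; the real ones live in ExecuTorch and the CoreML delegate.
enum class ScalarType { Bool, Byte, Short, Int, Long, Half, Float, Double, BFloat16 };
enum class DataType { Bool, Byte, Short, Int32, Int64, Float16, Float32, Float64 };

std::optional<DataType> to_data_type(ScalarType st) {
    switch (st) {
        case ScalarType::Bool:   return DataType::Bool;
        case ScalarType::Byte:   return DataType::Byte;
        case ScalarType::Short:  return DataType::Short;
        case ScalarType::Int:    return DataType::Int32;
        case ScalarType::Long:   return DataType::Int64;
        case ScalarType::Half:   return DataType::Float16;
        case ScalarType::Float:  return DataType::Float32;
        case ScalarType::Double: return DataType::Float64;
        default:                 return std::nullopt;  // e.g. BFloat16 stays unsupported
    }
}

int main() {
    auto dt = to_data_type(ScalarType::BFloat16);
    if (!dt) {
        // Mirrors the intent of the new ET_LOG(Error, ...) call: fail loudly, not silently.
        std::printf("DataType is not supported\n");
    }
    return 0;
}
```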
+ struct CopyOptions { + inline CopyOptions() noexcept : use_bnns(true), use_memcpy(true) { } + + inline CopyOptions(bool use_bnns, bool use_memcpy) noexcept : use_bnns(use_bnns), use_memcpy(use_memcpy) { } + + bool use_bnns = true; + bool use_memcpy = true; + }; /// A class describing the memory layout of a MultiArray. - class MemoryLayout { + class MemoryLayout final { public: MemoryLayout(DataType dataType, std::vector shape, std::vector strides) : dataType_(dataType), shape_(std::move(shape)), strides_(std::move(strides)) { } @@ -53,7 +76,10 @@ class MultiArray { inline size_t rank() const noexcept { return shape_.size(); } /// Returns the number of elements in the MultiArray. - size_t get_num_elements() const noexcept; + size_t num_elements() const noexcept; + + /// Returns the byte size of an element. + size_t num_bytes() const noexcept; /// Returns `true` if the memory layout is packed otherwise `false`. bool is_packed() const noexcept; @@ -78,11 +104,42 @@ class MultiArray { /// Copies this into another `MultiArray`. /// /// @param dst The destination `MultiArray`. - bool copy(MultiArray& dst) const noexcept; + void copy(MultiArray& dst, CopyOptions options = CopyOptions()) const noexcept; + + /// Get the value at `indices`. + template inline T value(const std::vector& indices) const noexcept { + return *(static_cast(data(indices))); + } + + /// Set the value at `indices`. + template inline void set_value(const std::vector& indices, T value) const noexcept { + T* ptr = static_cast(data(indices)); + *ptr = value; + } + + /// Get the value at `index`. + template inline T value(size_t index) const noexcept { return *(static_cast(data(index))); } + + /// Set the value at `index`. + template inline void set_value(size_t index, T value) const noexcept { + T* ptr = static_cast(data(index)); + *ptr = value; + } private: + void* data(const std::vector& indices) const noexcept; + + void* data(size_t index) const noexcept; + void* data_; MemoryLayout layout_; }; +/// Converts `MultiArray::DataType` to `MLMultiArrayDataType`. +std::optional to_ml_multiarray_data_type(MultiArray::DataType data_type); + +/// Converts `MLMultiArrayDataType` to `MultiArray::DataType`. 
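(Aside on the `MemoryLayout` changes above: the class gains `num_elements()` and `num_bytes()`, and the copy path keys off `is_packed()` to decide when a raw `memcpy` is legal. A rough self-contained sketch of those helpers, assuming C-style last-major strides; this is an illustration of the idea, not the delegate's exact code.)

```cpp
#include <cassert>
#include <numeric>
#include <vector>

enum class DataType { Bool, Byte, Char, Short, Int32, Int64, Float16, Float32, Float64 };

size_t num_bytes(DataType dt) {
    switch (dt) {
        case DataType::Bool: case DataType::Byte: case DataType::Char: return 1;
        case DataType::Short: case DataType::Float16:                  return 2;
        case DataType::Int32: case DataType::Float32:                  return 4;
        case DataType::Int64: case DataType::Float64:                  return 8;
    }
    return 0;
}

size_t num_elements(const std::vector<size_t>& shape) {
    if (shape.empty()) return 0;
    return std::accumulate(shape.begin(), shape.end(), size_t(1), std::multiplies<size_t>());
}

// A layout is "packed" when the strides are exactly the contiguous strides,
// i.e. stride[i] == product(shape[i+1:]). Only then can the buffer be memcpy'd whole.
bool is_packed(const std::vector<size_t>& shape, const std::vector<long>& strides) {
    long expected = 1;
    for (size_t i = shape.size(); i-- > 0;) {
        if (strides[i] != expected) return false;
        expected *= static_cast<long>(shape[i]);
    }
    return true;
}

int main() {
    std::vector<size_t> shape = {1, 3, 10, 10};
    assert(num_elements(shape) == 300);
    assert(is_packed(shape, {300, 100, 10, 1}));
    assert(!is_packed(shape, {1920, 640, 64, 1}));  // padded rows are not packed
    assert(num_bytes(DataType::Float16) == 2);
    return 0;
}
```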
+std::optional to_multiarray_data_type(MLMultiArrayDataType data_type); + + } // namespace executorchcoreml diff --git a/backends/apple/coreml/runtime/delegate/multiarray.mm b/backends/apple/coreml/runtime/delegate/multiarray.mm index 3b8dcb98a30..74996fb8d5a 100644 --- a/backends/apple/coreml/runtime/delegate/multiarray.mm +++ b/backends/apple/coreml/runtime/delegate/multiarray.mm @@ -10,120 +10,16 @@ #import #import - #import #import +#import +#import #import namespace { using namespace executorchcoreml; -template -struct TypedMultiArray { - explicit TypedMultiArray(T *data, MultiArray::MemoryLayout layout) noexcept - :data(data), layout(std::move(layout)) - {} - - T *data; - MultiArray::MemoryLayout layout; -}; - -#pragma mark - BNNS - -template -struct BNNSCopier { - static bool supported() noexcept { - return false; - } - - static void copy(BNNSNDArrayDescriptor *src_bnns_desc, BNNSNDArrayDescriptor *dstNNSDesc) noexcept {} -}; - -// float -> _Float16 -template<> -struct BNNSCopier { - static bool supported() noexcept { - return true; - } - - static void copy(BNNSNDArrayDescriptor *src_bnns_desc, BNNSNDArrayDescriptor *dst_bnns_desc) noexcept { - src_bnns_desc->data_type = BNNSDataTypeFloat32; - dst_bnns_desc->data_type = BNNSDataTypeFloat16; - BNNSCopy(src_bnns_desc, dst_bnns_desc, NULL); - } -}; - -// float -> int32_t -template<> -struct BNNSCopier { - static bool supported() noexcept { - return true; - } - - static void copy(BNNSNDArrayDescriptor *src_bnns_desc, BNNSNDArrayDescriptor *dst_bnns_desc) noexcept { - src_bnns_desc->data_type = BNNSDataTypeFloat32; - dst_bnns_desc->data_type = BNNSDataTypeInt32; - BNNSCopy(src_bnns_desc, dst_bnns_desc, NULL); - } -}; - -// _Float16 -> float -template<> -struct BNNSCopier<_Float16, float> { - static bool supported() noexcept { - return true; - } - - static void copy(BNNSNDArrayDescriptor *src_bnns_desc, BNNSNDArrayDescriptor *dst_bnns_desc) noexcept { - src_bnns_desc->data_type = BNNSDataTypeFloat16; - dst_bnns_desc->data_type = BNNSDataTypeFloat32; - BNNSCopy(src_bnns_desc, dst_bnns_desc, NULL); - } -}; - -// _Float16 -> int32_t -template<> -struct BNNSCopier<_Float16, int32_t> { - static bool supported() noexcept { - return true; - } - - static void copy(BNNSNDArrayDescriptor *src_bnns_desc, BNNSNDArrayDescriptor *dst_bnns_desc) noexcept { - src_bnns_desc->data_type = BNNSDataTypeFloat16; - dst_bnns_desc->data_type = BNNSDataTypeInt32; - BNNSCopy(src_bnns_desc, dst_bnns_desc, NULL); - } -}; - -// int32_t -> _Float16 -template<> -struct BNNSCopier { - static bool supported() noexcept { - return true; - } - - static void copy(BNNSNDArrayDescriptor *src_bnns_desc, BNNSNDArrayDescriptor *dst_bnns_desc) noexcept { - src_bnns_desc->data_type = BNNSDataTypeInt32; - dst_bnns_desc->data_type = BNNSDataTypeFloat16; - BNNSCopy(src_bnns_desc, dst_bnns_desc, NULL); - } -}; - -// int32_t -> float -template<> -struct BNNSCopier { - static bool supported() noexcept { - return true; - } - - static void copy(BNNSNDArrayDescriptor *src_bnns_desc, BNNSNDArrayDescriptor *dst_bnns_desc) noexcept { - src_bnns_desc->data_type = BNNSDataTypeInt32; - dst_bnns_desc->data_type = BNNSDataTypeFloat32; - BNNSCopy(src_bnns_desc, dst_bnns_desc, NULL); - } -}; - -/// Returns BNNSDataLayout and sets strides from the multi-array strides. +// Returns BNNSDataLayout and sets strides from the multi-array strides. /// /// BNNS requires strides to be non-decreasing order; /// `bnns_strides[i] <= bnns_strides[i + 1]`. 
BNNSDataLayout defines @@ -132,408 +28,491 @@ static void copy(BNNSNDArrayDescriptor *src_bnns_desc, BNNSNDArrayDescriptor *ds /// @param multi_array_strides The multiarray strides. /// @param bnns_strides The bnns strides. /// @retval The `BNNSDataLayout`. -BNNSDataLayout get_bnns_data_layout(const std::vector& multi_array_strides, size_t *bnns_strides) { - uint32_t firstMajorFlag = 1; +std::optional get_bnns_data_layout(const std::vector& multi_array_strides, + size_t *bnns_strides) { + bool first_major = false; uint32_t rank = static_cast(multi_array_strides.size()); if (rank > BNNS_MAX_TENSOR_DIMENSION) { - return (BNNSDataLayout)-1; + return std::nullopt; } if (std::is_sorted(multi_array_strides.begin(), multi_array_strides.end(), std::less())) { - firstMajorFlag = 0; + first_major = false; std::copy(multi_array_strides.begin(), multi_array_strides.end(), bnns_strides); } else if (std::is_sorted(multi_array_strides.begin(), multi_array_strides.end(), std::greater()) ) { - firstMajorFlag = 1; + first_major = true; std::copy(multi_array_strides.rbegin(), multi_array_strides.rend(), bnns_strides); } else { - return (BNNSDataLayout)-1; + return std::nullopt; } // See BNNSDataLayout's raw value how this bitwise-or makes sense. - return (BNNSDataLayout)((rank << 16) | (8 << 12) | firstMajorFlag); + return (BNNSDataLayout) (0x08000 + // flags as canonical first/last major type + 0x10000 * rank + // set dimensionality + (first_major ? 1 : 0)); // set first/last major bit } -/// Initializes BNNSNDArrayDescriptor for the shape and strides. +/// Returns `BNNSDataType` from `MultiArray::DataType`. /// -/// @param layout The memory layout. -/// @param desc The ``BNNSNDArrayDescriptor` to be initialized. -/// @retval `true` if the initialization succeeded otherwise `false`. -bool init_bnns_array_descriptor(const MultiArray::MemoryLayout& layout, BNNSNDArrayDescriptor *desc) { - BNNSDataLayout bnns_layout = get_bnns_data_layout(layout.strides(), desc->stride); - if (bnns_layout == (BNNSDataLayout)-1) { - return false; - } - - std::memset(desc, 0, sizeof(*desc)); - const auto& shape = layout.shape(); - std::copy(shape.begin(), shape.end(), desc->size); - desc->layout = bnns_layout; - desc->data_scale = 1.0f; - desc->data_bias = 0.0f; - - return true; -} - -template -struct MultiArrayBNNSCopier { - static bool copy(TypedMultiArray& src, TypedMultiArray& dst) { - if (!BNNSCopier::supported()) { - return false; +/// @param datatype The multiarray datatype. +/// @retval The `BNNSDataType`. 
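(Aside on `get_bnns_data_layout` above: it normalizes the strides, since BNNS wants them in non-decreasing order, and packs the rank plus a first/last-major flag into the layout's raw value. The sketch below reproduces just that numeric composition as written in the hunk, without pulling in the Accelerate headers; treat the constants as an illustration of the patch's comment, not as independent API documentation.)

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <functional>
#include <optional>
#include <vector>

// A last-major (C-contiguous) layout has ascending strides and passes through;
// a first-major layout has descending strides and gets reversed. Anything else
// falls off the BNNS fast path. The real code also rejects ranks above
// BNNS_MAX_TENSOR_DIMENSION, which is omitted here.
std::optional<uint32_t> layout_code(const std::vector<long>& strides,
                                    std::vector<size_t>& bnns_strides) {
    const uint32_t rank = static_cast<uint32_t>(strides.size());
    bool first_major = false;
    if (std::is_sorted(strides.begin(), strides.end(), std::less<long>())) {
        first_major = false;
        bnns_strides.assign(strides.begin(), strides.end());
    } else if (std::is_sorted(strides.begin(), strides.end(), std::greater<long>())) {
        first_major = true;
        bnns_strides.assign(strides.rbegin(), strides.rend());
    } else {
        return std::nullopt;
    }
    // Same composition as the hunk: canonical-type flag + rank + first/last-major bit.
    return 0x08000u + 0x10000u * rank + (first_major ? 1u : 0u);
}

int main() {
    std::vector<size_t> out;
    auto code = layout_code({300, 100, 10, 1}, out);  // descending => first-major
    if (code) std::printf("layout code = 0x%x\n", static_cast<unsigned>(*code));
    return 0;
}
```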
+std::optional get_bnns_data_type(MultiArray::DataType datatype) { + switch (datatype) { + case MultiArray::DataType::Bool: { + return BNNSDataTypeBoolean; } - - BNNSNDArrayDescriptor src_bnns_array; - BNNSNDArrayDescriptor dst_bnns_array; - if (!init_bnns_array_descriptor(src.layout, &src_bnns_array) || !init_bnns_array_descriptor(dst.layout, &dst_bnns_array)) { - return false; + case MultiArray::DataType::Byte: { + return BNNSDataTypeUInt8; + } + case MultiArray::DataType::Char: { + return BNNSDataTypeInt8; + } + case MultiArray::DataType::Short: { + return BNNSDataTypeInt16; + } + case MultiArray::DataType::Int32: { + return BNNSDataTypeInt32; + } + case MultiArray::DataType::Int64: { + return BNNSDataTypeInt64; + } + case MultiArray::DataType::Float16: { + return BNNSDataTypeFloat16; + } + case MultiArray::DataType::Float32: { + return BNNSDataTypeFloat32; + } + default: { + return std::nullopt; } - - BNNSCopier::copy(&src_bnns_array, &dst_bnns_array); - return true; } -}; - -#pragma mark - VImageCopier +} -bool init_vi_Buffer(const MultiArray::MemoryLayout& layout, vImage_Buffer *viBuf, size_t bytesPerScalar) { - size_t rank = layout.rank(); - const auto& shape = layout.shape(); - const auto& strides = layout.strides(); - - if (rank < 2) { - // vImage path requires at least two dimensions. - return false; - } - - // vImage blitter requires first major and every dimension except row (shape[rank - 2]) is contiguous. - if (!std::is_sorted(strides.begin(), strides.end(), std::greater())) { +/// Initializes BNNS array descriptor from multi array. +/// +/// @param bnns_descriptor The descriptor to be initialized. +/// @param multi_array The multiarray. +/// @retval `true` if the initialization succeeded otherwise `false`. +bool init_bnns_descriptor(BNNSNDArrayDescriptor& bnns_descriptor, const MultiArray& multi_array) { + const auto& layout = multi_array.layout(); + if (layout.num_elements() == 1) { return false; } - if (strides[rank - 1] != 1) { + auto bnns_datatype = get_bnns_data_type(layout.dataType()); + if (!bnns_datatype) { return false; } - size_t height = std::accumulate(shape.begin(), shape.end() - 1, size_t(1), std::multiplies()); - if (height * strides[rank - 2] != strides[0] * shape[0]) { + std::memset(&bnns_descriptor, 0, sizeof(bnns_descriptor)); + auto bnns_layout = get_bnns_data_layout(layout.strides(), bnns_descriptor.stride); + if (!bnns_layout) { return false; } - size_t width = shape[rank - 1]; - size_t rowBytes = strides[rank - 2] * bytesPerScalar; - - viBuf->data = NULL; - viBuf->height = height; - viBuf->width = width; - viBuf->rowBytes = rowBytes; + const auto& shape = layout.shape(); + std::copy(shape.begin(), shape.end(), bnns_descriptor.size); + bnns_descriptor.layout = bnns_layout.value(); + bnns_descriptor.data_scale = 1.0f; + bnns_descriptor.data_bias = 0.0f; + bnns_descriptor.data_type = bnns_datatype.value(); + bnns_descriptor.data = multi_array.data(); return true; } -template -struct VImageCopier { - static bool supported() noexcept { +bool copy_using_bnns(const MultiArray& src, MultiArray& dst) { + if (dst.layout().num_bytes() < src.layout().num_bytes()) { return false; } - - static void copy(vImage_Buffer *src_vi_buffer, vImage_Buffer *dst_vi_buffer) noexcept {} -}; - -template -struct VImageCopier { - static bool supported() noexcept { - return true; + BNNSNDArrayDescriptor src_descriptor; + if (!init_bnns_descriptor(src_descriptor, src)) { + return false; } - static void copy(vImage_Buffer *src_vi_buffer, vImage_Buffer *dst_vi_buffer) noexcept { - 
vImageCopyBuffer(src_vi_buffer, dst_vi_buffer, sizeof(T), kvImageDoNotTile); - } -}; - -// float -> _Float16 -template <> -struct VImageCopier { - static bool supported() noexcept { - return true; + BNNSNDArrayDescriptor dst_descriptor; + if (!init_bnns_descriptor(dst_descriptor, dst)) { + return false; } - static void copy(vImage_Buffer *src_vi_buffer, vImage_Buffer *dst_vi_buffer) noexcept { - vImageConvert_PlanarFtoPlanar16F(src_vi_buffer, dst_vi_buffer, kvImageDoNotTile); - } -}; + return BNNSCopy(&dst_descriptor, &src_descriptor, NULL) == 0; +} -// _Float16 -> float -template <> -struct VImageCopier<_Float16, float> { - static bool supported() noexcept { - return true; - } +std::vector get_layouts(const std::vector& arrays) { + std::vector result; + result.reserve(arrays.size()); - static void copy(vImage_Buffer *src_vi_buffer, vImage_Buffer *dst_vi_buffer) noexcept { - vImageConvert_Planar16FtoPlanarF(src_vi_buffer, dst_vi_buffer, kvImageDoNotTile); - } -}; - -template -struct MultiArrayVImageCopier { - static bool copy(TypedMultiArray& src, TypedMultiArray& dst) { - if (!VImageCopier::supported()) { - return false; - } - - vImage_Buffer src_vi_buffer; - vImage_Buffer dst_vi_buffer; - if (!init_vi_Buffer(src.layout, &src_vi_buffer, sizeof(T1))) { - return false; - } - - if (!init_vi_Buffer(dst.layout, &dst_vi_buffer, sizeof(T2))) { - return false; - } - - VImageCopier::copy(&src_vi_buffer, &dst_vi_buffer); - return true; - } -}; - -#pragma mark - VDSPCopier - -template -struct VDSPCopier { - static bool supported() noexcept { - return false; - } + std::transform(arrays.begin(), arrays.end(), std::back_inserter(result), [](const auto& array) { + return array.layout(); + }); - static void copy(const T1 *src_data, T2 *dst_data, size_t num_elements) noexcept {} -}; + return result; +} -// Double -> Float -template<> -struct VDSPCopier { - static bool supported() noexcept { - return true; - } +std::vector get_datas(const std::vector& arrays) { + std::vector result; + result.reserve(arrays.size()); - static void copy(const double *src_data, float *dst_data, size_t num_elements) noexcept { - vDSP_vdpsp(src_data, 1, dst_data, 1, num_elements); - } -}; - -// Float -> Double -template<> -struct VDSPCopier { - static bool supported() noexcept { - return true; - } + std::transform(arrays.begin(), arrays.end(), std::back_inserter(result), [](const auto& array) { + return array.data(); + }); - static void copy(const float *src_data, double *dst_data, size_t num_elements) noexcept { - vDSP_vspdp(src_data, 1, dst_data, 1, num_elements); - } -}; + return result; +} -// Float -> Int32 -template<> -struct VDSPCopier { - static bool supported() noexcept { +// We can coalesce two adjacent dimensions if either dim has size 1 or if `shape[n] * stride[n] == stride[n + 1]`. 
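(Aside on the coalescing condition stated in the comment above: merging two adjacent dims when either extent is 1 or `shape[n] * stride[n] == stride[n + 1]`, after reversing to fastest-first order, is what lets the element-wise copier walk a padded buffer with fewer nested loops. A single-array sketch of the idea, using the shapes exercised by the new MultiArrayTests later in this patch.)

```cpp
#include <cstdio>
#include <utility>
#include <vector>

// Coalesce adjacent dimensions of one strided array. Dimensions are given
// fastest-varying first (i.e. already reversed, as the patch does internally).
// Two dims merge when either extent is 1 or shape[i] * stride[i] == stride[i + 1].
std::pair<std::vector<size_t>, std::vector<long>>
coalesce(std::vector<size_t> shape, std::vector<long> strides) {
    size_t prev = 0;
    for (size_t dim = 1; dim < shape.size(); ++dim) {
        const bool mergeable = shape[prev] == 1 || shape[dim] == 1 ||
                               static_cast<long>(shape[prev]) * strides[prev] == strides[dim];
        if (mergeable) {
            if (shape[prev] == 1) strides[prev] = strides[dim];
            shape[prev] *= shape[dim];
        } else {
            ++prev;
            shape[prev] = shape[dim];
            strides[prev] = strides[dim];
        }
    }
    shape.resize(prev + 1);
    strides.resize(prev + 1);
    return {shape, strides};
}

int main() {
    // Contiguous (1, 3, 10, 10): collapses to a single run of 300 elements.
    auto a = coalesce({10, 10, 3, 1}, {1, 10, 100, 300});
    std::printf("contiguous -> %zu dim(s), extent %zu\n", a.first.size(), a.first[0]);

    // Row-padded (stride 64 instead of 10): the innermost rows cannot merge.
    auto b = coalesce({10, 10, 3, 1}, {1, 64, 640, 1920});
    std::printf("padded     -> %zu dim(s)\n", b.first.size());
    return 0;
}
```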
+bool can_coalesce_dimensions(const std::vector& shape, + const std::vector& strides, + size_t dim1, + size_t dim2) { + auto shape1 = shape[dim1]; + auto shape2 = shape[dim2]; + if (shape1 == 1 || shape2 == 1) { return true; } - static void copy(const float *src_data, int32_t *dst_data, size_t num_elements) noexcept { - vDSP_vfix32(src_data, 1, dst_data, 1, num_elements); - } -}; + auto stride1 = strides[dim1]; + auto stride2 = strides[dim2]; + return shape1 * stride1 == stride2; +} -// Int32 -> Double -template<> -struct VDSPCopier { - static bool supported() noexcept { - return true; +bool can_coalesce_dimensions(const std::vector& shape, + const std::vector>& all_strides, + size_t dim1, + size_t dim2) { + for (const auto& strides : all_strides) { + if (!::can_coalesce_dimensions(shape, strides, dim1, dim2)) { + return false; + } } - static void copy(const int32_t *src_data, double *dst_data, size_t num_elements) noexcept { - vDSP_vflt32D(src_data, 1, dst_data, 1, num_elements); - } -}; + return true; +} -// Int32 -> Float -template<> -struct VDSPCopier { - static bool supported() noexcept { - return true; - } - - static void copy(const int32_t *src_data, float *dst_data, size_t num_elements) noexcept { - vDSP_vflt32(src_data, 1, dst_data, 1, num_elements); +void update_strides(std::vector>& all_strides, + size_t dim1, + size_t dim2) { + for (auto& strides : all_strides) { + strides[dim1] = strides[dim2]; } -}; +} -template -struct MultiArrayVDSPCopier { - static bool copy(TypedMultiArray& src, TypedMultiArray& dst) { - if (!VDSPCopier::supported()) { - return false; - } - - if (!src.layout.is_packed() || !dst.layout.is_packed()) { - return false; +std::vector coalesce_dimensions(std::vector layouts) { + if (layouts.size() == 0) { + return {}; + } + + std::vector shape = layouts.back().shape(); + // reverse shape. + std::reverse(shape.begin(), shape.end()); + std::vector> all_strides; + // reverse strides. 
+ all_strides.reserve(layouts.size()); + std::transform(layouts.begin(), layouts.end(), std::back_inserter(all_strides), [](const MultiArray::MemoryLayout& layout) { + auto strides = layout.strides(); + std::reverse(strides.begin(), strides.end()); + return strides; + }); + size_t rank = layouts[0].rank(); + size_t prev_dim = 0; + for (size_t dim = 1; dim < rank; ++dim) { + if (::can_coalesce_dimensions(shape, all_strides, prev_dim, dim)) { + if (shape[prev_dim] == 1) { + ::update_strides(all_strides, prev_dim, dim); + } + shape[prev_dim] *= shape[dim]; + } else { + ++prev_dim; + if (prev_dim != dim) { + ::update_strides(all_strides, prev_dim, dim); + shape[prev_dim] = shape[dim]; + } } - - VDSPCopier::copy(src.data, dst.data, src.layout.get_num_elements()); - return true; } -}; - -#pragma mark - MemCopy - -template -struct MemCopier { - static bool supported() noexcept { - return false; + + if (rank == prev_dim + 1) { + return layouts; } - static void copy(const T1 *src_data, T2 *dst_data, size_t num_elements) noexcept {} -}; - -template -struct MemCopier { - static bool supported() noexcept { - return true; + shape.resize(prev_dim + 1); + for (auto& strides : all_strides) { + strides.resize(prev_dim + 1); } - static void copy(const T *src_data, T *dst_data, size_t num_elements) noexcept { - std::memcpy(dst_data, src_data, num_elements); + std::vector result; + result.reserve(layouts.size()); + std::reverse(shape.begin(), shape.end()); + for (size_t i = 0; i < layouts.size(); ++i) { + std::reverse(all_strides[i].begin(), all_strides[i].end()); + result.emplace_back(layouts[i].dataType(), shape, std::move(all_strides[i])); } + + return result; +} + +enum class Direction : uint8_t { + Forward = 0, + Backward }; -template -struct MultiArrayMemCopier { - static bool copy(TypedMultiArray& src, TypedMultiArray& dst) { - if (!MemCopier::supported()) { - return false; - } - - if (!src.layout.is_packed() || !dst.layout.is_packed()) { - return false; +void set_data_pointers(std::vector& data_pointers, + ssize_t index, + size_t dim, + Direction direction, + const std::vector& layouts) { + for (size_t i = 0; i < layouts.size(); ++i) { + const auto& layout = layouts[i]; + const ssize_t stride = layout.strides()[dim]; + const size_t num_bytes = layout.num_bytes(); + ssize_t offset = 0; + switch (direction) { + case Direction::Forward: { + offset = stride * index * num_bytes; + break; + } + case Direction::Backward: { + offset = - stride * index * num_bytes; + break; + } } - - MemCopier::copy(src.data, dst.data, src.layout.get_num_elements()); - return true; + data_pointers[i] = (void *)(static_cast(data_pointers[i]) + offset); } -}; +} + +void increment_data_pointers(std::vector& data_pointers, + size_t index, + size_t dim, + const std::vector& layouts) { + set_data_pointers(data_pointers, index, dim, Direction::Forward, layouts); +} -#pragma mark - MultiArrayIterator -/// TODO - remove recursion and coalesce contiguous dimensions. 
-template -struct MultiArrayIterator { - explicit MultiArrayIterator(TypedMultiArray& array1, TypedMultiArray& array2) - :array1(array1), array2(array2) +void decrement_data_pointers(std::vector& data_pointers, + size_t index, + size_t dim, + const std::vector& layouts) { + set_data_pointers(data_pointers, index, dim, Direction::Backward, layouts); +} + +class MultiArrayIterator final { +public: + explicit MultiArrayIterator(const std::vector& arrays) + :datas_(get_datas(arrays)), + layouts_(coalesce_dimensions(get_layouts(arrays))) {} +private: template - void loop(FN&& fn, T1 *data1, T2 *data2, size_t dim) { - const size_t index = dim - 1; - const auto& layout1 = array1.layout; - const auto& layout2 = array2.layout; - const ssize_t stride1 = layout1.strides()[index]; - const ssize_t stride2 = layout2.strides()[index]; - const size_t bound = layout1.shape()[index]; - - if (index == 0) { - for (size_t i = 0; i < bound; i++) { - if (fn(data1 + stride1 * i, data2 + stride2 * i)) { - break; + void exec(FN&& fn, const std::vector& layouts, std::vector datas, size_t n) { + const auto& layout = layouts.back(); + // Avoid function call for rank <= 2. + switch (n) { + case 0: { + break; + } + case 1: { + for (size_t i = 0; i < layout.shape()[0]; ++i) { + ::increment_data_pointers(datas, i, 0, layouts); + fn(datas); + ::decrement_data_pointers(datas, i, 0, layouts); + } + break; + } + case 2: { + for (size_t i = 0; i < layout.shape()[1]; ++i) { + ::increment_data_pointers(datas, i, 1, layouts); + for (size_t j = 0; j < layout.shape()[0]; ++j) { + ::increment_data_pointers(datas, j, 0, layouts); + fn(datas); + ::decrement_data_pointers(datas, j, 0, layouts); + } + ::decrement_data_pointers(datas, i, 1, layouts); + } + + break; + } + + default: { + const size_t bound = layouts.back().shape()[n - 1]; + for (size_t index = 0; index < bound; ++index) { + ::increment_data_pointers(datas, index, n - 1, layouts); + exec(std::forward(fn), layouts, datas, n - 1); + ::decrement_data_pointers(datas, index, n - 1, layouts); } } - return; - } - - for (size_t i = 0; i < bound; i++) { - loop(fn, data1 + stride1 * i, data2 + stride2 * i, dim - 1); } } +public: template - void loop(FN&& fn) { - loop(fn, array1.data, array2.data, array1.layout.rank()); + void exec(FN&& fn) { + std::vector datas = datas_; + exec(fn, layouts_, datas, layouts_[0].rank()); } - TypedMultiArray array1; - TypedMultiArray array2; +private: + std::vector datas_; + std::vector layouts_; }; +/// BNNS has no double type, so we handle the conversions here. 
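(Aside on the comment above: since `double` has no BNNS counterpart, the fallback is a scalar conversion dispatched over the source and destination data types, which is what the `copy_value` / `copy` templates that follow implement. A condensed sketch of that double dispatch with a reduced type set and no `_Float16`.)

```cpp
#include <cassert>
#include <cstdint>

enum class DataType { Int32, Int64, Float32, Float64 };

// Convert one element: read as S, static_cast to D, store.
template <typename D, typename S>
void copy_value(void* dst, const void* src) {
    *static_cast<D*>(dst) = static_cast<D>(*static_cast<const S*>(src));
}

// Inner dispatch: the source type S is known, switch on the destination type.
template <typename S>
void copy_from(void* dst, DataType dst_type, const void* src) {
    switch (dst_type) {
        case DataType::Int32:   copy_value<int32_t, S>(dst, src); break;
        case DataType::Int64:   copy_value<int64_t, S>(dst, src); break;
        case DataType::Float32: copy_value<float, S>(dst, src);   break;
        case DataType::Float64: copy_value<double, S>(dst, src);  break;
    }
}

// Outer dispatch: switch on the source type to pick S.
void copy_scalar(void* dst, DataType dst_type, const void* src, DataType src_type) {
    switch (src_type) {
        case DataType::Int32:   copy_from<int32_t>(dst, dst_type, src); break;
        case DataType::Int64:   copy_from<int64_t>(dst, dst_type, src); break;
        case DataType::Float32: copy_from<float>(dst, dst_type, src);   break;
        case DataType::Float64: copy_from<double>(dst, dst_type, src);  break;
    }
}

int main() {
    double src = 3.0;
    int32_t dst = 0;
    copy_scalar(&dst, DataType::Int32, &src, DataType::Float64);
    assert(dst == 3);
    return 0;
}
```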
template -struct MultiArrayLoopingCopier { - static bool copy(TypedMultiArray& src, TypedMultiArray& dst) { - auto looper = MultiArrayIterator(src, dst); - looper.loop([](T1 *src, T2 *dst){ - *dst = static_cast(*src); - return true; - }); - - return true; - } -}; +inline void copy_value(void *dst, const void *src) { + const T2 *src_ptr = static_cast(src); + T1 *dst_ptr = static_cast(dst); + *dst_ptr = static_cast(*src_ptr); +} -template -struct MultiArrayCopier { - static bool copy(TypedMultiArray& src, TypedMultiArray& dst) { - if (src.layout.shape() != dst.layout.shape()) { - return false; +template +void copy(void *dst, + MultiArray::DataType dst_data_type, + const void *src) { + switch (dst_data_type) { + case MultiArray::DataType::Bool: { + ::copy_value(dst, src); + break; + } + + case MultiArray::DataType::Byte: { + ::copy_value(dst, src); + break; + } + + case MultiArray::DataType::Char: { + ::copy_value(dst, src); + break; + } + + case MultiArray::DataType::Short: { + ::copy_value(dst, src); + break; } - - if (src.layout.get_num_elements() == 0) { - return true; + + case MultiArray::DataType::Int32: { + ::copy_value(dst, src); + break; } - - if (MultiArrayBNNSCopier::copy(src, dst)) { - return true; + + case MultiArray::DataType::Int64: { + ::copy_value(dst, src); + break; } - - if (MultiArrayVImageCopier::copy(src, dst)) { - return true; + + case MultiArray::DataType::Float16: { + ::copy_value<_Float16, T>(dst, src); + break; } - - if (MultiArrayVDSPCopier::copy(src, dst)) { - return true; + + case MultiArray::DataType::Float32: { + ::copy_value(dst, src); + break; } - - if (MultiArrayMemCopier::copy(src, dst)) { - return true; + + case MultiArray::DataType::Float64: { + ::copy_value(dst, src); + break; } - - return MultiArrayLoopingCopier::copy(src, dst); } -}; +} -template -bool copy(TypedMultiArray& src, MultiArray& dst) { - const auto& dstLayout = dst.layout(); - switch (dstLayout.dataType()) { - case MultiArray::DataType::Int: { - auto dst_array = TypedMultiArray(reinterpret_cast(dst.data()), dstLayout); - return MultiArrayCopier::copy(src, dst_array); +void copy(void *dst, + MultiArray::DataType dst_data_type, + const void *src, + MultiArray::DataType src_data_type) { + switch (src_data_type) { + case MultiArray::DataType::Bool: { + ::copy(dst, dst_data_type, src); + break; + } + + case MultiArray::DataType::Byte: { + ::copy(dst, dst_data_type, src); + break; + } + + case MultiArray::DataType::Char: { + ::copy(dst, dst_data_type, src); + break; + } + + case MultiArray::DataType::Short: { + ::copy(dst, dst_data_type, src); + break; + } + + case MultiArray::DataType::Int32: { + ::copy(dst, dst_data_type, src); + break; + } + + case MultiArray::DataType::Int64: { + ::copy(dst, dst_data_type, src); + break; } case MultiArray::DataType::Float16: { - auto dst_array = TypedMultiArray<_Float16>(reinterpret_cast<_Float16 *>(dst.data()), dstLayout); - return MultiArrayCopier::copy(src, dst_array); + ::copy<_Float16>(dst, dst_data_type, src); + break; } - case MultiArray::DataType::Float: { - auto dst_array = TypedMultiArray(reinterpret_cast(dst.data()), dstLayout); - return MultiArrayCopier::copy(src, dst_array); + case MultiArray::DataType::Float32: { + ::copy(dst, dst_data_type, src); + break; } - case MultiArray::DataType::Double: { - auto dst_array = TypedMultiArray(reinterpret_cast(dst.data()), dstLayout); - return MultiArrayCopier::copy(src, dst_array); + case MultiArray::DataType::Float64: { + ::copy(dst, dst_data_type, src); + break; } } } -} //namespace + +void copy(const 
MultiArray& src, MultiArray& dst, MultiArray::CopyOptions options) { + if (options.use_bnns && copy_using_bnns(src, dst)) { + return; + } + + if (options.use_memcpy && + src.layout().dataType() == dst.layout().dataType() && + src.layout().is_packed() && + dst.layout().is_packed()) { + std::memcpy(dst.data(), src.data(), src.layout().num_elements() * src.layout().num_bytes()); + return; + } + + auto iterator = MultiArrayIterator({src, dst}); + iterator.exec([&](const std::vector& datas){ + void *src_data = datas[0]; + void *dst_data = datas[1]; + ::copy(dst_data, dst.layout().dataType(), src_data, src.layout().dataType()); + }); +} + +ssize_t get_data_offset(const std::vector& indices, const std::vector& strides) { + ssize_t offset = 0; + for (size_t i = 0; i < indices.size(); ++i) { + offset += static_cast(indices[i]) * strides[i]; + } + + return offset; +} + +ssize_t get_data_offset(size_t index, const std::vector& shape, const std::vector& strides) { + size_t div = std::accumulate(shape.begin(), shape.end(), size_t(1), std::multiplies());; + size_t offset = 0; + for (size_t i = 0; i < shape.size(); ++i) { + div /= shape[i]; + size_t dim_index = index / div; + offset += dim_index * strides[i]; + index %= div; + } + + return offset; +} +} namespace executorchcoreml { -size_t MultiArray::MemoryLayout::get_num_elements() const noexcept { +size_t MultiArray::MemoryLayout::num_elements() const noexcept { if (shape_.size() == 0) { return 0; } @@ -553,32 +532,101 @@ bool copy(TypedMultiArray& src, MultiArray& dst) { return false; } expectedStride = expectedStride * (*shapeIt); + stridesIt++; } return true; } -bool MultiArray::copy(MultiArray& dst) const noexcept { - switch (layout().dataType()) { - case MultiArray::DataType::Int: { - auto src = TypedMultiArray(reinterpret_cast(data()), layout()); - return ::copy(src, dst); +size_t MultiArray::MemoryLayout::num_bytes() const noexcept { + switch (dataType()) { + case MultiArray::DataType::Bool: { + return 1; + } + case MultiArray::DataType::Byte: { + return 1; + } + case MultiArray::DataType::Char: { + return 1; + } + case MultiArray::DataType::Short: { + return 2; + } + case MultiArray::DataType::Int32: { + return 4; + } + case MultiArray::DataType::Int64: { + return 8; } - case MultiArray::DataType::Float16: { - auto src = TypedMultiArray<_Float16>(reinterpret_cast<_Float16 *>(data()), layout()); - return ::copy(src, dst); + return 2; } - - case MultiArray::DataType::Float: { - auto src = TypedMultiArray(reinterpret_cast(data()), layout()); - return ::copy(src, dst); + case MultiArray::DataType::Float32: { + return 4; } - - case MultiArray::DataType::Double: { - auto src = TypedMultiArray(reinterpret_cast(data()), layout()); - return ::copy(src, dst); + case MultiArray::DataType::Float64: { + return 8; + } + } +} + +void MultiArray::copy(MultiArray& dst, CopyOptions options) const noexcept { + assert(layout().shape() == dst.layout().shape()); + ::copy(*this, dst, options); +} + +std::optional to_ml_multiarray_data_type(MultiArray::DataType data_type) { + switch (data_type) { + case MultiArray::DataType::Float16: { + return MLMultiArrayDataTypeFloat16; + } + case MultiArray::DataType::Float32: { + return MLMultiArrayDataTypeFloat32; + } + case MultiArray::DataType::Float64: { + return MLMultiArrayDataTypeDouble; + } + case MultiArray::DataType::Int32: { + return MLMultiArrayDataTypeInt32; + } + default: { + return std::nullopt; + } + } +} + +std::optional to_multiarray_data_type(MLMultiArrayDataType data_type) { + switch (data_type) { + case 
MLMultiArrayDataTypeFloat16: { + return MultiArray::DataType::Float16; + } + case MLMultiArrayDataTypeFloat32: { + return MultiArray::DataType::Float32; + } + case MLMultiArrayDataTypeFloat64: { + return MultiArray::DataType::Float64; + } + case MLMultiArrayDataTypeInt32: { + return MultiArray::DataType::Int32; + } + default: { + return std::nullopt; } } } + +void *MultiArray::data(const std::vector& indices) const noexcept { + assert(indices.size() == layout().shape().size()); + uint8_t *ptr = static_cast(data()); + ssize_t offset = ::get_data_offset(indices, layout().strides()); + return ptr + offset * layout().num_bytes(); +} + +void *MultiArray::data(size_t index) const noexcept { + assert(index < layout().num_elements()); + uint8_t *ptr = static_cast(data()); + ssize_t offset = ::get_data_offset(index, layout().shape(), layout().strides()); + return ptr + offset * layout().num_bytes(); +} + } // namespace executorchcoreml diff --git a/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.h b/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.h index 51204e34387..4048dae5fea 100644 --- a/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.h +++ b/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.h @@ -48,6 +48,9 @@ __attribute__((objc_subclassing_restricted)) /// The model. @property (readonly, strong, nonatomic) ETCoreMLModel* model; +/// If set to `YES` then output backing are ignored. +@property (readwrite, atomic) BOOL ignoreOutputBackings; + @end NS_ASSUME_NONNULL_END diff --git a/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.mm b/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.mm index e7f05662d28..57212445e55 100644 --- a/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.mm +++ b/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.mm @@ -170,6 +170,10 @@ - (nullable instancetype)initWithCompiledModelAsset:(ETCoreMLAsset *)compiledMod loggingOptions:(const executorchcoreml::ModelLoggingOptions&)loggingOptions eventLogger:(const executorchcoreml::ModelEventLogger* _Nullable)eventLogger error:(NSError * __autoreleasing *)error { + if (self.ignoreOutputBackings) { + predictionOptions.outputBackings = @{}; + } + NSError *localError = nil; NSArray *outputs = nil; if (loggingOptions.log_profiling_info) { diff --git a/backends/apple/coreml/runtime/test/BackendDelegateTests.mm b/backends/apple/coreml/runtime/test/BackendDelegateTests.mm index c74cb564495..6f0e3cff31f 100644 --- a/backends/apple/coreml/runtime/test/BackendDelegateTests.mm +++ b/backends/apple/coreml/runtime/test/BackendDelegateTests.mm @@ -14,69 +14,32 @@ #import #import #import +#import using namespace executorchcoreml; namespace { -template -T toValue(NSNumber *value); -template<> -size_t toValue(NSNumber *value) { - return value.unsignedLongLongValue; -} - -template<> -ssize_t toValue(NSNumber *value) { - return value.longLongValue; -} - -template -std::vector toVector(NSArray *values) { - std::vector result; - result.reserve(values.count); - for (NSNumber *value in values) { - result.emplace_back(toValue(value)); - } - - return result; -} - -MultiArray::DataType toDataType(MLMultiArrayDataType dataType) { - switch (dataType) { - case MLMultiArrayDataTypeFloat: { - return MultiArray::DataType::Float; - } - case MLMultiArrayDataTypeFloat16: { - return MultiArray::DataType::Float16; - } - case MLMultiArrayDataTypeDouble: { - return MultiArray::DataType::Double; - } - case MLMultiArrayDataTypeInt32: { - return MultiArray::DataType::Int; - } - } -} - -MultiArray 
toMultiArray(MLMultiArray *mlMultiArray) { - auto shape = toVector(mlMultiArray.shape); - auto strides = toVector(mlMultiArray.strides); - auto layout = MultiArray::MemoryLayout(toDataType(mlMultiArray.dataType), std::move(shape), std::move(strides)); +MultiArray to_multiarray(MLMultiArray *ml_multiarray) { + auto shape = to_vector(ml_multiarray.shape); + auto strides = to_vector(ml_multiarray.strides); + auto layout = MultiArray::MemoryLayout(to_multiarray_data_type(ml_multiarray.dataType).value(), + std::move(shape), + std::move(strides)); __block void *bytes = nullptr; - [mlMultiArray getMutableBytesWithHandler:^(void *mutableBytes, __unused NSInteger size, __unused NSArray *strides) { + [ml_multiarray getMutableBytesWithHandler:^(void *mutableBytes, __unused NSInteger size, __unused NSArray *strides) { bytes = mutableBytes; }]; return MultiArray(bytes, std::move(layout)); } -std::vector toMultiArrays(NSArray *mlMultiArrays) { +std::vector to_multiarrays(NSArray *ml_multiarrays) { std::vector result; - result.reserve(mlMultiArrays.count); + result.reserve(ml_multiarrays.count); - for (MLMultiArray *mlMultiArray in mlMultiArrays) { - result.emplace_back(toMultiArray(mlMultiArray)); + for (MLMultiArray *ml_multiarray in ml_multiarrays) { + result.emplace_back(to_multiarray(ml_multiarray)); } return result; } @@ -198,7 +161,7 @@ - (void)testAddModelExecution { NSArray *args = [inputs arrayByAddingObject:output]; std::error_code errorCode; XCTAssertTrue(_delegate->execute(handle, - toMultiArrays(args), + to_multiarrays(args), ModelLoggingOptions(), nullptr, errorCode)); @@ -223,7 +186,7 @@ - (void)testMulModelExecution { NSArray *args = [inputs arrayByAddingObject:output]; std::error_code errorCode; XCTAssertTrue(_delegate->execute(handle, - toMultiArrays(args), + to_multiarrays(args), ModelLoggingOptions(), nullptr, errorCode)); diff --git a/backends/apple/coreml/runtime/test/CoreMLBackendDelegateTests.mm b/backends/apple/coreml/runtime/test/CoreMLBackendDelegateTests.mm index 13f8343adf2..94b862d8424 100644 --- a/backends/apple/coreml/runtime/test/CoreMLBackendDelegateTests.mm +++ b/backends/apple/coreml/runtime/test/CoreMLBackendDelegateTests.mm @@ -15,7 +15,7 @@ #import #import -static constexpr size_t kRuntimeMemorySize = 10 * 1024U * 1024U; // 10 MB +static constexpr size_t kRuntimeMemorySize = 50 * 1024U * 1024U; // 50 MB using namespace torch::executor; using torch::executor::testing::TensorFactory; @@ -104,7 +104,7 @@ ET_LOG(Info, "Skipping non-tensor input %zu", i); continue; } - Buffer buffer(tensor_meta->nbytes(), 1); + Buffer buffer(tensor_meta->nbytes(), 0); auto sizes = tensor_meta->sizes(); exec_aten::TensorImpl tensor_impl(tensor_meta->scalar_type(), std::size(sizes), const_cast(sizes.data()), buffer.data()); exec_aten::Tensor tensor(&tensor_impl); @@ -155,8 +155,8 @@ - (void)testProgramLoad { XCTAssert(method.ok()); } -- (void)executeModelAtURL:(NSURL *)modelURL nTimes:(NSUInteger)nTimes { - for (NSUInteger i = 0; i < nTimes; i++) { +- (void)executeModelAtURL:(NSURL *)modelURL nLoads:(NSUInteger)nLoads nExecutions:(NSUInteger)nExecutions { + for (NSUInteger i = 0; i < nLoads; ++i) { auto loader = std::make_unique(modelURL.path.UTF8String); auto program = get_program(loader.get()); XCTAssert(program != nullptr); @@ -165,41 +165,44 @@ - (void)executeModelAtURL:(NSURL *)modelURL nTimes:(NSUInteger)nTimes { auto plannedBuffers = get_planned_buffers(methodName.get(), program.get()); XCTAssert(plannedBuffers.ok()); Buffer methodBuffer(kRuntimeMemorySize, 0); - MemoryAllocator 
methodAllocator(static_cast(methodBuffer.size()), methodBuffer.data()); + __block MemoryAllocator methodAllocator(static_cast(methodBuffer.size()), methodBuffer.data()); auto spans = to_spans(plannedBuffers.get()); HierarchicalAllocator plannedAllocator({spans.data(), spans.size()}); MemoryManager memoryManger(&methodAllocator, &plannedAllocator); - auto method = program->load_method(methodName.get().c_str(), &memoryManger); + __block auto method = program->load_method(methodName.get().c_str(), &memoryManger); XCTAssert(method.ok()); auto inputs = ::prepare_input_tensors(method.get()); - auto status = method->execute(); - XCTAssertEqual(status, Error::Ok); auto outputs = methodAllocator.allocateList(method->outputs_size()); - status = method->get_outputs(outputs, method->outputs_size()); - XCTAssertEqual(status, Error::Ok); + for (NSUInteger j = 0; j < nExecutions; ++j) { + auto status = method->execute(); + XCTAssertEqual(status, Error::Ok); + status = method->get_outputs(outputs, method->outputs_size()); + XCTAssertEqual(status, Error::Ok); + } } } - (void)testAddProgramExecute { NSURL *modelURL = [[self class] bundledResourceWithName:@"add_coreml_all" extension:@"pte"]; XCTAssertNotNil(modelURL); - [self executeModelAtURL:modelURL nTimes:10]; + [self executeModelAtURL:modelURL nLoads:5 nExecutions:2]; } - (void)testMulProgramExecute { NSURL *modelURL = [[self class] bundledResourceWithName:@"mul_coreml_all" extension:@"pte"]; XCTAssertNotNil(modelURL); - [self executeModelAtURL:modelURL nTimes:10]; + [self executeModelAtURL:modelURL nLoads:5 nExecutions:2]; } - (void)testMV3ProgramExecute { NSURL *modelURL = [[self class] bundledResourceWithName:@"mv3_coreml_all" extension:@"pte"]; XCTAssertNotNil(modelURL); - [self executeModelAtURL:modelURL nTimes:10]; + [self executeModelAtURL:modelURL nLoads:5 nExecutions:2]; } - (void)executeMultipleModelsConcurrently:(NSArray *)modelURLs - nTimes:(NSUInteger)nTimes + nLoads:(NSUInteger)nLoads + nExecutions:(NSUInteger)nExecutions timeout:(NSTimeInterval)timeout { NSMutableArray *expectations = [NSMutableArray arrayWithCapacity:modelURLs.count]; dispatch_queue_t queue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0); @@ -208,7 +211,7 @@ - (void)executeMultipleModelsConcurrently:(NSArray *)modelURLs XCTestExpectation *expectation = [[XCTestExpectation alloc] initWithDescription:description]; [expectations addObject:expectation]; dispatch_async(queue, ^{ - [self executeModelAtURL:modelURL nTimes:nTimes]; + [self executeModelAtURL:modelURL nLoads:nLoads nExecutions:nExecutions]; [expectation fulfill]; }); } @@ -221,7 +224,8 @@ - (void)testMultipleModelExecutionConcurrently { NSURL *modelURL2 = [[self class] bundledResourceWithName:@"mul_coreml_all" extension:@"pte"]; NSURL *modelURL3 = [[self class] bundledResourceWithName:@"mv3_coreml_all" extension:@"pte"]; [self executeMultipleModelsConcurrently:@[modelURL1, modelURL2, modelURL3] - nTimes:10 + nLoads:5 + nExecutions:2 timeout:5 * 60]; } @@ -229,7 +233,8 @@ - (void)testSameModelExecutionConcurrently { NSURL *modelURL1 = [[self class] bundledResourceWithName:@"mv3_coreml_all" extension:@"pte"]; NSURL *modelURL2 = [[self class] bundledResourceWithName:@"mv3_coreml_all" extension:@"pte"]; [self executeMultipleModelsConcurrently:@[modelURL1, modelURL2] - nTimes:10 + nLoads:5 + nExecutions:2 timeout:5 * 60]; } diff --git a/backends/apple/coreml/runtime/test/ETCoreMLModelManagerTests.mm b/backends/apple/coreml/runtime/test/ETCoreMLModelManagerTests.mm index d20d292cf69..8ad712497ea 100644 
--- a/backends/apple/coreml/runtime/test/ETCoreMLModelManagerTests.mm +++ b/backends/apple/coreml/runtime/test/ETCoreMLModelManagerTests.mm @@ -115,7 +115,7 @@ - (void)testAddModelExecution { NSArray *args = [inputs arrayByAddingObject:output]; XCTAssertTrue([self.modelManager executeModelWithHandle:handle args:args - loggingOptions:executorchcoreml::ModelLoggingOptions() + loggingOptions:executorchcoreml::ModelLoggingOptions() eventLogger:nullptr error:&localError]); for (NSUInteger i = 0; i < output.count; i++) { diff --git a/backends/apple/coreml/runtime/test/MultiArrayTests.mm b/backends/apple/coreml/runtime/test/MultiArrayTests.mm new file mode 100644 index 00000000000..895702ae154 --- /dev/null +++ b/backends/apple/coreml/runtime/test/MultiArrayTests.mm @@ -0,0 +1,133 @@ +// +// MultiArrayTests.mm +// +// Copyright © 2024 Apple Inc. All rights reserved. +// +// Please refer to the license found in the LICENSE file in the root directory of the source tree. + +#import +#import +#import + +#import + +using namespace executorchcoreml; + +namespace { +size_t get_buffer_size(const std::vector& shape, const std::vector& srides) { + auto max_stride_it = std::max_element(srides.begin(), srides.end()); + size_t max_stride_axis = static_cast(std::distance(srides.begin(), max_stride_it)); + size_t dimension_with_max_stride = shape[max_stride_axis]; + return dimension_with_max_stride * (*max_stride_it); +} + +template +MultiArray::DataType get_multiarray_data_type(); + +template<> MultiArray::DataType get_multiarray_data_type() { + return MultiArray::DataType::Float32; +} + +template<> MultiArray::DataType get_multiarray_data_type() { + return MultiArray::DataType::Float64; +} + +template<> MultiArray::DataType get_multiarray_data_type() { + return MultiArray::DataType::Int64; +} + +template<> MultiArray::DataType get_multiarray_data_type() { + return MultiArray::DataType::Int32; +} + +template<> MultiArray::DataType get_multiarray_data_type() { + return MultiArray::DataType::Short; +} + +template<> MultiArray::DataType get_multiarray_data_type<_Float16>() { + return MultiArray::DataType::Float16; +} + +template +void verify_values(const MultiArray& multiarray1, const MultiArray& multiarray2) { + for (size_t i = 0; i < multiarray1.layout().num_elements(); ++i) { + XCTAssertEqual(multiarray1.value(i), multiarray2.value(i)); + } +} + +template +MultiArray make_multi_array(const std::vector& shape, const std::vector& strides, std::vector& storage) { + storage.resize(get_buffer_size(shape, strides) * sizeof(T), 0); + MultiArray::MemoryLayout layout(get_multiarray_data_type(), shape, strides); + return MultiArray(storage.data(), std::move(layout)); +} + +template +MultiArray make_multi_array_and_fill(const std::vector& shape, const std::vector& strides, std::vector& storage) { + auto result = make_multi_array(shape, strides, storage); + for (size_t i = 0; i < result.layout().num_elements(); ++i) { + T value = static_cast(i); + result.set_value(i, value); + } + + return result; +} + +template +void verify_copy_(const std::vector& shape, + const std::vector& src_strides, + const std::vector& dst_strides) { + std::vector src_storage; + auto src_multiarray = make_multi_array_and_fill(shape, src_strides, src_storage); + + std::vector dst_storage; + auto dst_multiarray = make_multi_array(shape, dst_strides, dst_storage); + src_multiarray.copy(dst_multiarray, MultiArray::CopyOptions(true, false)); + verify_values(src_multiarray, dst_multiarray); + + dst_storage.clear(); + 
dst_storage.resize(get_buffer_size(shape, dst_strides) * sizeof(T2), 0); + src_multiarray.copy(dst_multiarray, MultiArray::CopyOptions(false, false)); + verify_values(src_multiarray, dst_multiarray); +} + +template +void verify_copy(const std::vector& shape, + const std::vector& src_strides, + const std::vector& dst_strides) { + verify_copy_(shape, src_strides, dst_strides); + verify_copy_(shape, src_strides, dst_strides); +} +} //namespace + +@interface MultiArrayTests : XCTestCase + +@end + +@implementation MultiArrayTests + +- (void)verifyDataCopyWithShape:(const std::vector&)shape + srcStrides:(const std::vector&)srcStrides + dstStrides:(const std::vector&)dstStrides { + verify_copy(shape, srcStrides, dstStrides); + verify_copy(shape, srcStrides, dstStrides); + verify_copy(shape, srcStrides, dstStrides); + verify_copy(shape, srcStrides, srcStrides); + verify_copy(shape, srcStrides, dstStrides); + verify_copy(shape, srcStrides, srcStrides); +} + +- (void)testAdjacentDataCopy { + std::vector shape = {1, 3, 10, 10}; + std::vector strides = {3 * 10 * 10, 10 * 10, 10, 1}; + [self verifyDataCopyWithShape:shape srcStrides:strides dstStrides:strides]; +} + +- (void)testNonAdjacentDataCopy { + std::vector shape = {1, 3, 10, 10}; + std::vector srcStrides = {3 * 10 * 64, 10 * 64, 64, 1}; + std::vector dstStrides = {3 * 10 * 10 * 10, 10 * 10 * 10, 100, 10}; + [self verifyDataCopyWithShape:shape srcStrides:srcStrides dstStrides:dstStrides]; +} + +@end diff --git a/backends/apple/coreml/runtime/util/objc_array_util.h b/backends/apple/coreml/runtime/util/objc_array_util.h new file mode 100644 index 00000000000..5f4c8c7bc26 --- /dev/null +++ b/backends/apple/coreml/runtime/util/objc_array_util.h @@ -0,0 +1,42 @@ +// +// objc_array_util.h +// util +// +// Copyright © 2024 Apple Inc. All rights reserved. +// +// Please refer to the license found in the LICENSE file in the root directory of the source tree. 
+ +#import +#import +#import + +namespace executorchcoreml { + +template T to_value(NSNumber* value); + +template <> inline size_t to_value(NSNumber* value) { return value.unsignedLongValue; } + +template <> inline ssize_t to_value(NSNumber* value) { return value.longLongValue; } + +template ::value, T>::type> +inline NSArray* to_array(const std::vector& array) { + NSMutableArray* result = [NSMutableArray arrayWithCapacity:array.size()]; + for (T value: array) { + [result addObject:@(value)]; + } + + return result; +} + +template ::value, T>::type> +inline std::vector to_vector(NSArray* numbers) { + std::vector result; + result.reserve(numbers.count); + for (NSNumber* number in numbers) { + result.emplace_back(to_value(number)); + } + + return result; +} + +} diff --git a/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj b/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj index 4c9fc081b9e..d8ee4ea693a 100644 --- a/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj +++ b/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj @@ -100,6 +100,8 @@ C9E7D7952AB3F9BF00CCAE5D /* ETCoreMLModelManagerTests.mm in Sources */ = {isa = PBXBuildFile; fileRef = C9E7D78D2AB3F9BF00CCAE5D /* ETCoreMLModelManagerTests.mm */; }; C9E7D7962AB3F9BF00CCAE5D /* KeyValueStoreTests.mm in Sources */ = {isa = PBXBuildFile; fileRef = C9E7D78E2AB3F9BF00CCAE5D /* KeyValueStoreTests.mm */; }; C9E7D7A22AB3FBB200CCAE5D /* CoreMLBackendDelegateTests.mm in Sources */ = {isa = PBXBuildFile; fileRef = C9E7D7A12AB3FBB200CCAE5D /* CoreMLBackendDelegateTests.mm */; }; + F24817E52BC655E100E80D98 /* libexecutorch_no_prim_ops.a in Frameworks */ = {isa = PBXBuildFile; fileRef = F24817E42BC655E100E80D98 /* libexecutorch_no_prim_ops.a */; }; + C9EC7E1B2BC73B3200A6B166 /* MultiArrayTests.mm in Sources */ = {isa = PBXBuildFile; fileRef = C9EC7E1A2BC73B3200A6B166 /* MultiArrayTests.mm */; }; /* End PBXBuildFile section */ /* Begin PBXCopyFilesBuildPhase section */ @@ -297,6 +299,9 @@ C9EA3DB22B71A2B200B7D7BD /* CoreML.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = CoreML.framework; path = System/Library/Frameworks/CoreML.framework; sourceTree = SDKROOT; }; C9EA3FDE2B73EEA000B7D7BD /* libsqlite3.tbd */ = {isa = PBXFileReference; lastKnownFileType = "sourcecode.text-based-dylib-definition"; name = libsqlite3.tbd; path = usr/lib/libsqlite3.tbd; sourceTree = SDKROOT; }; C9EA3FE52B73EF6300B7D7BD /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; }; + F24817E42BC655E100E80D98 /* libexecutorch_no_prim_ops.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libexecutorch_no_prim_ops.a; path = ../libraries/libexecutorch_no_prim_ops.a; sourceTree = ""; }; + C9EC7E092BC662A300A6B166 /* objc_array_util.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = objc_array_util.h; path = ../util/objc_array_util.h; sourceTree = ""; }; + C9EC7E1A2BC73B3200A6B166 /* MultiArrayTests.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; name = MultiArrayTests.mm; path = ../test/MultiArrayTests.mm; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -305,6 +310,7 @@ buildActionMask = 2147483647; files = ( C94D510F2ABDF87500AF47FD /* 
Accelerate.framework in Frameworks */, + F24817E52BC655E100E80D98 /* libexecutorch_no_prim_ops.a in Frameworks */, C94D510E2ABDF86800AF47FD /* libsqlite3.tbd in Frameworks */, C94D50D92ABD7B2400AF47FD /* CoreML.framework in Frameworks */, C99883862B95AD7D000953A3 /* libprotobuf-lite.a in Frameworks */, @@ -523,6 +529,7 @@ C96560942AABFDCE005F8126 /* libsqlite3.tbd */, C96560922AABF992005F8126 /* CoreML.framework */, C96560902AABF982005F8126 /* Accelerate.framework */, + F24817E42BC655E100E80D98 /* libexecutorch_no_prim_ops.a */, C965608D2AABF72A005F8126 /* libexecutorch.a */, ); name = "Recovered References"; @@ -536,6 +543,7 @@ C97716DB2AF44D9A00FC0DAC /* objc_json_serde.h */, C97716DC2AF44E7B00FC0DAC /* objc_json_serde.mm */, C97716DE2AF44FC400FC0DAC /* objc_safe_cast.h */, + C9EC7E092BC662A300A6B166 /* objc_array_util.h */, ); name = util; sourceTree = ""; @@ -574,6 +582,7 @@ C998838C2B96841D000953A3 /* ETCoreMLModelStructurePathTests.mm */, C998838E2B96999F000953A3 /* ETCoreMLModelProfilerTests.mm */, C962271A2B984FB9002D13B7 /* ETCoreMLModelDebuggerTests.mm */, + C9EC7E1A2BC73B3200A6B166 /* MultiArrayTests.mm */, ); name = test; sourceTree = ""; @@ -724,6 +733,7 @@ C945E9372B997EEE009C3FAC /* FeatureTypes.pb.cc in Sources */, C945E9402B997EEE009C3FAC /* OneHotEncoder.pb.cc in Sources */, C94D50E82ABDF81100AF47FD /* key_value_store.cpp in Sources */, + C9EC7E1B2BC73B3200A6B166 /* MultiArrayTests.mm in Sources */, C945E9452B997EEE009C3FAC /* BayesianProbitRegressor.pb.cc in Sources */, C945E8E52B997ECE009C3FAC /* ETCoreMLOperationProfilingInfo.mm in Sources */, C945E9312B997EEE009C3FAC /* DataStructures.pb.cc in Sources */, diff --git a/backends/apple/coreml/scripts/build_tests.sh b/backends/apple/coreml/scripts/build_tests.sh index 72afca2d6ce..730ba0839db 100755 --- a/backends/apple/coreml/scripts/build_tests.sh +++ b/backends/apple/coreml/scripts/build_tests.sh @@ -59,6 +59,7 @@ cmake --build "$CMAKE_PROTOBUF_BUILD_DIR_PATH" -j9 -t libprotobuf-lite echo "ExecuTorch: Copying libraries" mkdir "$LIBRARIES_DIR_PATH" cp -f "$CMAKE_EXECUTORCH_BUILD_DIR_PATH/libexecutorch.a" "$LIBRARIES_DIR_PATH" +cp -f "$CMAKE_EXECUTORCH_BUILD_DIR_PATH/libexecutorch_no_prim_ops.a" "$LIBRARIES_DIR_PATH" cp -f "$CMAKE_PROTOBUF_BUILD_DIR_PATH/libprotobuf-lite.a" "$LIBRARIES_DIR_PATH" #Copy ExecuTorch headers diff --git a/backends/apple/coreml/scripts/install_requirements.sh b/backends/apple/coreml/scripts/install_requirements.sh index 0f703c9e430..b48ac7bfb69 100755 --- a/backends/apple/coreml/scripts/install_requirements.sh +++ b/backends/apple/coreml/scripts/install_requirements.sh @@ -24,7 +24,7 @@ rm -rf "$COREML_DIR_PATH/third-party" mkdir "$COREML_DIR_PATH/third-party" echo "${green}ExecuTorch: Cloning coremltools." -git clone "https://github.com/apple/coremltools.git" $COREMLTOOLS_DIR_PATH +git clone --depth 1 --branch 7.2 "https://github.com/apple/coremltools.git" $COREMLTOOLS_DIR_PATH cd $COREMLTOOLS_DIR_PATH STATUS=$? diff --git a/backends/apple/coreml/setup.md b/backends/apple/coreml/setup.md index c01f6e2d238..4e66544f7bb 100644 --- a/backends/apple/coreml/setup.md +++ b/backends/apple/coreml/setup.md @@ -29,8 +29,8 @@ python3 -m examples.apple.coreml.scripts.export --model_name add 4. You can now integrate the **Core ML** backend in code. 
```python -# Lower to Core ML backend -lowered_module = to_backend('CoreMLBackend', to_be_lowered_exir_submodule, []) +# Delegate to Core ML backend +delegated_program_manager = edge_program_manager.to_backend(CoreMLPartitioner()) ``` @@ -46,15 +46,15 @@ lowered_module = to_backend('CoreMLBackend', to_be_lowered_exir_submodule, []) xcode-select --install ``` -2. Build **Core ML** delegate. The following will create a `executorch.xcframework` in `cmake-out` directory. +4. Build **Core ML** delegate. The following will create `executorch.xcframework` and `coreml_backend.xcframework` in the `cmake-out` directory. ```bash cd executorch ./build/build_apple_frameworks.sh --Release --coreml ``` -3. Open the project in Xcode, and drag the `executorch.xcframework` generated from Step 2 to Frameworks. +5. Open the project in Xcode, and drag `executorch.xcframework` and `coreml_backend.xcframework` frameworks generated from Step 2 to Frameworks. -4. Go to project Target’s Build Phases - Link Binaries With Libraries, click the + sign, and add the following frameworks: +6. Go to project Target’s Build Phases - Link Binaries With Libraries, click the + sign, and add the following frameworks: ``` executorch.xcframework @@ -63,9 +63,9 @@ coreml_backend.xcframework 5. Go to project Target’s Build Phases - Link Binaries With Libraries, click the + sign, and add the following frameworks. ``` -- Accelerate.framework -- CoreML.framework -- libsqlite3.tbd +Accelerate.framework +CoreML.framework +libsqlite3.tbd ``` 6. The target could now run a **Core ML** delegated **Program**. diff --git a/backends/apple/mps/CMakeLists.txt b/backends/apple/mps/CMakeLists.txt index ef64e26f2cc..a3b0bdab670 100644 --- a/backends/apple/mps/CMakeLists.txt +++ b/backends/apple/mps/CMakeLists.txt @@ -70,13 +70,16 @@ target_link_libraries(mpsdelegate PRIVATE bundled_program mps_schema - ${_executor_runner_libs} + executorch_no_prim_ops ${FOUNDATION_FRAMEWORK} ${METAL_FRAMEWORK} ${MPS_FRAMEWORK} ${MPS_GRAPG_FRAMEWORK} ) +target_link_options_shared_lib(mpsdelegate) +target_compile_options(mpsdelegate PUBLIC ${_common_compile_options}) + install( TARGETS mpsdelegate DESTINATION lib diff --git a/backends/apple/mps/mps_preprocess.py b/backends/apple/mps/mps_preprocess.py index 0e543d7e079..bb828ed0f90 100644 --- a/backends/apple/mps/mps_preprocess.py +++ b/backends/apple/mps/mps_preprocess.py @@ -18,6 +18,7 @@ from executorch.backends.apple.mps.serialization.mps_graph_schema import ( MPSGraph, MPSTensor, + OpType, ) from executorch.backends.apple.mps.serialization.mps_graph_serialize import ( @@ -65,6 +66,7 @@ def preprocess( input_ids=[], output_ids=[], constant_ids=[], + graph_type=OpType.mps_graph, ) convert_model_to_fp16 = True @@ -111,6 +113,16 @@ def handle_call_function( mps_graph: MPSGraph, ) -> None: logging.info(f"Visiting: {node}, {node.target.__name__}") + + if ( + "delegation_tag" in node.meta + and "metal_kernel" in node.meta["delegation_tag"] + ): + logging.info( + f"Node '{node.target.__name__}' was marked as a Metal kernel by the MPSPartitioner!" 
+ ) + mps_graph.graph_type = OpType.metal_kernel + if node.target.__name__ in node_visitors: node_visitors[node.target.__name__].define_node(node, mps_graph) else: diff --git a/backends/apple/mps/operators/indexing_ops.py b/backends/apple/mps/operators/indexing_ops.py index f2c9dc6aeab..690549973a4 100644 --- a/backends/apple/mps/operators/indexing_ops.py +++ b/backends/apple/mps/operators/indexing_ops.py @@ -3,7 +3,7 @@ # Provided subject to the LICENSE file in the top level directory. # -from typing import cast +from typing import cast, List import torch from executorch.backends.apple.mps.operators.node_visitor import ( @@ -13,9 +13,12 @@ from executorch.backends.apple.mps.serialization.mps_graph_schema import ( MPSEmbedding, MPSGraph, + MPSIndexPut, MPSIndexSelect, + MPSIndexTensor, ) from executorch.backends.apple.mps.utils.mps_utils import get_input_node +from executorch.backends.transforms import get_shape from executorch.exir.sym_util import eval_expr @@ -40,6 +43,78 @@ def define_node( mps_graph.mps_nodes.append(mps_node) +@register_node_visitor +class IndexTensorVisitor(NodeVisitor): + target = "aten.index.Tensor" + + def __init__(self, *args) -> None: + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + mps_graph: MPSGraph, + ) -> None: + mps_node = self.create_unary_node(node, mps_graph, MPSIndexTensor) + tensors = cast(List[torch.fx.Node], node.args[1]) + for tensor in tensors: + mps_node.mpsnode_union.indices_id.append( + self.define_tensor(tensor, mps_graph) + ) + + mps_graph.mps_nodes.append(mps_node) + + +# [MPS TODO]: Works on a single iteration of llama2, but subsequent tokens +# are wrong when using Index put. Disabling it for now. +@register_node_visitor +class IndexPutVisitor(NodeVisitor): + # target = "aten.index_put.default" + target = "disabled" + + def __init__(self, *args) -> None: + super().__init__(*args) + + def infer_sizes(self, a: List[int], b: List[int]): + dimsA = len(a) + dimsB = len(b) + ndim = dimsA if dimsA > dimsB else dimsB + expandedSizes = [0] * ndim + for i in range(ndim - 1, -1, -1): + offset = ndim - 1 - i + dimA = dimsA - 1 - offset + dimB = dimsB - 1 - offset + sizeA = a[dimA] if dimA >= 0 else -1 + sizeB = b[dimB] if dimB >= 0 else -1 + expandedSizes[i] = sizeA if sizeB == -1 else sizeB + + return expandedSizes + + def define_node( + self, + node: torch.fx.Node, + mps_graph: MPSGraph, + ) -> None: + mps_node = self.create_unary_node(node, mps_graph, MPSIndexPut) + updates_shape = get_shape(node.args[2]) + input_shape = get_shape(node.args[0]) + new_shape = [] + if len(updates_shape) != 1 and len(updates_shape) != len(input_shape): + new_shape = self.infer_sizes(input_shape, updates_shape) + mps_node.mpsnode_union.values_shape = new_shape + + tensors = cast(List[torch.fx.Node], node.args[1]) + for tensor in tensors: + mps_node.mpsnode_union.indices_id.append( + self.define_tensor(tensor, mps_graph) + ) + + mps_node.mpsnode_union.values_id = self.define_tensor( + get_input_node(node, 2), mps_graph + ) + mps_graph.mps_nodes.append(mps_node) + + @register_node_visitor class EmbeddingVisitor(NodeVisitor): target = "aten.embedding.default" diff --git a/backends/apple/mps/operators/unary_ops.py b/backends/apple/mps/operators/unary_ops.py index 411924d0406..8b67d7dfba2 100644 --- a/backends/apple/mps/operators/unary_ops.py +++ b/backends/apple/mps/operators/unary_ops.py @@ -30,6 +30,7 @@ MPSLog, MPSLog10, MPSLog2, + MPSLogicalNot, MPSNeg, MPSReciprocal, MPSRound, @@ -79,6 +80,7 @@ class UnaryOpVisitor(NodeVisitor): 
"aten.isnan.default", "aten.isinf.default", "aten.round.default", + "aten.logical_not.default", ] def __init__(self, *args) -> None: @@ -115,6 +117,7 @@ def __init__(self, *args) -> None: exir_ops.edge.aten.isnan.default: MPSIsnan, exir_ops.edge.aten.isinf.default: MPSIsinf, exir_ops.edge.aten.round.default: MPSRound, + exir_ops.edge.aten.logical_not.default: MPSLogicalNot, } def define_node( diff --git a/backends/apple/mps/partition/mps_partitioner.py b/backends/apple/mps/partition/mps_partitioner.py index a06677a59a5..e5497389d14 100644 --- a/backends/apple/mps/partition/mps_partitioner.py +++ b/backends/apple/mps/partition/mps_partitioner.py @@ -4,12 +4,13 @@ # import logging -from typing import Any, Dict, List, Union +from typing import Any, cast, Dict, List, Union import torch from executorch.backends.apple.mps.mps_preprocess import MPSBackend from executorch.backends.apple.mps.operators.node_visitor import get_node_visitors from executorch.backends.apple.mps.utils.mps_utils import is_parameter +from executorch.backends.transforms import get_shape from executorch.exir.backend.backend_details import CompileSpec from executorch.exir.backend.canonical_partitioners.pattern_op_partitioner import ( generate_partitions_from_list_of_nodes, @@ -20,6 +21,7 @@ PartitionResult, ) from executorch.exir.backend.utils import tag_constant_data +from executorch.exir.dialects._ops import ops as exir_ops from torch.export.exported_program import ExportedProgram from torch.fx.passes.infra.partitioner import Partition from torch.fx.passes.operator_support import OperatorSupportBase @@ -28,6 +30,13 @@ logging.basicConfig(level=logging.DEBUG, format=FORMAT) +# ops implemented as Metal kernels. +METAL_KERNELS = [ + exir_ops.edge.aten.index.Tensor, + exir_ops.edge.aten.index_put.default, +] + + class MPSOperatorSupport(OperatorSupportBase): def __init__(self, edge_program: torch.export.ExportedProgram, compiler_specs): self.node_visitors = get_node_visitors(edge_program) @@ -65,10 +74,47 @@ def generate_partitions(self, edge_program: ExportedProgram) -> List[Any]: op_support=self.supported_ops, ) + def mps_graph_advanced_indexing_support(self, node: torch.fx.Node): + num_indices = 0 + tensors = cast(List[torch.fx.Node], node.args[1]) + input = cast(torch.fx.Node, node.args[0]) + for t in tensors: + if t is not None: + num_indices += 1 + # Can dispatch to MPSGraph if the length of the slices is equal + # to the number of dimensions of the sliced tensors, or only one + # slice is present. All other cases will fallback to a Metal kernel. 
+ if num_indices == len(get_shape(input)) or num_indices == 1: + return True + + return False + + def use_metal_kernel(self, node: torch.fx.Node): + if node.target in METAL_KERNELS: + if ( + node.target == exir_ops.edge.aten.index.Tensor + or node.target == exir_ops.edge.aten.index_put.default + ): + if not self.mps_graph_advanced_indexing_support(node): + return True + return False + def tag_nodes(self, partitions: List[Partition]) -> None: for partition in partitions: + crt_partition_counter = 0 for node in partition.nodes: delegation_tag = f"mps_{partition.id}" + if self.use_metal_kernel(node): + logging.warning(f"[WARNING] Using Metal kernel for op {node.name}!") + # Partition the Metal kernel into a separate partition + crt_partition_counter += 1 + delegation_tag = ( + f"{delegation_tag}_metal_kernel_{crt_partition_counter}" + ) + crt_partition_counter += 1 + else: + delegation_tag = f"{delegation_tag}_{crt_partition_counter}" + node.meta["delegation_tag"] = delegation_tag self.partition_tags[delegation_tag] = self.delegation_spec diff --git a/backends/apple/mps/runtime/MPSDevice.h b/backends/apple/mps/runtime/MPSDevice.h index d9ab403e80b..a8b5dbe2b81 100644 --- a/backends/apple/mps/runtime/MPSDevice.h +++ b/backends/apple/mps/runtime/MPSDevice.h @@ -5,10 +5,19 @@ #pragma once +// Obj-C headers #include #include + +// Runtime headers +#include + +// MPS headers #include +#include +#include + #define MB(x) (x * 1048576UL) namespace torch { @@ -25,6 +34,11 @@ enum class MacOSVersion : uint32_t { MACOS_VER_14_0_PLUS, }; +enum class LibraryType : uint32_t { + INDEXING_KERNELS = 0, + MAX = INDEXING_KERNELS, +}; + class MPSDevice { public: /** @@ -53,9 +67,18 @@ class MPSDevice { ~MPSDevice(); + /** + * Compile a PSO for a given library type. + * Once compiled, the library and PSOs are cached. 
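+ * For example, compilePSO(LibraryType::INDEXING_KERNELS, "index_put"), as used by
+ * the index_put path in IndexingOps.mm, compiles the indexing library on first use
+ * and then reuses the cached library and pipeline state on later calls.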
+ */ + Error compilePSO(LibraryType libraryType, const char* kernelName); + Error compileLibrary(LibraryType); + private: static MPSDevice* _device; id _mtl_device; + std::unordered_map> _m_library_cache; + std::unordered_map> _m_pso_cache; MPSDevice(); }; diff --git a/backends/apple/mps/runtime/MPSDevice.mm b/backends/apple/mps/runtime/MPSDevice.mm index 86518fd0025..f51851c3795 100644 --- a/backends/apple/mps/runtime/MPSDevice.mm +++ b/backends/apple/mps/runtime/MPSDevice.mm @@ -16,6 +16,20 @@ static std::unique_ptr mps_device; static std::once_flag mpsdev_init; +static inline MTLLanguageVersion getMetalLanguageVersion(const id& device, bool macOS13Plus) { + // MPS Advanced Indexing needs at least Metal 2.0 (support for Argument Buffers and function constants) + // host_name attribute needs at least Metal 2.2 and ulong needs Metal 2.3 (supported on MacOS 11+) + MTLLanguageVersion languageVersion = MTLLanguageVersion2_3; +#if defined(__MAC_13_0) + if (macOS13Plus) { + languageVersion = MTLLanguageVersion3_0; + } +#endif + + ET_CHECK_MSG([device supportsFamily:MTLGPUFamilyMac2], "Missing Metal support for MTLGPUFamilyMac2"); + return languageVersion; +} + MPSDevice::~MPSDevice() { [_mtl_device release]; _mtl_device = nil; @@ -79,6 +93,57 @@ } } +const char* getLibraryCString(LibraryType libraryType) { + switch (libraryType) { + case LibraryType::INDEXING_KERNELS: + return "TODO"; + default: + ET_CHECK_MSG(false, "Unhandled library type!"); + } +} + +Error +MPSDevice::compileLibrary(LibraryType libraryType) { + Error err = Error::Ok; + NSError* error = nil; + MTLCompileOptions* options = [MTLCompileOptions new]; + [options setLanguageVersion:getMetalLanguageVersion(_mtl_device, isMacOS13Plus(MacOSVersion::MACOS_VER_13_0_PLUS))]; + [options setFastMathEnabled:YES]; + id lib = + [_mtl_device newLibraryWithSource:[NSString stringWithCString:getLibraryCString(libraryType) + encoding:NSASCIIStringEncoding] + options:options + error:&error]; + + ET_CHECK_OR_RETURN_ERROR( + lib != nil, + Internal, + "Failed to create indexing library, error: %s", [[error description] UTF8String] + ); + + _m_library_cache[libraryType] = lib; + return err; +} + +Error +MPSDevice::compilePSO(LibraryType libraryType, const char* kernelName) { + Error err = Error::Ok; + if (_m_library_cache.find(libraryType) == _m_library_cache.end()) { + ET_LOG(Debug, "Compiling library type: %d", libraryType); + err = compileLibrary(libraryType); + ET_CHECK_OR_RETURN_ERROR( + err == Error::Ok, + Internal, + "An error occurred while compiling library %d", libraryType + ); + } + if (_m_pso_cache.find(kernelName) == _m_pso_cache.end()) { + ET_LOG(Debug, "Compiling kernel: %s", kernelName); + // err = compilePSO(libraryType, kernelName); + } + return err; +} + bool isMacOS13OrNewer(MacOSVersion version) { return MPSDevice::getInstance()->isMacOS13Plus(version); } diff --git a/backends/apple/mps/runtime/MPSGraphBuilder.h b/backends/apple/mps/runtime/MPSGraphBuilder.h index 0a7bf835a73..e4e89d68691 100644 --- a/backends/apple/mps/runtime/MPSGraphBuilder.h +++ b/backends/apple/mps/runtime/MPSGraphBuilder.h @@ -109,6 +109,7 @@ class MPSGraphBuilder { _DEFINE_MPS_OP(Isnan); _DEFINE_MPS_OP(Isinf); _DEFINE_MPS_OP(Round); + _DEFINE_MPS_OP(LogicalNot); _DEFINE_MPS_OP(NormCdf); // Clamp ops _DEFINE_MPS_OP(Clamp); @@ -120,6 +121,8 @@ class MPSGraphBuilder { // Indexing ops _DEFINE_MPS_OP(IndexSelect); _DEFINE_MPS_OP(Embedding); + _DEFINE_MPS_OP(IndexTensor); + _DEFINE_MPS_OP(IndexPut); // Linear algebra ops _DEFINE_MPS_OP(MatMul);
_DEFINE_MPS_OP(Addmm); @@ -153,6 +156,7 @@ class MPSGraphBuilder { // Helper functions Error addNodeToMPSGraph(NodePtr nodePtr); + Error compileMetalKernel(NodePtr nodePtr); MPSShape *getMPSShape(int32_t id); MPSShape *getMPSShape(const flatbuffers::Vector *shape); int64_t numel(const flatbuffers::Vector *shape); @@ -161,6 +165,8 @@ class MPSGraphBuilder { MPSGraphTensor *getMPSGraphTensor(int32_t id); NSData *getConstantData(int32_t id); std::pair getMinMaxValues(NodePtr nodePtr); + Error compileMPSGraph(); + Error compileMetalKernel(); // Each MPSGraph op result in at least MPSGraphTensor being // produced, which will be stored in this structure. Other ops @@ -172,6 +178,7 @@ class MPSGraphBuilder { // FlatBuffer raw bytes of the serialized MPS model. const void *_buffer_pointer; + bool _metal_kernel; MPSGraph *_mpsGraph; MPSGraphExecutable *_mpsGraphExecutable; NSMutableDictionary *_feeds; diff --git a/backends/apple/mps/runtime/MPSGraphBuilder.mm b/backends/apple/mps/runtime/MPSGraphBuilder.mm index d82b677066f..8b571001d42 100644 --- a/backends/apple/mps/runtime/MPSGraphBuilder.mm +++ b/backends/apple/mps/runtime/MPSGraphBuilder.mm @@ -17,6 +17,7 @@ _targetTensors = [NSMutableArray new]; _mpsGraphExecutable = nil; + _metal_kernel = false; } Error @@ -32,8 +33,34 @@ mpsgraph::MPSGraphIdentifier()); _flatBufferGraph = mpsgraph::GetMPSGraph(_buffer_pointer); - _idToMPSGraphTensor.resize(_flatBufferGraph->mps_values()->size(), nullptr); + switch (_flatBufferGraph->graph_type()) { + case mpsgraph::OpType::metal_kernel: + { + _metal_kernel = true; + err = compileMetalKernel(); + break; + } + case mpsgraph::OpType::mps_graph: + { + err = compileMPSGraph(); + break; + } + default: + ET_CHECK_OR_RETURN_ERROR( + false, + DelegateInvalidCompatibility, + "Received an invalid operation type: expected MPSGraph or metal kernel, but got: %s", + EnumNameOpType(_flatBufferGraph->graph_type())); + } + + return err; +} +Error +MPSGraphBuilder::compileMPSGraph() { + Error err = Error::Ok; + + _idToMPSGraphTensor.resize(_flatBufferGraph->mps_values()->size(), nullptr); // Add the placeholder nodes to the graph. 
for (auto in_id : *_flatBufferGraph->input_ids()) { err = mpsGraphRankedPlaceholder(in_id); @@ -71,6 +98,30 @@ return err; } +Error +MPSGraphBuilder::compileMetalKernel() { + Error err = Error::Ok; + + ET_CHECK_OR_RETURN_ERROR( + _flatBufferGraph->mps_nodes()->size() == 1, + DelegateInvalidCompatibility, + "Currently supporting dispatching a single Metal kernel."); + ET_CHECK_OR_RETURN_ERROR( + _flatBufferGraph->constant_ids()->size() == 0, + DelegateInvalidCompatibility, + "Currently not supporting dispatching Metal kernels with constants."); + + // Compile the corresponding Metal kernel + for (auto node : *_flatBufferGraph->mps_nodes()) { + err = compileMetalKernel(node); + if (err != Error::Ok) { + return err; + } + } + + return err; +} + Error MPSGraphBuilder::mpsGraphRankedPlaceholder(int32_t id) { ET_LOG(Debug, "%s: %d", __FUNCTION__, id); diff --git a/backends/apple/mps/runtime/operations/IndexingOps.mm b/backends/apple/mps/runtime/operations/IndexingOps.mm index 1c02cbea5c4..b4dcf192b46 100644 --- a/backends/apple/mps/runtime/operations/IndexingOps.mm +++ b/backends/apple/mps/runtime/operations/IndexingOps.mm @@ -108,6 +108,102 @@ return Error::Ok; } +Error +MPSGraphBuilder::mpsIndexTensorOp(NodePtr nodePtr) { + Error err = Error::Ok; + auto graphNode = nodePtr->mpsnode_union_as_MPSIndexTensor(); + ET_LOG( + Debug, "%s: %d -> %d", + __FUNCTION__, graphNode->input1_id(), graphNode->output_id() + ); + + if (_metal_kernel) { + err = MPSDevice::getInstance()->compilePSO(LibraryType::INDEXING_KERNELS, "index_select"); + ET_CHECK_MSG(false, "Metal kernel path not yet implemented\n"); + } else { + int validIndices = 0; + int numIndices = graphNode->indices_id()->size(); + int axis = -1; + int indexId = -1; + for (int i = 0; i < numIndices; i++) { + int32_t index_id = graphNode->indices_id()->Get(i); + if (index_id == -1) { + continue; + } + validIndices++; + axis = i; + indexId = index_id; + } + ET_LOG(Debug, "index.Tensor with %d indices (axis = %d)", validIndices, axis); + ET_CHECK(validIndices > 0); + + if (validIndices == 1) { + MPSGraphTensor* updatesTensor = getMPSGraphTensor(graphNode->input1_id()); + MPSGraphTensor* indexTensor = getMPSGraphTensor(indexId); + _idToMPSGraphTensor[graphNode->output_id()] = + [_mpsGraph gatherWithUpdatesTensor:updatesTensor indicesTensor:indexTensor axis:axis batchDimensions:0 name:nil]; + } else { + ET_CHECK_MSG(false, "Not yet implemented"); + } + } + + return err; +} + +Error +MPSGraphBuilder::mpsIndexPutOp(NodePtr nodePtr) { + Error err = Error::Ok; + auto graphNode = nodePtr->mpsnode_union_as_MPSIndexPut(); + ET_LOG( + Debug, "%s: %d -> %d", + __FUNCTION__, graphNode->input1_id(), graphNode->output_id() + ); + + if (_metal_kernel) { + err = MPSDevice::getInstance()->compilePSO(LibraryType::INDEXING_KERNELS, "index_put"); + ET_CHECK_MSG(false, "Metal kernel path not yet implemented\n"); + } else { + int validIndices = 0; + int numIndices = graphNode->indices_id()->size(); + int axis = -1; + int indexId = -1; + for (int i = 0; i < numIndices; i++) { + int32_t index_id = graphNode->indices_id()->Get(i); + if (index_id == -1) { + continue; + } + validIndices++; + axis = i; + indexId = index_id; + } + ET_LOG(Debug, "index_put with %d indices (axis = %d)", validIndices, axis); + ET_CHECK(validIndices > 0); + + if (validIndices == 1) { + MPSGraphTensor* dataTensor = getMPSGraphTensor(graphNode->input1_id()); + MPSGraphTensor* updatesTensor = getMPSGraphTensor(graphNode->values_id()); + MPSGraphTensor* indicesTensor = getMPSGraphTensor(indexId); + if 
(graphNode->values_shape()->size() != 0) { + updatesTensor = [_mpsGraph broadcastTensor:updatesTensor + toShape:getMPSShape(graphNode->values_shape()) + name:nil]; + } + + _idToMPSGraphTensor[graphNode->output_id()] = + [_mpsGraph scatterWithDataTensor:dataTensor + updatesTensor:updatesTensor + indicesTensor:indicesTensor + axis:axis + mode:MPSGraphScatterModeSet + name:nil]; + } else { + ET_CHECK_MSG(false, "Not yet implemented"); + } + } + + return err; +} + } // namespace delegate } // namespace mps } // namespace executor diff --git a/backends/apple/mps/runtime/operations/OperationUtils.mm b/backends/apple/mps/runtime/operations/OperationUtils.mm index 71c36c967ef..648421ee2cd 100644 --- a/backends/apple/mps/runtime/operations/OperationUtils.mm +++ b/backends/apple/mps/runtime/operations/OperationUtils.mm @@ -166,6 +166,7 @@ _DEFINE_MPS_NODE(Isnan); _DEFINE_MPS_NODE(Isinf); _DEFINE_MPS_NODE(Round); + _DEFINE_MPS_NODE(LogicalNot); // Clamp ops _DEFINE_MPS_NODE(Clamp); _DEFINE_MPS_NODE(Where); @@ -178,6 +179,8 @@ //Indexing ops _DEFINE_MPS_NODE(IndexSelect); _DEFINE_MPS_NODE(Embedding); + _DEFINE_MPS_NODE(IndexTensor); + _DEFINE_MPS_NODE(IndexPut); // Reduce ops _DEFINE_MPS_NODE(Mean); // Shape ops @@ -223,6 +226,11 @@ } } +Error +MPSGraphBuilder::compileMetalKernel(NodePtr nodePtr) { + return addNodeToMPSGraph(nodePtr); +} + #undef _DEFINE_MPS_NODE MPSGraphTensor* diff --git a/backends/apple/mps/runtime/operations/ShapeOps.mm b/backends/apple/mps/runtime/operations/ShapeOps.mm index 720161b955d..75de566e4ad 100644 --- a/backends/apple/mps/runtime/operations/ShapeOps.mm +++ b/backends/apple/mps/runtime/operations/ShapeOps.mm @@ -42,13 +42,9 @@ __FUNCTION__, graphNode->input1_id(), graphNode->output_id() ); - NSMutableArray* shape = [NSMutableArray array]; - for (int32_t i = 0; i < graphNode->num_dims(); i++) { - [shape addObject:[NSNumber numberWithInteger:graphNode->shape()->Get(i)]]; - } _idToMPSGraphTensor[graphNode->output_id()] = [_mpsGraph reshapeTensor:getMPSGraphTensor(graphNode->input1_id()) - withShape:shape + withShape:getMPSShape(graphNode->shape()) name:@"view_copy"]; return Error::Ok; @@ -91,7 +87,7 @@ __FUNCTION__, graphNode->output_id() ); - NSMutableArray* inputTensors = [NSMutableArray array]; + NSMutableArray* inputTensors = [NSMutableArray arrayWithCapacity:graphNode->input_ids()->size()];; for (auto id : *graphNode->input_ids()) { MPSGraphTensor* catTensor = getMPSGraphTensor(id); if (catTensor != nil) diff --git a/backends/apple/mps/runtime/operations/UnaryOps.mm b/backends/apple/mps/runtime/operations/UnaryOps.mm index 31246bd44f2..ed06584b271 100644 --- a/backends/apple/mps/runtime/operations/UnaryOps.mm +++ b/backends/apple/mps/runtime/operations/UnaryOps.mm @@ -92,6 +92,7 @@ REGISTER_UNARY_OP(Isnan, isNaN) REGISTER_UNARY_OP(Isinf, isInfinite) REGISTER_UNARY_OP(Round, round) +REGISTER_UNARY_OP(LogicalNot, not) Error diff --git a/backends/apple/mps/serialization/mps_graph_schema.py b/backends/apple/mps/serialization/mps_graph_schema.py index 66697b04b7d..8134091a01d 100644 --- a/backends/apple/mps/serialization/mps_graph_schema.py +++ b/backends/apple/mps/serialization/mps_graph_schema.py @@ -27,6 +27,11 @@ class MPSDataType(IntEnum): mps_data_type_complex_float32 = 11 +class OpType(IntEnum): + mps_graph = 0 + metal_kernel = 1 + + @dataclass class MPSNode1x1: input1_id: int @@ -359,6 +364,11 @@ class MPSRound(MPSNode1x1): pass +@dataclass +class MPSLogicalNot(MPSNode1x1): + pass + + @dataclass class MPSBitwise(MPSNode1x1): pass @@ -434,6 +444,18 @@ class 
MPSEmbedding(MPSNode2x1): sparse: bool = False +@dataclass +class MPSIndexTensor(MPSNode1x1): + indices_id: List[int] = field(default_factory=list) + + +@dataclass +class MPSIndexPut(MPSNode1x1): + indices_id: List[int] = field(default_factory=list) + values_shape: List[int] = field(default_factory=list) + values_id: int = -1 + + ## ## Shape ops ## @@ -664,6 +686,7 @@ class MPSArange: MPSIsnan, MPSIsinf, MPSRound, + MPSLogicalNot, # Linear algebra ops MPSMatMul, MPSAddmm, @@ -678,6 +701,8 @@ class MPSArange: # Indexing ops MPSIndexSelect, MPSEmbedding, + MPSIndexTensor, + MPSIndexPut, # Shape ops MPSPermute, MPSView, @@ -741,3 +766,4 @@ class MPSGraph: input_ids: List[int] output_ids: List[int] constant_ids: List[int] + graph_type: OpType diff --git a/backends/apple/mps/serialization/schema.fbs b/backends/apple/mps/serialization/schema.fbs index c3e3eaa4faf..6ba2c937f32 100644 --- a/backends/apple/mps/serialization/schema.fbs +++ b/backends/apple/mps/serialization/schema.fbs @@ -24,6 +24,13 @@ enum MPSDataType : short { mps_data_type_complex_float32 = 11, } +// ops like index.Tensor and index.put are currentely implemented as +// Metal kernels for unsupported MPSGraph cases. +enum OpType : short { + mps_graph, + metal_kernel +} + // Helper classes to define the number of input and output tensors for a node. // Not meant to be used directly. @@ -145,6 +152,20 @@ table MPSEmbedding { sparse:bool; } +table MPSIndexTensor { + input1_id:int; + indices_id:[int]; + output_id:int; +} + +table MPSIndexPut { + input1_id:int; + indices_id:[int]; + values_shape:[int]; + values_id:int; + output_id:int; +} + // Shape ops. table MPSPermute { input1_id:int; @@ -350,6 +371,7 @@ union MPSNodeUnion { MPSIsnan: _MPSNode1x1, MPSIsinf: _MPSNode1x1, MPSRound: _MPSNode1x1, + MPSLogicalNot: _MPSNode1x1, // Linear algebra ops MPSMatMul: _MPSNode2x1, @@ -366,6 +388,8 @@ union MPSNodeUnion { // Indexing ops MPSIndexSelect, MPSEmbedding, + MPSIndexTensor, + MPSIndexPut, // Reduce ops MPSMean, @@ -438,6 +462,8 @@ table MPSGraph { input_ids:[int]; output_ids:[int]; constant_ids:[int]; + + graph_type:OpType; } root_type MPSGraph; diff --git a/backends/apple/mps/setup.md b/backends/apple/mps/setup.md index 9c2222bb6de..c8fdfeb98e4 100644 --- a/backends/apple/mps/setup.md +++ b/backends/apple/mps/setup.md @@ -15,15 +15,28 @@ The MPS backend device maps machine learning computational graphs and primitives * [Introduction to ExecuTorch](intro-how-it-works.md) * [Setting up ExecuTorch](getting-started-setup.md) * [Building ExecuTorch with CMake](runtime-build-and-cross-compilation.md) +* [ExecuTorch iOS Demo App](demo-apps-ios.md) +* [ExecuTorch iOS LLaMA Demo App](llm/llama-demo-ios.md) ::: :::: ## Prerequisites (Hardware and Software) -In order to be able to successfully build and run a model using the MPS backend for ExecuTorch, you'll need the following hardware and software components. 
- - macOS 12 / iOS 15 or later (for MPS runtime) - - Xcode command-line tools: xcode-select --install +In order to be able to successfully build and run a model using the MPS backend for ExecuTorch, you'll need the following hardware and software components: + +### Hardware: + - A [mac](https://www.apple.com/mac/) for tracing the model + +### Software: + + - **Ahead of time** tracing: + - [macOS](https://www.apple.com/macos/) 12 + + - **Runtime**: + - [macOS](https://www.apple.com/macos/) >= 12.4 + - [iOS](https://www.apple.com/ios) >= 15.4 + - [Xcode](https://developer.apple.com/xcode/) >= 14.1 ## Setting up Developer Environment @@ -40,47 +53,34 @@ In order to be able to successfully build and run a model using the MPS backend ### AOT (Ahead-of-time) Components **Compiling model for MPS delegate**: -- In this step, you will generate a simple ExecuTorch program that lowers MobileNetV3 model to the MPS delegate. You'll then pass this Program(the `.pte` file) during the runtime to run it using the MPS backend. +- In this step, you will generate a simple ExecuTorch program that lowers MobileNetV3 model to the MPS delegate. You'll then pass this Program (the `.pte` file) during the runtime to run it using the MPS backend. ```bash cd executorch -python3 -m examples.apple.mps.scripts.mps_example --model_name="mv3" --bundled +# Note: `mps_example` script uses by default the MPSPartitioner for ops that are not yet supported by the MPS delegate. To turn it off, pass `--no-use_partitioner`. +python3 -m examples.apple.mps.scripts.mps_example --model_name="mv3" --bundled --use_fp16 + +# To see all options, run following command: +python3 -m examples.apple.mps.scripts.mps_example --help ``` ### Runtime -**Building the MPS executor runner** -- In this step, you'll be building the `mps_executor_runner` that is able to run MPS lowered modules. - +**Building the MPS executor runner:** ```bash -# Build the mps_executor_runner +# In this step, you'll be building the `mps_executor_runner` that is able to run MPS lowered modules: +cd executorch +./examples/apple/mps/scripts/build_mps_executor_runner.sh +``` + +## Run the mv3 generated model using the mps_executor_runner + ```bash -# Build and install executorch -cmake -DBUCK2="$BUCK" \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_SDK=ON \ - -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ - -DEXECUTORCH_BUILD_MPS=ON \ - -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ - -Bcmake-out . -cmake --build cmake-out -j9 --target install --config Release -CMAKE_PREFIX_PATH="${PWD}/cmake-out/lib/cmake/ExecuTorch;${PWD}/cmake-out/third-party/gflags" -# build mps_executor_runner -rm -rf cmake-out/examples/apple/mps -cmake \ - -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \ - -DCMAKE_BUILD_TYPE=Release \ - -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ - -Bcmake-out/examples/apple/mps \ - examples/apple/mps - -cmake --build cmake-out/examples/apple/mps -j9 --config Release - -# Run the mv2 generated model using the mps_executor_runner ./cmake-out/examples/apple/mps/mps_executor_runner --model_path mv3_mps_bundled_fp16.pte --bundled_program +``` -# You should see the following results. Note that no output file will be generated in this example: +- You should see the following results. Note that no output file will be generated in this example: +``` I 00:00:00.003290 executorch:mps_executor_runner.mm:286] Model file mv3_mps_bundled_fp16.pte is loaded. 
I 00:00:00.003306 executorch:mps_executor_runner.mm:292] Program methods: 1 I 00:00:00.003308 executorch:mps_executor_runner.mm:294] Running method forward @@ -94,12 +94,43 @@ I 00:00:00.118731 executorch:mps_executor_runner.mm:438] Model executed successf I 00:00:00.122615 executorch:mps_executor_runner.mm:501] Model verified successfully. ``` +### [Optional] Run the generated model directly using pybind +1. Make sure `pybind` MPS support was installed: +```bash +./install_requirements.sh --pybind mps +``` +2. Run the `mps_example` script to trace the model and run it directly from python: +```bash +cd executorch +# Check correctness between PyTorch eager forward pass and ExecuTorch MPS delegate forward pass +python3 -m examples.apple.mps.scripts.mps_example --model_name="mv3" --no-use_fp16 --check_correctness +# You should see following output: `Results between ExecuTorch forward pass with MPS backend and PyTorch forward pass for mv3_mps are matching!` + +# Check performance between PyTorch MPS forward pass and ExecuTorch MPS forward pass +python3 -m examples.apple.mps.scripts.mps_example --model_name="mv3" --no-use_fp16 --bench_pytorch +``` + +### Profiling: +1. [Optional] Generate an [ETRecord](./sdk-etrecord.rst) while you're exporting your model. +```bash +cd executorch +python3 -m examples.apple.mps.scripts.mps_example --model_name="mv3" --generate_etrecord -b +``` +2. Run your Program on the ExecuTorch runtime and generate an [ETDump](./sdk-etdump.md). +``` +./cmake-out/examples/apple/mps/mps_executor_runner --model_path mv3_mps_bundled_fp16.pte --bundled_program --dump-outputs +``` +3. Create an instance of the Inspector API by passing in the ETDump you have sourced from the runtime along with the optionally generated ETRecord from step 1. +```bash +python3 -m sdk.inspector.inspector_cli --etdump_path etdump.etdp --etrecord_path etrecord.bin +``` + ## Deploying and Running on Device ***Step 1***. Create the ExecuTorch core and MPS delegate frameworks to link on iOS ```bash cd executorch -./build/build_apple_frameworks.sh --Release --mps +./build/build_apple_frameworks.sh --mps ``` `mps_delegate.xcframework` will be in `cmake-out` folder, along with `executorch.xcframework` and `portable_delegate.xcframework`: @@ -123,4 +154,4 @@ In this tutorial, you have learned how to lower a model to the MPS delegate, bui ## Frequently encountered errors and resolution. -If you encountered any bugs or issues following this tutorial please file a bug/issue on the ExecuTorch repository, with hashtag **#mps**. +If you encountered any bugs or issues following this tutorial please file a bug/issue on the [ExecuTorch repository](https://github.com/pytorch/executorch/issues), with hashtag **#mps**. 
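To complement the `mps_example` command shown in the AOT section above, the following is a minimal sketch of how a model could be lowered to the MPS delegate directly from Python. It mirrors the Core ML partitioner snippet earlier in this patch; the `MPSPartitioner` constructor signature and the `use_fp16` compile-spec key are assumptions inferred from the `--use_fp16` flag, not verified API, so consult `examples/apple/mps/scripts/mps_example.py` for the exact calls in your release.

```python
# Hedged sketch: lower a toy model to the MPS delegate and write a .pte file.
# Assumptions: MPSPartitioner takes a list of CompileSpec objects, and
# "use_fp16" is a recognized compile-spec key (based on the --use_fp16 flag).
import torch
from torch.export import export

from executorch.backends.apple.mps.partition.mps_partitioner import MPSPartitioner
from executorch.exir import to_edge
from executorch.exir.backend.backend_details import CompileSpec


class TinyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(4, 4)

    def forward(self, x):
        return self.linear(x)


model = TinyModel().eval()
example_inputs = (torch.randn(2, 4),)

# Export -> Edge dialect -> delegate supported subgraphs to the MPS backend.
edge = to_edge(export(model, example_inputs))
edge = edge.to_backend(MPSPartitioner([CompileSpec("use_fp16", bytes([True]))]))

# Serialize the ExecuTorch program so it can be run with mps_executor_runner.
with open("tiny_mps.pte", "wb") as f:
    f.write(edge.to_executorch().buffer)
```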
diff --git a/backends/apple/mps/targets.bzl b/backends/apple/mps/targets.bzl index 94f030310db..4d2862eb727 100644 --- a/backends/apple/mps/targets.bzl +++ b/backends/apple/mps/targets.bzl @@ -22,6 +22,7 @@ def define_common_targets(is_xplat = False, platforms = []): "-Wno-unused-const-variable", "-Wno-unused-variable", "-fno-objc-arc", + "-std=c++17", ], "deps": [ "//executorch/runtime/core:core", diff --git a/backends/apple/mps/test/test_mps.py b/backends/apple/mps/test/test_mps.py index 691081d35de..5ca9d0175e9 100644 --- a/backends/apple/mps/test/test_mps.py +++ b/backends/apple/mps/test/test_mps.py @@ -677,188 +677,6 @@ def forward(self, x): const_module, model_inputs, func_name=inspect.stack()[0].function[5:] ) - def test_mps_constant_add(self): - class Module(torch.nn.Module): - def __init__(self): - super().__init__() - self._constant = torch.ones(4, 4, 4) - - def forward(self, x): - out1 = x + self._constant - out2 = x + self._constant + self._constant - return out1, out2 - - const_module = Module() - model_inputs = (torch.randn(4, 4, 4),) - - self.lower_and_test_with_partitioner( - const_module, model_inputs, func_name=inspect.stack()[0].function[5:] - ) - - def test_mps_mul_scalar_float(self): - class MulScalarModule(torch.nn.Module): - def __init__(self): - super().__init__() - self._scalar = 3.14 - - def forward(self, x): - out1 = torch.ops.aten.mul.Scalar(x, self._scalar) - return out1 - - mul_scalar_module = MulScalarModule() - model_inputs = (torch.randn(4, 4, 4),) - - self.lower_and_test_with_partitioner( - mul_scalar_module, model_inputs, func_name=inspect.stack()[0].function[5:] - ) - - def test_mps_mul_scalar_int(self): - class MulScalarModule(torch.nn.Module): - def __init__(self): - super().__init__() - self._scalar = 3 - - def forward(self, x): - out1 = torch.ops.aten.mul.Scalar(x, self._scalar) - return out1 - - mul_scalar_module = MulScalarModule() - model_inputs = (torch.randint(11, (4, 4, 4)),) - - self.lower_and_test_with_partitioner( - mul_scalar_module, model_inputs, func_name=inspect.stack()[0].function[5:] - ) - - def test_mps_backend_add_1(self): - class AddModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y): - z = torch.add(x, y, alpha=0.1) - return z - - add_module = AddModule() - model_inputs = (torch.randn(1), torch.randn(1)) - - self.lower_and_test_with_partitioner( - add_module, model_inputs, func_name=inspect.stack()[0].function[5:] - ) - - def test_mps_backend_add_2(self): - class AddModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - z = torch.ops.aten.add.Scalar(x, 2.0) - return z - - add_module = AddModule() - model_inputs = (torch.randn(2, 5),) - - self.lower_and_test_with_partitioner( - add_module, model_inputs, func_name=inspect.stack()[0].function[5:] - ) - - def test_mps_backend_add_3(self): - class AddModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y): - z = torch.add(x, y) - return z - - add_module = AddModule() - model_inputs = (torch.randn(1), torch.randn(1)) - - self.lower_and_test_with_partitioner( - add_module, model_inputs, func_name=inspect.stack()[0].function[5:] - ) - - def test_mps_backend_sub_1(self): - class SubModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y): - z = torch.sub(x, y, alpha=0.1) - return z - - sub_module = SubModule() - model_inputs = (torch.randn(1), torch.randn(1)) - - self.lower_and_test_with_partitioner( - sub_module, model_inputs, 
func_name=inspect.stack()[0].function[5:] - ) - - def test_mps_backend_sub_2(self): - class SubModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - z = torch.ops.aten.sub.Scalar(x, 2.0) - return z - - sub_module = SubModule() - model_inputs = (torch.randn(2, 5),) - - self.lower_and_test_with_partitioner( - sub_module, model_inputs, func_name=inspect.stack()[0].function[5:] - ) - - def test_mps_backend_sub_3(self): - class SubModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y): - z = torch.sub(x, y) - return z - - sub_module = SubModule() - model_inputs = (torch.randn(1), torch.randn(1)) - - self.lower_and_test_with_partitioner( - sub_module, model_inputs, func_name=inspect.stack()[0].function[5:] - ) - - def test_mps_backend_add_scalar_float(self): - class AddScalarModule(torch.nn.Module): - def __init__(self): - super().__init__() - self._scalar_float = 3.14 - - def forward(self, x): - out = torch.ops.aten.add.Scalar(x, self._scalar_float) - return out - - add_scalar_module = AddScalarModule() - model_inputs = (torch.randn(4, 4, 4),) - - self.lower_and_test_with_partitioner( - add_scalar_module, model_inputs, func_name=inspect.stack()[0].function[5:] - ) - - def test_mps_backend_add_scalar_int(self): - class AddScalarModule(torch.nn.Module): - def __init__(self): - super().__init__() - self._scalar_int = 3 - - def forward(self, x): - out1 = torch.ops.aten.add.Scalar(x, self._scalar_int) - return out1 - - add_scalar_module = AddScalarModule() - model_inputs = (torch.randint(11, (4, 4, 4), dtype=torch.int32),) - - self.lower_and_test_with_partitioner( - add_scalar_module, model_inputs, func_name=inspect.stack()[0].function[5:] - ) - def test_mps_backend_logit_1(self): class LogitModule(torch.nn.Module): def __init__(self): @@ -891,22 +709,6 @@ def forward(self, x): logit_module, model_inputs, func_name=inspect.stack()[0].function[5:] ) - def test_mps_backend_div(self): - class DivModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y): - z = x / y - return z - - div_module = DivModule() - model_inputs = (torch.ones(1), torch.ones(1)) - - self.lower_and_test_with_partitioner( - div_module, model_inputs, func_name=inspect.stack()[0].function[5:] - ) - def test_mps_backend_round(self): class RoundModule(torch.nn.Module): def __init__(self): @@ -923,36 +725,6 @@ def forward(self, x): module, model_inputs, func_name=inspect.stack()[0].function[5:] ) - def test_mps_backend_fmod(self): - class FModModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y): - return torch.fmod(x, y) - - module = FModModule() - model_inputs = (torch.randn(2, 3, 4), torch.randn(2, 3, 4)) - - self.lower_and_test_with_partitioner( - module, model_inputs, func_name=inspect.stack()[0].function[5:] - ) - - def test_mps_backend_floor_divide(self): - class FloorDivideModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y): - return torch.floor_divide(x, y) - - module = FloorDivideModule() - model_inputs = (torch.randn(2, 3, 4), torch.randn(2, 3, 4)) - - self.lower_and_test_with_partitioner( - module, model_inputs, func_name=inspect.stack()[0].function[5:] - ) - def test_mps_backend_amax(self): class AmaxModule(torch.nn.Module): def __init__(self): @@ -1331,6 +1103,149 @@ def forward(self, x): module, model_inputs, func_name=inspect.stack()[0].function[5:] ) + def test_mps_indexing_get_1(self): + class 
IndexGet(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x[[0, 1, 2], [0, 1, 0]] + + module = IndexGet() + model_inputs = (torch.tensor([[1, 2], [3, 4], [5, 6]]),) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_mps_indexing_get_2(self): + class IndexGet(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x[:, [0, 4, 2]] + + module = IndexGet() + model_inputs = (torch.randn(5, 7, 3),) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_mps_indexing_get_3(self): + class IndexGet(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x[:, [[0, 1], [4, 3]]] + + module = IndexGet() + model_inputs = (torch.randn(5, 7, 3),) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_mps_indexing_get_4(self): + class IndexGet(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x[[0, 4, 2]] + + module = IndexGet() + model_inputs = (torch.randn(5, 7, 3),) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_mps_indexing_get_5(self): + class IndexGet(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x[[0, 2, 1], :, 0] + + module = IndexGet() + model_inputs = (torch.ones(3, 2, 4),) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_mps_indices2d(self): + class IndexGet(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, rows, columns): + return x[rows, columns] + + module = IndexGet() + x = torch.arange(0, 12).resize(4, 3) + rows = torch.tensor([[0, 0], [3, 3]]) + columns = torch.tensor([[0, 2], [0, 2]]) + model_inputs = ( + x, + rows, + columns, + ) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_mps_slicing_using_advanced_index_for_column_0(self): + class IndexGet(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x[1:4] + + module = IndexGet() + model_inputs = (torch.tensor([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]]),) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_mps_slicing_using_advanced_index_for_column_1(self): + class IndexGet(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + # using advanced index for column + return x[1:4, [1, 2]] + + module = IndexGet() + model_inputs = (torch.tensor([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]]),) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_boolean_array_indexing(self): + class IndexGet(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x[x > 5] + + module = IndexGet() + model_inputs = (torch.tensor([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]]),) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + def test_mps_backend_isinf(self): class IsInfModule(torch.nn.Module): def __init__(self): diff --git 
a/backends/apple/mps/test/test_mps_binary_ops.py b/backends/apple/mps/test/test_mps_binary_ops.py new file mode 100644 index 00000000000..fdf2d1fbb94 --- /dev/null +++ b/backends/apple/mps/test/test_mps_binary_ops.py @@ -0,0 +1,296 @@ +# +# Copyright (c) 2024 Apple Inc. All rights reserved. +# Provided subject to the LICENSE file in the top level directory. +# + +import inspect + +import torch +from executorch.backends.apple.mps.test.test_mps_utils import TestMPS + + +class TestMPSAdd(TestMPS): + class Add(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, y): + z = x + y + z = z + x + z = z + x + z = z + z + return z + + class Add2(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + z = x + x + return z + + class AddConstant(torch.nn.Module): + def __init__(self, constant): + super().__init__() + self._constant1 = constant + self.register_buffer("_constant2", constant, persistent=False) + self.register_parameter("_constant3", torch.nn.Parameter(constant)) + + def forward(self, x): + out1 = x + self._constant1 + torch.ones(1, 1, 1) + out2 = x + self._constant2 + self._constant3 + return out1, out2 + + def test_fp16_add(self): + inputs = (torch.ones(1).to(torch.float16), torch.ones(1).to(torch.float16)) + self.lower_and_test_with_partitioner( + self.Add(), inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_fp32_add(self): + inputs = (torch.ones(1), torch.ones(1)) + self.lower_and_test_with_partitioner( + self.Add(), inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_fp32_add_constant(self): + inputs = (torch.randn(4, 4, 4),) + self.lower_and_test_with_partitioner( + self.AddConstant(torch.ones(4, 4, 4)), + inputs, + func_name=inspect.stack()[0].function[5:], + ) + + def test_add_w_alpha(self): + class AddModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, y): + z = torch.add(x, y, alpha=0.1) + return z + + add_module = AddModule() + model_inputs = (torch.randn(1), torch.randn(1)) + + self.lower_and_test_with_partitioner( + add_module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_add_scalar(self): + class AddModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + z = torch.ops.aten.add.Scalar(x, 2.0) + return z + + add_module = AddModule() + model_inputs = (torch.randn(2, 5),) + + self.lower_and_test_with_partitioner( + add_module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_add_scalar_int(self): + class AddScalarModule(torch.nn.Module): + def __init__(self): + super().__init__() + self._scalar_int = 3 + + def forward(self, x): + out1 = torch.ops.aten.add.Scalar(x, self._scalar_int) + return out1 + + add_scalar_module = AddScalarModule() + model_inputs = (torch.randint(11, (4, 4, 4), dtype=torch.int32),) + + self.lower_and_test_with_partitioner( + add_scalar_module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_add_without_alpha(self): + class AddModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, y): + z = torch.add(x, y) + return z + + add_module = AddModule() + model_inputs = (torch.randn(1), torch.randn(1)) + + self.lower_and_test_with_partitioner( + add_module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_add_scalar_float(self): + class AddScalarModule(torch.nn.Module): + def __init__(self): + super().__init__() + self._scalar_float = 3.14 + + def 
forward(self, x): + out = torch.ops.aten.add.Scalar(x, self._scalar_float) + return out + + add_scalar_module = AddScalarModule() + model_inputs = (torch.randn(4, 4, 4),) + + self.lower_and_test_with_partitioner( + add_scalar_module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_constant_add(self): + class Module(torch.nn.Module): + def __init__(self): + super().__init__() + self._constant = torch.ones(4, 4, 4) + + def forward(self, x): + out1 = x + self._constant + out2 = x + self._constant + self._constant + return out1, out2 + + const_module = Module() + model_inputs = (torch.randn(4, 4, 4),) + + self.lower_and_test_with_partitioner( + const_module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + +class TestMPSSub(TestMPS): + def test_mps_backend_sub_1(self): + class SubModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, y): + z = torch.sub(x, y, alpha=0.1) + return z + + sub_module = SubModule() + model_inputs = (torch.randn(1), torch.randn(1)) + + self.lower_and_test_with_partitioner( + sub_module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_mps_backend_sub_2(self): + class SubModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + z = torch.ops.aten.sub.Scalar(x, 2.0) + return z + + sub_module = SubModule() + model_inputs = (torch.randn(2, 5),) + + self.lower_and_test_with_partitioner( + sub_module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_mps_backend_sub_3(self): + class SubModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, y): + z = torch.sub(x, y) + return z + + sub_module = SubModule() + model_inputs = (torch.randn(1), torch.randn(1)) + + self.lower_and_test_with_partitioner( + sub_module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + +class TestMPSMul(TestMPS): + def test_mps_mul_scalar_float(self): + class MulScalarModule(torch.nn.Module): + def __init__(self): + super().__init__() + self._scalar = 3.14 + + def forward(self, x): + out1 = torch.ops.aten.mul.Scalar(x, self._scalar) + return out1 + + mul_scalar_module = MulScalarModule() + model_inputs = (torch.randn(4, 4, 4),) + + self.lower_and_test_with_partitioner( + mul_scalar_module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_mps_mul_scalar_int(self): + class MulScalarModule(torch.nn.Module): + def __init__(self): + super().__init__() + self._scalar = 3 + + def forward(self, x): + out1 = torch.ops.aten.mul.Scalar(x, self._scalar) + return out1 + + mul_scalar_module = MulScalarModule() + model_inputs = (torch.randint(11, (4, 4, 4)),) + + self.lower_and_test_with_partitioner( + mul_scalar_module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + +class TestMPSDiv(TestMPS): + def test_mps_backend_div(self): + class DivModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, y): + z = x / y + return z + + div_module = DivModule() + model_inputs = (torch.ones(1), torch.ones(1)) + + self.lower_and_test_with_partitioner( + div_module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_mps_backend_fmod(self): + class FModModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, y): + return torch.fmod(x, y) + + module = FModModule() + model_inputs = (torch.randn(2, 3, 4), torch.randn(2, 3, 4)) + + self.lower_and_test_with_partitioner( + module, model_inputs, 
func_name=inspect.stack()[0].function[5:] + ) + + def test_mps_backend_floor_divide(self): + class FloorDivideModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, y): + return torch.floor_divide(x, y) + + module = FloorDivideModule() + model_inputs = (torch.randn(2, 3, 4), torch.randn(2, 3, 4)) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) diff --git a/backends/apple/mps/test/test_mps_indexing_ops.py b/backends/apple/mps/test/test_mps_indexing_ops.py new file mode 100644 index 00000000000..7991f1a165a --- /dev/null +++ b/backends/apple/mps/test/test_mps_indexing_ops.py @@ -0,0 +1,225 @@ +# +# Copyright (c) 2024 Apple Inc. All rights reserved. +# Provided subject to the LICENSE file in the top level directory. +# + +import inspect + +import torch +from executorch.backends.apple.mps.test.test_mps_utils import TestMPS + + +class TestMPSIndexingOps(TestMPS): + def test_mps_indexing_get_1(self): + class IndexGet(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x[[0, 1, 2], [0, 1, 0]] + + module = IndexGet() + model_inputs = (torch.tensor([[1, 2], [3, 4], [5, 6]]),) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_mps_indexing_get_2(self): + class IndexGet(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x[:, [0, 1, 0]] + + module = IndexGet() + model_inputs = (torch.tensor([[1, 2], [3, 4], [5, 6]]),) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_mps_indexing_get_3(self): + class IndexGet(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x[:, [0, 1, 0], [0, 1, 0]] + + module = IndexGet() + model_inputs = (torch.tensor([[[1, 2], [3, 4], [5, 6]]]),) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_mps_indexing_get_4(self): + class IndexGet(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x[:, [0, 1, 0], [0, 1, 0]] + + module = IndexGet() + model_inputs = ( + torch.tensor([[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]]), + ) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_mps_indexing_get_5(self): + class IndexGet(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x[:, [0, 4, 2]] + + module = IndexGet() + model_inputs = (torch.randn(5, 7, 3),) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_mps_indexing_get_6(self): + class IndexGet(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x[:, [[0, 1], [4, 3]]] + + module = IndexGet() + model_inputs = (torch.randn(5, 7, 3),) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_mps_indexing_get_7(self): + class IndexGet(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x[[0, 4, 2]] + + module = IndexGet() + model_inputs = (torch.randn(5, 7, 3),) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def 
test_mps_indexing_get_8(self): + class IndexGet(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x[[0, 2, 1], :, 0] + + module = IndexGet() + model_inputs = (torch.ones(3, 2, 4),) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_mps_indices2d(self): + class IndexGet(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, rows, columns): + return x[rows, columns] + + module = IndexGet() + x = torch.arange(0, 12).resize(4, 3) + rows = torch.tensor([[0, 0], [3, 3]]) + columns = torch.tensor([[0, 2], [0, 2]]) + model_inputs = ( + x, + rows, + columns, + ) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_mps_slicing_using_advanced_index_for_column_0(self): + class IndexGet(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x[1:4] + + module = IndexGet() + model_inputs = (torch.tensor([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]]),) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_mps_slicing_using_advanced_index_for_column_1(self): + class IndexGet(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + # using advanced index for column + return x[1:4, [1, 2]] + + module = IndexGet() + model_inputs = (torch.tensor([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]]),) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + # def test_boolean_array_indexing(self): + # class IndexGet(torch.nn.Module): + # def __init__(self): + # super().__init__() + + # def forward(self, x): + # return x[x > 5] + + # module = IndexGet() + # model_inputs = (torch.tensor([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]]),) + + # self.lower_and_test_with_partitioner( + # module, model_inputs, func_name=inspect.stack()[0].function[5:] + # ) + + def test_mps_indexing_put_1(self): + + class IndexPut(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, y, z): + x[:, :, y] = z + return x + + module = IndexPut() + input = torch.ones(1, 8, 128, 8) + indices = torch.tensor([1]) + values = torch.randn(8, 1, 8) + model_inputs = ( + input, + indices, + values, + ) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) diff --git a/backends/apple/mps/test/test_mps_unary_ops.py b/backends/apple/mps/test/test_mps_unary_ops.py new file mode 100644 index 00000000000..69c1f5ba5c6 --- /dev/null +++ b/backends/apple/mps/test/test_mps_unary_ops.py @@ -0,0 +1,26 @@ +# +# Copyright (c) 2024 Apple Inc. All rights reserved. +# Provided subject to the LICENSE file in the top level directory. 
+# + +import inspect + +import torch +from executorch.backends.apple.mps.test.test_mps_utils import TestMPS + + +class TestMPSLogical(TestMPS): + def test_mps_logical_not(self): + class LogicalNot(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x.logical_not() + + module = LogicalNot() + model_inputs = (torch.tensor([1, 1, 0, 0], dtype=torch.bool),) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) diff --git a/backends/apple/mps/test/test_mps_utils.py b/backends/apple/mps/test/test_mps_utils.py index 0e4a7424cc2..6e569dedb50 100644 --- a/backends/apple/mps/test/test_mps_utils.py +++ b/backends/apple/mps/test/test_mps_utils.py @@ -15,7 +15,6 @@ from executorch.exir import ( EdgeCompileConfig, EdgeProgramManager, - ExecutorchProgram, ExirExportedProgram, to_edge, ) @@ -28,7 +27,6 @@ from executorch.sdk.bundled_program.serialize import ( serialize_from_bundled_program_to_flatbuffer, ) -from torch._export import capture_pre_autograd_graph from torch.export import export, ExportedProgram # Config for Capturing the weights, will be moved in the future @@ -141,7 +139,59 @@ def randomize_bn(num_features: int, dimensionality: int = 2) -> torch.nn.Module: return bn +def dump_bundled_program(sample_inputs, expected_output, executorch_program, func_name): + method_test_suites = [ + MethodTestSuite( + method_name="forward", + test_cases=[ + MethodTestCase(inputs=sample_inputs, expected_outputs=expected_output) + ], + ) + ] + + logging.info(f"Expected output: {expected_output}") + logging.info(" -> Test suites generated successfully") + + bundled_program = BundledProgram(executorch_program, method_test_suites) + bundled_program_buffer = serialize_from_bundled_program_to_flatbuffer( + bundled_program + ) + + filename = f"{func_name}.pte" + logging.info(f"Step 4: Saving bundled program to {filename}") + with open(filename, "wb") as file: + file.write(bundled_program_buffer) + + class TestMPS(unittest.TestCase): + def assert_outputs_equal(self, model_output, ref_output): + """ + Helper testing function that asserts that the model output and the reference output + are equal within some tolerance. Due to numerical differences between eager mode and + the MPS backend, we relax the tolerances such that the absolute tolerance is 1e-3 and + the relative tolerance is 1e-3. + """ + + # Compare the result from the executor and eager mode directly + if isinstance(ref_output, tuple) or isinstance(ref_output, list): + # For multiple outputs, the executor always returns a tuple, even if there is only one output + self.assertTrue( + len(ref_output) == len(model_output), + msg="Length of outputs is not matching!", + ) + for i in range(len(ref_output)): + self.assertTrue( + torch.allclose( + model_output[i], ref_output[i], atol=1e-03, rtol=1e-03 + ) + ) + else: + # For a single output, eager mode returns a tensor while the executor returns a tuple of size 1 + self.assertTrue( + torch.allclose(model_output[0], ref_output, atol=1e-03, rtol=1e-03), + msg="Outputs are not matching!", + ) + def lower_module_and_test_output( self, module: Any, @@ -149,26 +199,24 @@ func_name: str, use_partitioner: bool = True, use_fp16: bool = False, + bundled_program=True, ) -> ExirExportedProgram: """ Helper testing function that takes a torch.nn.Module and lowers it to MPS with the given sample inputs. It then runs the lowered module and compares its outputs with the outputs of the eager module.
""" - logging.info("Step 1: EXIR capturing of original module") - class WrappedModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.one_module = module + model = module.eval() + original_inputs = [] + for t in sample_inputs: + original_inputs.append(t.detach().clone()) + original_inputs = tuple(original_inputs) - def forward(self, *args): - return self.one_module(*args) + expected_output = model(*sample_inputs) - model = WrappedModule() - model = model.eval() - model = capture_pre_autograd_graph(model, sample_inputs) + model = torch._export.capture_pre_autograd_graph(model, sample_inputs) edge_program = export_to_edge( model, @@ -183,10 +231,15 @@ def forward(self, *args): if use_partitioner: logging.info(f"Edge IR graph:\n{edge_program.exported_program().graph}") - edge = edge_program.to_backend(MPSPartitioner(compile_specs=compile_specs)) - logging.info(f"Lowered graph:\n{edge.exported_program().graph}") + delegated_program = edge_program + delegated_program = edge_program.to_backend( + MPSPartitioner(compile_specs=compile_specs) + ) + logging.info( + f"Lowered graph:\n{delegated_program.exported_program().graph}" + ) - executorch_program = edge.to_executorch( + executorch_program = delegated_program.to_executorch( config=ExecutorchBackendConfig(extract_constant_segment=False) ) else: @@ -206,42 +259,35 @@ def forward(self, *args): ) ) - exported_program: ExirExportedProgram = exir.capture( - WrappedModule(), sample_inputs, _CAPTURE_CONFIG - ).to_edge(_EDGE_COMPILE_CONFIG) - - executorch_program: ExecutorchProgram = exported_program.to_executorch() - - logging.info("Step 3: Generating bundled program") - logging.info( - " -> Number of execution plans: {len(executorch_program.program.execution_plan)}" - ) + if bundled_program: + dump_bundled_program( + sample_inputs, expected_output, executorch_program, func_name + ) + try: + from executorch.extension.pybindings.portable_lib import ( # @manual + _load_for_executorch_from_buffer, + ) - expected_output = module(*sample_inputs) + logging.info("Testing delegated program using pybind") - method_test_suites = [ - MethodTestSuite( - method_name="forward", - test_cases=[ - MethodTestCase( - inputs=sample_inputs, expected_outputs=module(*sample_inputs) - ) - ], + # Test the model with executor + logging.debug("Initializing MPSGraph") + executorch_module = _load_for_executorch_from_buffer( + executorch_program.buffer ) - ] - logging.info(f"Expected output: {expected_output}") - logging.info(" -> Test suites generated successfully") + model_output = executorch_module.forward(original_inputs) - bundled_program = BundledProgram(executorch_program, method_test_suites) - bundled_program_buffer = serialize_from_bundled_program_to_flatbuffer( - bundled_program - ) + logging.info(f"Expected output: {expected_output}") + logging.info(f"MPS delegate output: {model_output}") + self.assert_outputs_equal(model_output, expected_output) + logging.info("Delegated program matches PyTorch Eager mode result!") - filename = f"{func_name}.pte" - logging.info(f"Step 4: Saving bundled program to {filename}") - with open(filename, "wb") as file: - file.write(bundled_program_buffer) + return delegated_program + except ImportError: + logging.info( + "ExecuTorch MPS delegate was built without pybind support. Exiting..." 
+ ) def lower_and_test_with_partitioner( self, @@ -251,7 +297,6 @@ def lower_and_test_with_partitioner( use_fp16: bool = False, ): logging.info(func_name) - # MPS TODO: partitioner support self.lower_module_and_test_output( graph_module, example_inputs, diff --git a/backends/arm/arm_quantizer_utils.py b/backends/arm/arm_quantizer_utils.py index 63c98ee42d2..5275b5ecade 100644 --- a/backends/arm/arm_quantizer_utils.py +++ b/backends/arm/arm_quantizer_utils.py @@ -23,7 +23,7 @@ from torch.ao.quantization.pt2e.utils import ( _conv1d_bn_example_inputs, _conv2d_bn_example_inputs, - _get_aten_graph_module_for_pattern, + get_aten_graph_module, ) from torch.ao.quantization.quantizer import ( QuantizationAnnotation, @@ -478,7 +478,7 @@ def _do_annotate_conv_bn( # Match against all conv dimensions and cuda variants for (conv_fn, example_inputs), is_cuda, relu_is_inplace in combinations: pattern = _get_pattern(conv_fn, relu_is_inplace, has_relu) - pattern = _get_aten_graph_module_for_pattern(pattern, example_inputs, is_cuda) + pattern = get_aten_graph_module(pattern, example_inputs, is_cuda) pattern.graph.eliminate_dead_code() pattern.recompile() matcher = SubgraphMatcherWithNameNodeMap(pattern, ignore_literals=True) diff --git a/backends/qualcomm/CMakeLists.txt b/backends/qualcomm/CMakeLists.txt index 8883e5ee026..727952b4fe4 100644 --- a/backends/qualcomm/CMakeLists.txt +++ b/backends/qualcomm/CMakeLists.txt @@ -253,7 +253,7 @@ target_link_libraries(qnn_executorch_backend qnn_executorch_header qnn_schema qnn_manager - executorch + executorch_no_prim_ops qcir_utils ) target_link_libraries(utils diff --git a/backends/qualcomm/builders/node_visitor.py b/backends/qualcomm/builders/node_visitor.py index 3dae32f882e..3f40dc56737 100644 --- a/backends/qualcomm/builders/node_visitor.py +++ b/backends/qualcomm/builders/node_visitor.py @@ -29,6 +29,7 @@ QNN_uint16: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UFIXED_POINT_16, } QNN_TENSOR_TYPE_MAP = { + torch.bool: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_BOOL_8, torch.float32: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_FLOAT_32, torch.int8: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_INT_8, torch.int16: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_INT_16, diff --git a/backends/qualcomm/partition/common_defs.py b/backends/qualcomm/partition/common_defs.py index b06a5766a63..36a2986f09a 100644 --- a/backends/qualcomm/partition/common_defs.py +++ b/backends/qualcomm/partition/common_defs.py @@ -13,6 +13,8 @@ exir_ops.edge.aten.clone.default, exir_ops.edge.aten.index.Tensor, exir_ops.edge.aten.full.default, + exir_ops.edge.aten.slice_scatter.default, + exir_ops.edge.aten.index_put.default, ] allow_list_operator = [ diff --git a/backends/qualcomm/quantizer/utils.py b/backends/qualcomm/quantizer/utils.py index 809b7298eba..58be76aba11 100644 --- a/backends/qualcomm/quantizer/utils.py +++ b/backends/qualcomm/quantizer/utils.py @@ -9,6 +9,7 @@ import torch from torch._ops import OpOverload +from torch._subclasses import FakeTensor from torch.ao.quantization.quantizer import ( QuantizationAnnotation, @@ -42,6 +43,19 @@ def decorator(annotator: Callable): return decorator +def _is_input_float_tensor(node: Node): + """Check if the input is not a float tensor, so that we can skip quantization for the node + since observers only works with float Tensors + """ + if ( + not isinstance(node, Node) + or "val" not in node.meta + or not isinstance(node.meta["val"], FakeTensor) + ): + return False + return node.meta["val"].dtype == torch.float32 + + def _is_annotated(nodes: List[Node]): 
""" Given a list of nodes (that represents an operator pattern), @@ -123,11 +137,11 @@ def annotate_binary(node: Node, quantization_config: QuantizationConfig) -> None input_qspec_map = {} input_act0 = node.args[0] - if isinstance(input_act0, Node): + if _is_input_float_tensor(input_act0): input_qspec_map[input_act0] = input_act_qspec input_act1 = node.args[1] - if isinstance(input_act1, Node): + if _is_input_float_tensor(input_act1): input_qspec_map[input_act1] = input_act_qspec node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation( diff --git a/backends/vulkan/README.md b/backends/vulkan/README.md new file mode 100644 index 00000000000..bc5a674970f --- /dev/null +++ b/backends/vulkan/README.md @@ -0,0 +1,192 @@ +# ExecuTorch Vulkan Delegate + +The ExecuTorch Vulkan delegate is a native GPU delegate for ExecuTorch that is +built on top of the cross-platform Vulkan GPU API standard. It is primarily +designed to leverage the GPU to accelerate model inference on Android devices, +but can be used on any platform that supports an implementation of Vulkan: +laptops, servers, and edge devices. + +::::{note} +The Vulkan delegate is currently under active development, and its components +are subject to change. +:::: + +## What is Vulkan? + +Vulkan is a low-level GPU API specification developed as a successor to OpenGL. +It is designed to offer developers more explicit control over GPUs compared to +previous specifications in order to reduce overhead and maximize the +capabilities of the modern graphics hardware. + +Vulkan has been widely adopted among GPU vendors, and most modern GPUs (both +desktop and mobile) in the market support Vulkan. Vulkan is also included in +Android from Android 7.0 onwards. + +**Note that Vulkan is a GPU API, not a GPU Math Library**. That is to say it +provides a way to execute compute and graphics operations on a GPU, but does not +come with a built-in library of performant compute kernels. + +## The Vulkan Compute Library + +The ExecuTorch Vulkan Delegate is a wrapper around a standalone runtime known as +the **Vulkan Compute Library**. The aim of the Vulkan Compute Library is to +provide GPU implementations for PyTorch operators via GLSL compute shaders. + +The Vulkan Compute Library is a fork/iteration of the [PyTorch Vulkan Backend](https://pytorch.org/tutorials/prototype/vulkan_workflow.html). +The core components of the PyTorch Vulkan backend were forked into ExecuTorch +and adapted for an AOT graph-mode style of model inference (as opposed to +PyTorch which adopted an eager execution style of model inference). + +The components of the Vulkan Compute Library are contained in the +`executorch/backends/vulkan/runtime/` directory. The core components are listed +and described below: + +``` +runtime/ +├── api/ .................... Wrapper API around Vulkan to manage Vulkan objects +└── graph/ .................. ComputeGraph class which implements graph mode inference + └── ops/ ................ Base directory for operator implementations + ├── glsl/ ........... GLSL compute shaders + │ ├── *.glsl + │ └── conv2d.glsl + └── impl/ ........... C++ code to dispatch GPU compute shaders + ├── *.cpp + └── Conv2d.cpp +``` + +## Features + +The Vulkan delegate currently supports the following features: + +* **Memory Planning** + * Intermediate tensors whose lifetimes do not overlap will share memory allocations. This reduces the peak memory usage of model inference. 
+* **Capability Based Partitioning**: + * A graph can be partially lowered to the Vulkan delegate via a partitioner, which will identify nodes (i.e. operators) that are supported by the Vulkan delegate and lower only supported subgraphs. +* **Support for upper-bound dynamic shapes**: + * Tensors can change shape between inferences as long as their current shapes are smaller than the bounds specified during lowering. + +In addition to increasing operator coverage, the following features are +currently in development: + +* **Quantization Support** + * We are currently working on support for 8-bit dynamic quantization, with plans to extend to other quantization schemes in the future. +* **Memory Layout Management** + * Memory layout is an important factor in optimizing performance. We plan to introduce graph passes that insert memory layout transitions throughout a graph to optimize memory-layout sensitive operators such as Convolution and Matrix Multiplication. +* **Selective Build** + * We plan to make it possible to control build size by selecting which operators/shaders you want to build with. + +## End to End Example + +To further understand the features of the Vulkan Delegate and how to use it, +consider the following end to end example with MobileNet V2. + +### Compile and lower a model to the Vulkan Delegate + +Assuming ExecuTorch has been set up and installed, the following script can be +used to produce a lowered MobileNet V2 model as `vulkan_mobilenetv2.pte`. + +``` +import torch +import torchvision.models as models + +from torch.export import export, ExportedProgram +from torchvision.models.mobilenetv2 import MobileNet_V2_Weights +from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner +from executorch.exir import EdgeProgramManager, ExecutorchProgramManager, to_edge +from executorch.exir.backend.backend_api import to_backend + +mobilenet_v2 = models.mobilenetv2.mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT).eval() +sample_inputs = (torch.randn(1, 3, 224, 224), ) + +exported_program: ExportedProgram = export(mobilenet_v2, sample_inputs) +edge: EdgeProgramManager = to_edge(exported_program) + +# Lower the model to Vulkan backend +edge = edge.to_backend(VulkanPartitioner()) + +exec_prog = edge.to_executorch() + +with open("vulkan_mobilenetv2.pte", "wb") as file: + exec_prog.write_to_file(file) +``` + +Like other ExecuTorch delegates, a model can be lowered to the Vulkan Delegate +using the `to_backend()` API. The Vulkan Delegate implements the +`VulkanPartitioner` class which identifies nodes (i.e. operators) in the graph +that are supported by the Vulkan delegate, and separates compatible sections of +the model to be executed on the GPU. + +This means that a model can be lowered to the Vulkan delegate even if it contains +some unsupported operators. This will just mean that only parts of the graph +will be executed on the GPU; a short sketch for checking how much of the graph +was delegated is included below. + + +::::{note} +The [Vulkan partitioner code](https://github.com/pytorch/executorch/blob/main/backends/vulkan/partitioner/vulkan_partitioner.py) +can be inspected to examine which ops are currently implemented in the Vulkan +delegate. +:::: + +### Build Vulkan Delegate libraries + +The easiest way to build and test the Vulkan Delegate is to build for Android +and test on a local Android device. Android devices have built-in support for +Vulkan, and the Android NDK ships with a GLSL compiler, which is needed to +compile the Vulkan Compute Library's GLSL compute shaders.
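+Before moving on to the device build, it can be useful to double-check how much
+of the exported graph was actually delegated. The short sketch below does this
+with the `get_delegation_info()` utility described in the Debug Backend Delegate
+docs added in this change; treat it as a sketch rather than a canonical recipe,
+since module paths may shift while the delegate is under active development.
+
+```
+import torch
+import torchvision.models as models
+
+from torch.export import export
+from torchvision.models.mobilenetv2 import MobileNet_V2_Weights
+from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner
+from executorch.exir import to_edge
+from executorch.exir.backend.utils import get_delegation_info
+
+mobilenet_v2 = models.mobilenetv2.mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT).eval()
+sample_inputs = (torch.randn(1, 3, 224, 224), )
+
+# Export and lower to the Vulkan backend, as in the example above.
+edge = to_edge(export(mobilenet_v2, sample_inputs)).to_backend(VulkanPartitioner())
+
+# Summarize how many nodes ended up inside Vulkan-delegated subgraphs versus
+# how many will fall back to Portable operators.
+delegation_info = get_delegation_info(edge.exported_program().graph_module)
+print(delegation_info.get_summary())
+```
+
+A large non-delegated node count usually just means the model uses operators the
+partitioner does not support yet; those sections will run on the CPU via Portable
+operators.
+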
+ +The Vulkan Delegate libraries can be built by setting `-DEXECUTORCH_BUILD_VULKAN=ON` +when building with CMake. + +First, make sure that you have the Android NDK installed - Android NDK r25c is +recommended. The Android SDK should also be installed so that you have access +to `adb`. + +```shell +# Recommended version is Android NDK r25c. +export ANDROID_NDK= +# Select an appropriate Android ABI +export ANDROID_ABI=arm64-v8a +# All subsequent commands should be performed from ExecuTorch repo root +cd +# Make sure adb works +adb --version +``` + +To build and install ExecuTorch libraries (for Android) with the Vulkan +Delegate: + +```shell +# From executorch root directory +(rm -rf cmake-android-out && \ + cmake . -DCMAKE_INSTALL_PREFIX=cmake-android-out \ + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ + -DANDROID_ABI=$ANDROID_ABI \ + -DEXECUTORCH_BUILD_VULKAN=ON \ + -DPYTHON_EXECUTABLE=python \ + -Bcmake-android-out && \ + cmake --build cmake-android-out -j16 --target install) +``` + +### Run the Vulkan model on device + +::::{note} +Since operator support is currently limited, only binary arithmetic operators +will run on the GPU. Expect inference to be slow as the majority of operators +are being executed via Portable operators. +:::: + +Now the partially delegated model can be executed on your device's +GPU! + +```shell +# Build a model runner binary linked with the Vulkan delegate libs +cmake --build cmake-android-out --target vulkan_executor_runner -j32 + +# Push model to device +adb push vulkan_mobilenetv2.pte /data/local/tmp/vulkan_mobilenetv2.pte +# Push binary to device +adb push cmake-android-out/backends/vulkan/vulkan_executor_runner /data/local/tmp/runner_bin + +# Run the model +adb shell /data/local/tmp/runner_bin --model_path /data/local/tmp/vulkan_mobilenetv2.pte +``` diff --git a/backends/vulkan/docs/android_demo.md b/backends/vulkan/docs/android_demo.md new file mode 100644 index 00000000000..f9fc35657a6 --- /dev/null +++ b/backends/vulkan/docs/android_demo.md @@ -0,0 +1,148 @@ +# Building and Running ExecuTorch with the Vulkan Backend + +The [ExecuTorch Vulkan Delegate](./native-delegates-executorch-vulkan-delegate.md) +is a native GPU delegate for ExecuTorch. + + +::::{grid} 2 +:::{grid-item-card} What you will learn in this tutorial: +:class-card: card-content +* How to export the Stories 110M parameter model with partial GPU delegation +* How to execute the partially delegated model on Android +::: +:::{grid-item-card} Prerequisites: +:class-card: card-prerequisites +* Follow [**Setting up ExecuTorch**](./getting-started-setup.md) +* Follow [**Setting up the ExecuTorch LLaMA Android Demo App**](./llm/llama-demo-android.md) +::: +:::: + +## Prerequisites + +Note that all the steps below should be performed from the ExecuTorch repository +root directory, and assume that you have gone through the steps of setting up +ExecuTorch. + +You should also refer to the **Prerequisites** section of the [**Setting up the ExecuTorch LLaMA Android Demo App**](./llm/llama-demo-android.md) +Tutorial in order to install the specified versions of the Android NDK and the +Android SDK. + +```shell +# Recommended version is Android NDK r25c.
+export ANDROID_NDK= +# Select an appropriate Android ABI +export ANDROID_ABI=arm64-v8a +# All subsequent commands should be performed from ExecuTorch repo root +cd +# Make sure adb works +adb --version +``` + +## Lowering the Stories 110M model to Vulkan + +::::{note} +The resultant model will only be partially delegated to the Vulkan backend. In +particular, only binary arithmetic operators (`aten.add`, `aten.sub`, +`aten.mul`, `aten.div`) and the matrix multiplication operator (`aten.mm`) will +be executed on the GPU via the Vulkan delegate. The rest of the model will be +executed using Portable operators. This is because the Vulkan delegate is still +early in development and currently has limited operator coverage. +:::: + +First, download `stories110M.pt` and `tokenizer.model` from Github: + +```shell +wget "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.pt" +wget "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokenizer.model" +``` + +Next, create the params file: + +```shell +echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json +``` + +Then, create a tokenizer binary file: + +```shell +python -m examples.models.llama2.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin +``` + +Finally, export the `stories110M.pt` file into an ExecuTorch program: + +```shell +python -m examples.models.llama2.export_llama -c stories110M.pt -p params.json --vulkan +``` + +A `vulkan_llama2.pte` file should have been created as a result of the last step. + +Push the tokenizer binary and `vulkan_llama2.pte` onto your Android device: + +```shell +adb shell mkdir -p /data/local/tmp/llama/ +adb push tokenizer.bin /data/local/tmp/llama/ +adb push vulkan_llama2.pte /data/local/tmp/llama/ +``` + +## Build and Run the LLaMA runner binary on Android + +First, build and install ExecuTorch libraries, then build the LLaMA runner +binary using the Android NDK toolchain. + +```shell +(rm -rf cmake-android-out && \ + cmake . -DCMAKE_INSTALL_PREFIX=cmake-android-out \ + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ + -DANDROID_ABI=$ANDROID_ABI \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_VULKAN=ON \ + -DEXECUTORCH_BUILD_OPTIMIZED=ON \ + -DPYTHON_EXECUTABLE=python \ + -Bcmake-android-out && \ + cmake --build cmake-android-out -j16 --target install) + +# Build LLaMA Runner library +(rm -rf cmake-android-out/examples/models/llama2 && \ + cmake examples/models/llama2 \ + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ + -DANDROID_ABI=$ANDROID_ABI \ + -DCMAKE_INSTALL_PREFIX=cmake-android-out \ + -DPYTHON_EXECUTABLE=python \ + -Bcmake-android-out/examples/models/llama2 && \ + cmake --build cmake-android-out/examples/models/llama2 -j16) +``` + +Finally, push and run the LLaMA runner binary on your Android device. + +```shell +adb push cmake-android-out/examples/models/llama2/llama_main /data/local/tmp/llama_main + +adb shell /data/local/tmp/llama_main \ + --model_path=/data/local/tmp/llama/vulkan_llama2.pte \ + --tokenizer_path=/data/local/tmp/llama/tokenizer.bin \ + --prompt "hi" --temperature=0 +``` + +The following output will be produced: + +``` +hippo named Hippy lived in a big pond. Hippy was a very happy hippo. He liked to play... +``` + +## Running with the LLaMA Android Demo App + +It is also possible to run the partially delegated Vulkan model inside the LLaMA +Android demo app.
+ +First, make some modifications to the Android app setup script to make sure that +the Vulkan backend is built when building and installing ExecuTorch libraries: + +```shell +# Run from executorch root directory. You can also edit this in a code editor +sed -i 's/-DEXECUTORCH_BUILD_XNNPACK=ON/-DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_VULKAN=ON/g' examples/demo-apps/android/LlamaDemo/setup.sh +``` + +Then, Follow the instructions at [**Setting up the ExecuTorch LLaMA Android Demo App**](./llm/llama-demo-android.md) +to build and run the demo application on your Android device. Once the app +starts up, you can load and run the `vulkan_llama2.pte` model with the app. diff --git a/build/Utils.cmake b/build/Utils.cmake index 39fa7317da8..66b740ad1eb 100644 --- a/build/Utils.cmake +++ b/build/Utils.cmake @@ -213,6 +213,12 @@ function(resolve_buck2) PARENT_SCOPE) endif() endif() + + # The buck2 daemon can get stuck. Killing it can help. + message(STATUS "Killing buck2 daemon") + execute_process( + COMMAND "${BUCK2} kill" + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) endfunction() # Sets the value of the PYTHON_EXECUTABLE variable to 'python' if in an active diff --git a/build/build_apple_frameworks.sh b/build/build_apple_frameworks.sh index 78425485526..0b6adae0a7f 100755 --- a/build/build_apple_frameworks.sh +++ b/build/build_apple_frameworks.sh @@ -25,9 +25,9 @@ PORTABLE=OFF QUANTIZED=OFF XNNPACK=OFF HEADERS_PATH="include" -EXECUTORCH_FRAMEWORK="executorch:libexecutorch.a,libextension_apple.a,libextension_data_loader.a,libextension_module.a:$HEADERS_PATH" +EXECUTORCH_FRAMEWORK="executorch:libexecutorch.a,libexecutorch_no_prim_ops.a,libextension_apple.a,libextension_data_loader.a,libextension_module.a:$HEADERS_PATH" COREML_FRAMEWORK="coreml_backend:libcoremldelegate.a:" -CUSTOM_FRAMEWORK="custom_backend:libcustom_ops.a,libcustom_ops_lib.a:" +CUSTOM_FRAMEWORK="custom_backend:libcustom_ops.a:" MPS_FRAMEWORK="mps_backend:libmpsdelegate.a:" OPTIMIZED_FRAMEWORK="optimized_backend:liboptimized_kernels.a,liboptimized_ops_lib.a:" PORTABLE_FRAMEWORK="portable_backend:libportable_kernels.a,libportable_ops_lib.a:" diff --git a/build/cmake_deps.toml b/build/cmake_deps.toml index 4346881224b..91174c08f75 100644 --- a/build/cmake_deps.toml +++ b/build/cmake_deps.toml @@ -19,6 +19,18 @@ excludes = [ buck_targets = [ "//runtime/executor:program", ] +deps = [ + "executorch_no_prim_ops", +] +filters = [ + ".cpp$", +] + + +[targets.executorch_no_prim_ops] +buck_targets = [ + "//runtime/executor:program_no_prim_ops", +] deps = [ "program_schema", ] @@ -43,6 +55,7 @@ excludes = [ ] deps = [ "executorch", + "executorch_no_prim_ops", ] [targets.optimized_kernels] @@ -59,6 +72,7 @@ excludes = [ ] deps = [ "executorch", + "executorch_no_prim_ops", "portable_kernels", ] @@ -76,6 +90,7 @@ excludes = [ ] deps = [ "executorch", + "executorch_no_prim_ops", "portable_kernels", ] @@ -97,6 +112,7 @@ filters = [ excludes = [ ] deps = [ + "executorch_no_prim_ops", "executorch", ] @@ -113,6 +129,7 @@ filters = [ ".cpp$", ] deps = [ + "executorch_no_prim_ops", "executorch", ] @@ -125,6 +142,7 @@ filters = [ ] deps = [ "executorch", + "executorch_no_prim_ops", "extension_data_loader", ] @@ -137,6 +155,7 @@ filters = [ ] deps = [ "executorch", + "executorch_no_prim_ops", ] # ---------------------------------- extension end ---------------------------------- @@ -154,6 +173,7 @@ excludes = [ ] deps = [ "executorch", + "executorch_no_prim_ops", "portable_kernels", "quantized_kernels", ] @@ -169,6 +189,7 @@ excludes = [ 
"^codegen", ] deps = [ + "executorch_no_prim_ops", "executorch", ] # ---------------------------------- binary end ---------------------------------- @@ -185,6 +206,7 @@ excludes = [ ] deps = [ "executorch", + "executorch_no_prim_ops", "portable_kernels", ] @@ -197,6 +219,7 @@ filters = [ ] deps = [ "executorch", + "executorch_no_prim_ops", ] [targets.mps_schema] @@ -222,6 +245,7 @@ excludes = [ ] deps = [ "executorch", + "executorch_no_prim_ops", "xnnpack_backend", "portable_kernels", ] @@ -235,6 +259,7 @@ filters = [ ] deps = [ "executorch", + "executorch_no_prim_ops", ] [targets.xnnpack_dynamic_quant_utils] @@ -275,6 +300,7 @@ excludes = [ ] deps = [ "executorch", + "executorch_no_prim_ops", "optimized_kernels", "xnnpack_backend", ] @@ -292,6 +318,7 @@ excludes = [ deps = [ "custom_ops", "executorch", + "executorch_no_prim_ops", "extension_data_loader", "extension_module", "portable_kernels", diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake index 14ec7bf1f45..acf8b6779d5 100644 --- a/build/executorch-config.cmake +++ b/build/executorch-config.cmake @@ -13,27 +13,20 @@ cmake_minimum_required(VERSION 3.19) set(_root "${CMAKE_CURRENT_LIST_DIR}/../..") -add_library(executorch STATIC IMPORTED) -find_library( - EXECUTORCH_LIBRARY_PATH executorch - HINTS "${_root}" - CMAKE_FIND_ROOT_PATH_BOTH -) -set_target_properties( - executorch PROPERTIES IMPORTED_LOCATION "${EXECUTORCH_LIBRARY_PATH}" -) -target_include_directories(executorch INTERFACE ${_root}) +set(required_lib_list executorch executorch_no_prim_ops portable_kernels) +foreach(lib ${required_lib_list}) + set(lib_var "LIB_${lib}") + add_library(${lib} STATIC IMPORTED) + find_library( + ${lib_var} ${lib} HINTS "${_root}" CMAKE_FIND_ROOT_PATH_BOTH + ) + set_target_properties( + ${lib} PROPERTIES IMPORTED_LOCATION "${${lib_var}}" + ) + target_include_directories(${lib} INTERFACE ${_root}) +endforeach() -add_library(portable_kernels STATIC IMPORTED) -find_library( - PORTABLE_KERNELS_PATH portable_kernels - HINTS "${_root}" - CMAKE_FIND_ROOT_PATH_BOTH -) -set_target_properties( - portable_kernels PROPERTIES IMPORTED_LOCATION "${PORTABLE_KERNELS_PATH}" -) -target_include_directories(portable_kernels INTERFACE ${_root}) +target_link_libraries(executorch INTERFACE executorch_no_prim_ops) if(CMAKE_BUILD_TYPE MATCHES "Debug") set(FLATCCRT_LIB flatccrt_d) diff --git a/build/packaging/env_var_script_linux.sh b/build/packaging/env_var_script_linux.sh new file mode 100644 index 00000000000..6379dee6b5a --- /dev/null +++ b/build/packaging/env_var_script_linux.sh @@ -0,0 +1,20 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# This file is sourced into the environment before building a pip wheel. It +# should typically only contain shell variable assignments. Be sure to export +# any variables so that subprocesses will see them. + +# Enable pybindings so that users can execute ExecuTorch programs from python. +export EXECUTORCH_BUILD_PYBIND=1 + +# Ensure that CMAKE_ARGS is defined before referencing it. Defaults to empty +# if not defined. +export CMAKE_ARGS="${CMAKE_ARGS:-}" + +# Link the XNNPACK backend into the pybindings runtime so that users can execute +# ExecuTorch programs that delegate to it. 
+CMAKE_ARGS="${CMAKE_ARGS} -DEXECUTORCH_BUILD_XNNPACK=ON" diff --git a/build/packaging/env_var_script_m1.sh b/build/packaging/env_var_script_m1.sh new file mode 100644 index 00000000000..48db0a2b431 --- /dev/null +++ b/build/packaging/env_var_script_m1.sh @@ -0,0 +1,27 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# This file is sourced into the environment before building a pip wheel. It +# should typically only contain shell variable assignments. Be sure to export +# any variables so that subprocesses will see them. + +# Enable pybindings so that users can execute ExecuTorch programs from python. +export EXECUTORCH_BUILD_PYBIND=1 + +# Ensure that CMAKE_ARGS is defined before referencing it. Defaults to empty +# if not defined. +export CMAKE_ARGS="${CMAKE_ARGS:-}" + +# Link the XNNPACK backend into the pybindings runtime so that users can execute +# ExecuTorch programs that delegate to it. +CMAKE_ARGS="${CMAKE_ARGS} -DEXECUTORCH_BUILD_XNNPACK=ON" + +# When building for macOS, link additional backends into the pybindings runtime. + +# TODO(dbort): Make these build properly in the CI environment. +# build machine uses an older version. +# CMAKE_ARGS="${CMAKE_ARGS} -DEXECUTORCH_BUILD_COREML=ON" +# CMAKE_ARGS="${CMAKE_ARGS} -DEXECUTORCH_BUILD_MPS=ON" diff --git a/build/packaging/pre_build_script.sh b/build/packaging/pre_build_script.sh index 3940168c403..eeb4b95b007 100644 --- a/build/packaging/pre_build_script.sh +++ b/build/packaging/pre_build_script.sh @@ -5,6 +5,21 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -set -eux +set -euxo pipefail -echo "This script is run before building ExecuTorch binaries" +# This script is run before building ExecuTorch binaries + +# Manually install build requirements because `python setup.py bdist_wheel` does +# not install them. TODO(dbort): Switch to using `python -m build --wheel`, +# which does install them. Though we'd need to disable build isolation to be +# able to see the installed torch package. +readonly BUILD_DEPS=( + # This list must match the build-system.requires list from pyproject.toml. + "cmake" + "pyyaml" + "setuptools" + "tomli" + "wheel" + "zstd" +) +pip install --progress-bar off "${BUILD_DEPS[@]}" diff --git a/build/packaging/smoke_test.py b/build/packaging/smoke_test.py index 5273a457f13..be53ae5a378 100644 --- a/build/packaging/smoke_test.py +++ b/build/packaging/smoke_test.py @@ -5,13 +5,99 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +""" +This script is run by CI after building the executorch wheel. Before running +this, the job will install the matching torch package as well as the newly-built +executorch package and its dependencies. +""" + +# Import this first. If it can't find the torch.so libraries, the dynamic load +# will fail and the process will exit. +from executorch.extension.pybindings import portable_lib # usort: skip + +# Import this after importing the ExecuTorch pybindings. If the pybindings +# links against a different torch.so than this uses, there will be a set of +# symbol comflicts; the process will either exit now, or there will be issues +# later in the smoke test. 
+import torch # usort: skip + +# Import everything else later to help isolate the critical imports above. +import os +import tempfile +from typing import Tuple + +from executorch.exir import to_edge +from torch.export import export + + +class LinearModel(torch.nn.Module): + """Runs Linear on its input, which should have shape [4].""" + + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(4, 2) + + def forward(self, x: torch.Tensor): + """Expects a single tensor of shape [4].""" + return self.linear(x) + + +def linear_model_inputs() -> Tuple[torch.Tensor]: + """Returns some example inputs compatible with LinearModel.""" + # The model takes a single tensor of shape [4] as an input. + return (torch.ones(4),) + + +def export_linear_model() -> bytes: + """Exports LinearModel and returns the .pte data.""" + + # This helps the exporter understand the shapes of tensors used in the model. + # Since our model only takes one input, this is a one-tuple. + example_inputs = linear_model_inputs() + + # Export the pytorch model and process for ExecuTorch. + print("Exporting program...") + exported_program = export(LinearModel(), example_inputs) + print("Lowering to edge...") + edge_program = to_edge(exported_program) + print("Creating ExecuTorch program...") + et_program = edge_program.to_executorch() + + return et_program.buffer + def main(): - """ - Run ExecuTorch binary smoke tests. This is a placeholder for future tests. See - https://github.com/pytorch/test-infra/wiki/Using-Nova-Reusable-Build-Workflows - for more information about Nova binary workflow. - """ + """Tests the export and execution of a simple model.""" + + # If the pybindings loaded correctly, we should be able to ask for the set + # of operators. + ops = portable_lib._get_operator_names() + assert len(ops) > 0, "Empty operator list" + print(f"Found {len(ops)} operators; first element '{ops[0]}'") + + # Export LinearModel to .pte data. + pte_data: bytes = export_linear_model() + + # Try saving to and loading from a file. + with tempfile.TemporaryDirectory() as tempdir: + pte_file = os.path.join(tempdir, "linear.pte") + + # Save the .pte data to a file. + with open(pte_file, "wb") as file: + file.write(pte_data) + print(f"ExecuTorch program saved to {pte_file} ({len(pte_data)} bytes).") + + # Load the model from disk. + m = portable_lib._load_for_executorch(pte_file) + + # Run the model. + outputs = m.forward(linear_model_inputs()) + + # Should see a single output with shape [2]. 
+ assert len(outputs) == 1, f"Unexpected output length {len(outputs)}: {outputs}" + assert outputs[0].shape == (2,), f"Unexpected output size {outputs[0].shape}" + + print("PASS") if __name__ == "__main__": diff --git a/build/resolve_buck.py b/build/resolve_buck.py index cba151ab340..463e6bf6c37 100644 --- a/build/resolve_buck.py +++ b/build/resolve_buck.py @@ -76,6 +76,10 @@ class BuckInfo: archive_name="buck2-aarch64-apple-darwin.zst", target_versions=["99e407b49dc432eda0cbddd67ea78346"], ), + ("darwin", "x86_64"): BuckInfo( + archive_name="buck2-x86_64-apple-darwin.zst", + target_versions=["9150d78e7a7531799a1b06ce58623bbc"], + ), } diff --git a/build/test_android_ci.sh b/build/test_android_ci.sh index acc853727fa..8d9391146dc 100755 --- a/build/test_android_ci.sh +++ b/build/test_android_ci.sh @@ -8,7 +8,7 @@ set -ex # https://github.com/pytorch/executorch/tree/main/examples/demo-apps/android/ExecuTorchDemo -build_executorch() { +export_model() { MODEL_NAME=dl3 # Delegating DeepLab v3 to XNNPACK backend python -m examples.xnnpack.aot_compiler --model_name="${MODEL_NAME}" --delegate @@ -16,10 +16,12 @@ build_executorch() { ASSETS_DIR=examples/demo-apps/android/ExecuTorchDemo/app/src/main/assets/ mkdir -p "${ASSETS_DIR}" cp "${MODEL_NAME}_xnnpack_fp32.pte" "${ASSETS_DIR}" +} - rm -rf cmake-out && mkdir cmake-out - ANDROID_NDK=/opt/ndk BUCK2=$(which buck2) FLATC=$(which flatc) ANDROID_ABI=arm64-v8a \ - bash examples/demo-apps/android/ExecuTorchDemo/setup.sh +build_android_native_library() { + pushd examples/demo-apps/android/LlamaDemo + CMAKE_OUT="cmake-out-android-$1" ANDROID_NDK=/opt/ndk ANDROID_ABI="$1" ./gradlew setup + popd } build_android_demo_app() { @@ -30,11 +32,13 @@ build_android_demo_app() { build_android_llama_demo_app() { pushd examples/demo-apps/android/LlamaDemo - ANDROID_NDK=/opt/ndk ANDROID_ABI=arm64-v8a ./gradlew setup ANDROID_HOME=/opt/android/sdk ./gradlew build + ANDROID_HOME=/opt/android/sdk ./gradlew assembleAndroidTest popd } -build_executorch +build_android_native_library arm64-v8a +build_android_native_library x86_64 +export_model build_android_demo_app build_android_llama_demo_app diff --git a/docs/source/_static/img/llama_ios_app.mp4 b/docs/source/_static/img/llama_ios_app.mp4 new file mode 100644 index 00000000000..fead47644d6 Binary files /dev/null and b/docs/source/_static/img/llama_ios_app.mp4 differ diff --git a/docs/source/_static/img/llama_ios_app.png b/docs/source/_static/img/llama_ios_app.png new file mode 100644 index 00000000000..4f9020efb87 Binary files /dev/null and b/docs/source/_static/img/llama_ios_app.png differ diff --git a/docs/source/_static/img/llm_manual_print_data_tabular.png b/docs/source/_static/img/llm_manual_print_data_tabular.png new file mode 100644 index 00000000000..6052a404246 Binary files /dev/null and b/docs/source/_static/img/llm_manual_print_data_tabular.png differ diff --git a/docs/source/_static/img/print_data_tabular.png b/docs/source/_static/img/print_data_tabular.png index 593ea4088ca..7e20b129bb4 100644 Binary files a/docs/source/_static/img/print_data_tabular.png and b/docs/source/_static/img/print_data_tabular.png differ diff --git a/docs/source/build-run-coreml.md b/docs/source/build-run-coreml.md index c442b2cc6b8..da830e542c8 100644 --- a/docs/source/build-run-coreml.md +++ b/docs/source/build-run-coreml.md @@ -1,6 +1,6 @@ # Building and Running ExecuTorch with Core ML Backend -Core ML delegate uses Core ML apis to enable running neural networks via Apple's hardware acceleration. 
For more about coreml you can read [here](https://developer.apple.com/documentation/coreml). In this tutorial we will walk through steps of lowering a PyTorch model to Core ML delegate +Core ML delegate uses Core ML APIs to enable running neural networks via Apple's hardware acceleration. For more about Core ML you can read [here](https://developer.apple.com/documentation/coreml). In this tutorial, we will walk through the steps of lowering a PyTorch model to the Core ML delegate. ::::{grid} 2 @@ -24,8 +24,8 @@ Core ML delegate uses Core ML apis to enable running neural networks via Apple's In order to be able to successfully build and run the ExecuTorch's Core ML backend you'll need the following hardware and software components. ### Hardware: -- A [mac](https://www.apple.com/mac/]) system for building. -- A [mac](https://www.apple.com/mac/]) or [iPhone](https://www.apple.com/iphone/) or [iPad](https://www.apple.com/ipad/) or [Apple TV](https://www.apple.com/tv-home/) device for running the model. +- A [mac](https://www.apple.com/mac/) system for building. +- A [mac](https://www.apple.com/mac/) or [iPhone](https://www.apple.com/iphone/) or [iPad](https://www.apple.com/ipad/) or [Apple TV](https://www.apple.com/tv-home/) device for running the model. ### Software: @@ -67,22 +67,53 @@ python3 -m examples.apple.coreml.scripts.export --model_name mv3 ### Runtime: -**Running the Core ML delegated Program**: +**Running a Core ML delegated Program**: 1. Build the runner. ```bash cd executorch -# Generates ./coreml_executor_runner. +# Builds `coreml_executor_runner`. ./examples/apple/coreml/scripts/build_executor_runner.sh ``` -2. Run the exported program. +2. Run the Core ML delegated program. ```bash cd executorch -# Runs the exported mv3 model on the Core ML backend. +# Runs the exported mv3 model using the Core ML backend. ./coreml_executor_runner --model_path mv3_coreml_all.pte ``` +**Profiling a Core ML delegated Program**: + +Note that profiling is supported on [macOS](https://developer.apple.com/macos) >= 14.4. + +1. [Optional] Generate an [ETRecord](./sdk-etrecord.rst) when exporting your model. +```bash +cd executorch + +# Generates `mv3_coreml_all.pte` and `mv3_coreml_etrecord.bin` files. +python3 -m examples.apple.coreml.scripts.export --model_name mv3 --generate_etrecord +``` + +2. Build the runner. +```bash +# Builds `coreml_executor_runner`. +./examples/apple/coreml/scripts/build_executor_runner.sh +``` +3. Run and generate an [ETDump](./sdk-etdump.md). +```bash +cd executorch + +# Generate the ETDump file. +./coreml_executor_runner --model_path mv3_coreml_all.pte --profile_model --etdump_path etdump.etdp +``` + +4. Create an instance of the [Inspector API](./sdk-inspector.rst) by passing in the [ETDump](./sdk-etdump.md) generated by the runtime, along with the [ETRecord](./sdk-etrecord.rst) optionally generated in step 1. Alternatively, execute the following command in your terminal to display the profiling data table. +```bash +python examples/apple/coreml/scripts/inspector_cli.py --etdump_path etdump.etdp --etrecord_path mv3_coreml.bin +``` + + ## Deploying and running on a device **Running the Core ML delegated Program in the Demo iOS App**: @@ -92,27 +123,27 @@ cd executorch 3. Complete the [Final Steps](demo-apps-ios.md#final-steps) section of the tutorial to build and run the demo app. -
**Running the Core ML delegated Program in your own App** -1. Build **Core ML** delegate. The following will create a `executorch.xcframework` in the `cmake-out` directory. +
**Running the Core ML delegated Program in your App** +1. Build frameworks, running the following will create a `executorch.xcframework` and `coreml_backend.xcframework` in the `cmake-out` directory. ```bash cd executorch ./build/build_apple_frameworks.sh --Release --coreml ``` 2. Create a new [Xcode project](https://developer.apple.com/documentation/xcode/creating-an-xcode-project-for-an-app#) or open an existing project. -3. Drag the `executorch.xcframework` generated from Step 2 to Frameworks. +3. Drag the `executorch.xcframework` and `coreml_backend.xcframework` generated from Step 2 to Frameworks. 4. Go to the project's [Build Phases](https://developer.apple.com/documentation/xcode/customizing-the-build-phases-of-a-target) - Link Binaries With Libraries, click the + sign, and add the following frameworks: ``` -- executorch.xcframework -- coreml_backend.xcframework -- Accelerate.framework -- CoreML.framework -- libsqlite3.tbd +executorch.xcframework +coreml_backend.xcframework +Accelerate.framework +CoreML.framework +libsqlite3.tbd ``` 5. Add the exported program to the [Copy Bundle Phase](https://developer.apple.com/documentation/xcode/customizing-the-build-phases-of-a-target#Copy-files-to-the-finished-product) of your Xcode target. -6. Please follow the [running a model](running-a-model-cpp-tutorial.md) tutorial to integrate the code for loading a ExecuTorch program. +6. Please follow the [running a model](./running-a-model-cpp-tutorial.md) tutorial to integrate the code for loading an ExecuTorch program. 7. Update the code to load the program from the Application's bundle. ``` objective-c @@ -120,9 +151,7 @@ using namespace torch::executor; NSURL *model_url = [NBundle.mainBundle URLForResource:@"mv3_coreml_all" extension:@"pte"]; -Result loader = - util::FileDataLoader::from(model_url.path.UTF8String); - +Result loader = util::FileDataLoader::from(model_url.path.UTF8String); ``` 8. Use [Xcode](https://developer.apple.com/documentation/xcode/building-and-running-an-app#Build-run-and-debug-your-app) to deploy the application on the device. diff --git a/docs/source/build-run-qualcomm-ai-engine-direct-backend.md b/docs/source/build-run-qualcomm-ai-engine-direct-backend.md index 30904b29ddb..1a94577e90c 100644 --- a/docs/source/build-run-qualcomm-ai-engine-direct-backend.md +++ b/docs/source/build-run-qualcomm-ai-engine-direct-backend.md @@ -115,6 +115,10 @@ Python APIs on x64 are required to compile models to Qualcomm AI Engine Direct b ```bash cd $EXECUTORCH_ROOT +# Workaround for fbs files in exir/_serialize +cp schema/program.fbs exir/_serialize/program.fbs +cp schema/scalar_type.fbs exir/_serialize/scalar_type.fbs + mkdir build_x86_64 cd build_x86_64 cmake .. -DEXECUTORCH_BUILD_QNN=ON -DQNN_SDK_ROOT=${QNN_SDK_ROOT} @@ -138,8 +142,8 @@ mkdir build_android cd build_android # build executorch & qnn_executorch_backend cmake .. 
\ - -DBUCK2=buck2 \ -DCMAKE_INSTALL_PREFIX=$PWD \ + -DEXECUTORCH_BUILD_SDK=ON \ -DEXECUTORCH_BUILD_QNN=ON \ -DQNN_SDK_ROOT=$QNN_SDK_ROOT \ -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ @@ -220,6 +224,7 @@ So, we can run `qnn_executor_runner` like ```bash adb push ./deeplab_v3/dlv3_qnn.pte ${DEVICE_DIR} adb push ${EXECUTORCH_ROOT}/build_android/examples/qualcomm/qnn_executor_runner ${DEVICE_DIR} +adb push ${EXECUTORCH_ROOT}/build_android/lib/libqnn_executorch_backend.so ${DEVICE_DIR} adb shell "cd ${DEVICE_DIR} \ && export LD_LIBRARY_PATH=${DEVICE_DIR} \ && export ADSP_LIBRARY_PATH=${DEVICE_DIR} \ diff --git a/docs/source/build-run-vulkan.md b/docs/source/build-run-vulkan.md new file mode 100644 index 00000000000..736859b86f6 --- /dev/null +++ b/docs/source/build-run-vulkan.md @@ -0,0 +1 @@ +```{include} ../../backends/vulkan/docs/android_demo.md diff --git a/docs/source/compiler-memory-planning.md b/docs/source/compiler-memory-planning.md index 86c0c136300..1dad3b032fc 100644 --- a/docs/source/compiler-memory-planning.md +++ b/docs/source/compiler-memory-planning.md @@ -9,7 +9,7 @@ MemoryPlanning is the very last action taken before taking an `ExportedProgram` Concretely, there are three passes related to memory planning: * `SpecPropPass` computes a TensorSpec for each tensor in the graph (inputs, intermediates or outputs). The most important field of the tensor spec is a symbolic expression of the shapes of the tensor, where the initial set of symbols comes from the dimensions of input tensors, intermediate tensor shapes’ symbolic expression is propagated via tensor operations. The dimensions can be marked as either dynamic or static by users and when the dims are dynamic, users are required to annotate the dim with a ValueRange. -* `SymShapEvalPass` evaluates the symbolic expressions to concrete integers with their upper bounds. There are two ways to doing the upper bound specialization: +* `SymShapeEvalPass` evaluates the symbolic expressions to concrete integers with their upper bounds. There are two ways to doing the upper bound specialization: HintBasedSymShapeEval (to be deprecated) is the old way of evaluating the upper bound. It doesn’t look at the ValueRange of the symbols but uses the shapes of example inputs to replace all the symbols. We call it “hint based“ because the example inputs’ shapes are just hints of what the input shapes might be at run time and are used for tracing only. ValueRangeBasedSymShapeEval is the recommended way of doing UpperBoundMemory planning. It will actually look at the ValueRange of the symbols and do an inference over the ranges to get a real upper bound. * `MemoryPlanningPass` does the actual memory planning given all tensors get a TensorSpec with concrete integer shapes. @@ -18,9 +18,9 @@ HintBasedSymShapeEval (to be deprecated) is the old way of evaluating the upper ExecuTorch provides two options for memory planning algorithms out of the box, but users can define their own if the provided options are inappropriate or insufficient for their use case. -* The naive algorithm simply concatenates all the tensors together in a linear memory without considering any memory re-use. It serves as an upper bound for total memory consumption and serves as a baseline. +* The naive algorithm simply concatenates all the tensors together in a linear memory block without considering memory re-use. It serves as an upper bound for total memory consumption and serves as a baseline. 
-* The Greedy algorithm tries to re-use the already allocated memory and choose based on the best-fit criteria. Specifically: +* The Greedy algorithm tries to re-use the already allocated memory based on the best-fit criteria. Specifically: When there isn’t an allocated memory whose lifetime doesn’t overlap with the current tensor that we try to do memory planning for, we allocate a new memory buffer with the same size and lifetime as the current tensor. When there is one or more allocated memory buffer, whose lifetime overlaps with the current tensor, we pick the buffer that has the closest size with current tensor so as to reduce memory fragmentation. Finally, we allocate these memory buffers linearly in memory. @@ -48,7 +48,7 @@ Users can write custom memory plans to take advantage of multiple memory locatio ```python class CustomPoolMemoryPlanningPass(MemoryPlanningPass): - def call(self, graph_module: GraphModule) -> PassResult: + def run(self, graph_module: GraphModule, graph_signature: Optional[ExportGraphSignature]) -> PassResult: for subgm in graph_module.modules(): if not isinstance(subgm, GraphModule): continue @@ -68,7 +68,7 @@ class CustomPoolMemoryPlanningPass(MemoryPlanningPass): elif node.target == torch.ops.aten.mul.out: node.meta["spec"].mem_id = 1 - return super().call(graph_module) + return super().run(graph_module, graph_signature) ``` Then later when lowering to ExecuTorch you can use your custom plan in the following way: @@ -83,4 +83,4 @@ program = edge_program.to_executorch( ) ``` -Users attempting to write a custom memory planning algorithm should start by looking at [the greedy algorithm's implementation](https://github.com/pytorch/executorch/blob/d62c41ca86435e5316e7ed292b6d68aff27a2fb7/exir/memory_planning.py#L459C1-L459C12) +Users attempting to write a custom memory planning algorithm should start by looking at [the greedy algorithm's implementation](https://github.com/pytorch/executorch/blob/d62c41ca86435e5316e7ed292b6d68aff27a2fb7/exir/memory_planning.py#L459C1-L459C12). diff --git a/docs/source/conf.py b/docs/source/conf.py index 239319f7c2f..2f72a01d22d 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -73,38 +73,21 @@ # Get ET_VERSION_DOCS during the build. et_version_docs = os.environ.get("ET_VERSION_DOCS", None) - +print(f"et_version_docs: {et_version_docs}") # The code below will cut version displayed in the dropdown like this: -# tags like v0.1.0 = > 0.1 -# branch like release/0.1 => 0.1 -# main will remain main -# if not set will fail back to main +# By default, set to "main". 
+# If it's a tag like refs/tags/v1.2.3-rc4 or refs/tags/v1.2.3, then +# cut to 1.2 # the version varible is used in layout.html: https://github.com/pytorch/executorch/blob/main/docs/source/_templates/layout.html#L29 +version = release = "main" if et_version_docs: - # Check if starts with release/ and set the version to the number after slash - if et_version_docs.startswith("release/"): - version = et_version_docs.split("/")[-1] - else: - # Remove "v" prefix if present - if et_version_docs.startswith("v"): - et_version_docs = et_version_docs[1:] - # Split to major, minor, and patch - version_components = et_version_docs.split(".") - - # Combine the major and minor version components: - if len(version_components) >= 2: - version = release = ".".join(version_components[:2]) - else: - # If there are not enough components, use the full version - version = release = et_version_docs - - html_title = " ".join((project, version, "documentation")) -# IF ET_VERSION_DOCS not set, set version to main. -# This can be updated to nightly and so on. -else: - version = "main" - release = "main" + if et_version_docs.startswith("refs/tags/v"): + version = ".".join( + et_version_docs.split("/")[-1].split("-")[0].lstrip("v").split(".")[:2] + ) +print(f"Version: {version}") +html_title = " ".join((project, version, "documentation")) breathe_projects = {"ExecuTorch": "../build/xml/"} breathe_default_project = "ExecuTorch" diff --git a/docs/source/debug-backend-delegate.md b/docs/source/debug-backend-delegate.md new file mode 100644 index 00000000000..ebcf94136c7 --- /dev/null +++ b/docs/source/debug-backend-delegate.md @@ -0,0 +1,65 @@ +# Debug Backend Delegate + +We provide a list of util functions to give users insights on what happened to the graph modules during the `to_backend()` stage. + +## Get delegation summary +The `get_delegation_info()` method provides a summary of what happened to the model after the `to_backend()` call: + +```python +from executorch.exir.backend.utils import get_delegation_info +from tabulate import tabulate + +# ... After call to to_backend(), but before to_executorch() +graph_module = edge_manager.exported_program().graph_module +delegation_info = get_delegation_info(graph_module) +print(delegation_info.get_summary()) +df = delegation_info.get_operator_delegation_dataframe() +print(tabulate(df, headers="keys", tablefmt="fancy_grid")) +``` + +Example printout: +``` +Total delegated subgraphs: 86 +Number of delegated nodes: 473 +Number of non-delegated nodes: 430 +``` + + +| | op_type | occurrences_in_delegated_graphs | occurrences_in_non_delegated_graphs | +|----|---------------------------------|------- |-----| +| 0 | aten__softmax_default | 12 | 0 | +| 1 | aten_add_tensor | 37 | 0 | +| 2 | aten_addmm_default | 48 | 0 | +| 3 | aten_arange_start_step | 0 | 25 | +| | ... | | | +| 23 | aten_view_copy_default | 170 | 48 | +| | ... | | | +| 26 | Total | 473 | 430 | + +From the table, the operator `aten_view_copy_default` appears 170 times in delegate graphs and 48 times in non-delegated graphs. Users can use information like this to debug. + +## Visualize delegated graph +To see a more detailed view, use the `print_delegated_graph()` method to display a printout of the whole graph: + +```python +from executorch.exir.backend.utils import print_delegated_graph +graph_module = edge_manager.exported_program().graph_module +print(print_delegated_graph(graph_module)) +``` +It will print the whole model as well as the subgraph consumed by the backend. 
The generic debug functions provided by fx, such as `print_tabular()` or `print_readable()`, will only show `call_delegate` but hide the subgraph consumed by the backend, while this function exposes the contents inside the subgraph.
+
+In the example printout below, observe that `embedding` and `add` operators are delegated to `XNNPACK` while the `sub` operator is not.
+
+```
+%aten_unsqueeze_copy_default_22 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.unsqueeze_copy.default](args = (%aten_arange_start_step_23, -2), kwargs = {})
+ %aten_unsqueeze_copy_default_23 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.unsqueeze_copy.default](args = (%aten_arange_start_step_24, -1), kwargs = {})
+ %lowered_module_0 : [num_users=1] = get_attr[target=lowered_module_0]
+ backend_id: XnnpackBackend
+ lowered graph():
+ %aten_embedding_default : [num_users=1] = placeholder[target=aten_embedding_default]
+ %aten_embedding_default_1 : [num_users=1] = placeholder[target=aten_embedding_default_1]
+ %aten_add_tensor : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.add.Tensor](args = (%aten_embedding_default, %aten_embedding_default_1), kwargs = {})
+ return (aten_add_tensor,)
+ %executorch_call_delegate : [num_users=1] = call_function[target=torch.ops.higher_order.executorch_call_delegate](args = (%lowered_module_0, %aten_embedding_default, %aten_embedding_default_1), kwargs = {})
+ %aten_sub_tensor : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.sub.Tensor](args = (%aten_unsqueeze_copy_default, %aten_unsqueeze_copy_default_1), kwargs = {})
+```
diff --git a/docs/source/demo-apps-ios.md b/docs/source/demo-apps-ios.md
index e04b6cae681..d68b1309e2b 100644
--- a/docs/source/demo-apps-ios.md
+++ b/docs/source/demo-apps-ios.md
@@ -1 +1 @@
-```{include} ../../examples/demo-apps/apple_ios/README.md
+```{include} ../../examples/demo-apps/apple_ios/ExecuTorchDemo/README.md
diff --git a/docs/source/getting-started-setup.md b/docs/source/getting-started-setup.md
index ffc33498483..6abbbfc0642 100644
--- a/docs/source/getting-started-setup.md
+++ b/docs/source/getting-started-setup.md
@@ -83,7 +83,7 @@ portability details.
```bash
# Clone the ExecuTorch repo from GitHub
- git clone https://github.com/pytorch/executorch.git
+ git clone --branch v0.2.0 https://github.com/pytorch/executorch.git
cd executorch
# Update and pull submodules
diff --git a/docs/source/index.rst b/docs/source/index.rst
index adbda475aa2..210839cf57c 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -100,6 +100,7 @@ Topics in this section will help you get started with ExecuTorch.
demo-apps-android
examples-end-to-end-to-lower-model-to-delegate
tutorial-xnnpack-delegate-lowering
+ build-run-vulkan
.. Alphabetical by backend name. Be sure to keep the same order in the customcarditem entries below.
@@ -183,8 +184,10 @@ Topics in this section will help you get started with ExecuTorch.
:hidden:
native-delegates-executorch-xnnpack-delegate
+ native-delegates-executorch-vulkan-delegate
backend-delegates-integration
backend-delegates-dependencies
+ debug-backend-delegate
.. toctree::
:glob:
@@ -262,6 +265,13 @@ ExecuTorch tutorials.
:link: tutorial-xnnpack-delegate-lowering.html
:tags: Export,Backend,Delegation,Quantization,XNNPACK
+..
customcarditem:: + :header: Building and Running ExecuTorch with Vulkan Backend + :card_description: A tutorial that walks you through the process of building ExecuTorch with Vulkan Backend + :image: _static/img/generic-pytorch-logo.png + :link: build-run-vulkan.html + :tags: Export,Backend,Delegation,Vulkan + .. Alphabetical by backend name. Be sure to keep the same order in the Tutorials toctree entry above. diff --git a/docs/source/kernel-library-custom-aten-kernel.md b/docs/source/kernel-library-custom-aten-kernel.md index 4b0794ea1c5..4d391b1a944 100644 --- a/docs/source/kernel-library-custom-aten-kernel.md +++ b/docs/source/kernel-library-custom-aten-kernel.md @@ -86,10 +86,88 @@ ATen operator with a dtype/dim order specialized kernel (works for `Double` dtyp kernel_name: torch::executor::add_out ``` +### Custom Ops C++ API + +For a custom kernel that implements a custom operator, we provides 2 ways to register it into ExecuTorch runtime: +1. Using `EXECUTORCH_LIBRARY` and `WRAP_TO_ATEN` C++ macros. +2. Using `functions.yaml` and codegen'd C++ libraries. + +The first option requires C++17 and doesn't have selective build support yet, but it's faster than the second option where we have to go through yaml authoring and build system tweaking. + +The first option is particularly suitable for fast prototyping but can also be used in production. + +Similar to `TORCH_LIBRARY`, `EXECUTORCH_LIBRARY` takes the operator name and the C++ function name and register them into ExecuTorch runtime. + +#### Prepare custom kernel implementation + +Define your custom operator schema for both functional variant (used in AOT compilation) and out variant (used in ExecuTorch runtime). The schema needs to follow PyTorch ATen convention (see native_functions.yaml). For example: + +```yaml +custom_linear(Tensor weight, Tensor input, Tensor(?) bias) -> Tensor +custom_linear.out(Tensor weight, Tensor input, Tensor(?) bias, *, Tensor(a!) out) -> Tensor(a!) +``` + +Then write your custom kernel according to the schema using ExecuTorch types, along with APIs to register to ExecuTorch runtime: + + +```c++ +// custom_linear.h/custom_linear.cpp +#include +Tensor& custom_linear_out(const Tensor& weight, const Tensor& input, optional bias, Tensor& out) { + // calculation + return out; +} +``` +#### Use a C++ macro to register it into PyTorch & ExecuTorch + +Append the following line in the example above: +```c++ +// custom_linear.h/custom_linear.cpp +// opset namespace myop +EXECUTORCH_LIBRARY(myop, "custom_linear.out", custom_linear_out); +``` + +Now we need to write some wrapper for this op to show up in PyTorch, but don’t worry we don’t need to rewrite the kernel. Create a separate .cpp for this purpose: + +```c++ +// custom_linear_pytorch.cpp +#include "custom_linear.h" +#include + +at::Tensor custom_linear(const at::Tensor& weight, const at::Tensor& input, std::optional bias) { + // initialize out + at::Tensor out = at::empty({weight.size(1), input.size(1)}); + // wrap kernel in custom_linear.cpp into ATen kernel + WRAP_TO_ATEN(custom_linear_out, 3)(weight, input, bias, out); + return out; +} +// standard API to register ops into PyTorch +TORCH_LIBRARY(myop, m) { + m.def("custom_linear(Tensor weight, Tensor input, Tensor(?) bias) -> Tensor", custom_linear); + m.def("custom_linear.out(Tensor weight, Tensor input, Tensor(?) bias, *, Tensor(a!) 
out) -> Tensor(a!)", WRAP_TO_ATEN(custom_linear_out, 3)); +} +``` + +#### Compile and link the custom kernel + +Link it into ExecuTorch runtime: In our `CMakeLists.txt`` that builds the binary/application, we just need to add custom_linear.h/cpp into the binary target. We can build a dynamically loaded library (.so or .dylib) and link it as well. + +Link it into PyTorch runtime: We need to package custom_linear.h, custom_linear.cpp and custom_linear_pytorch.cpp into a dynamically loaded library (.so or .dylib) and load it into our python environment. One way of doing this is: + +```python +import torch +torch.ops.load_library("libcustom_linear.so/dylib") + +# Now we have access to the custom op, backed by kernel implemented in custom_linear.cpp. +op = torch.ops.myop.custom_linear.default +``` + ### Custom Ops Yaml Entry -For custom ops (the ones that are not part of the out variants of core ATen opset) we need to specify the operator schema as well as a `kernel` section. So instead of `op` we use `func` with the operator schema. As an example, here’s a yaml entry for a custom op: +As mentioned above, this option provides more support in terms of selective build and features such as merging operator libraries. + +First we need to specify the operator schema as well as a `kernel` section. So instead of `op` we use `func` with the operator schema. As an example, here’s a yaml entry for a custom op: ```yaml - func: allclose.out(Tensor self, Tensor other, float rtol=1e-05, float atol=1e-08, bool equal_nan=False, bool dummy_param=False, *, Tensor(a!) out) -> Tensor(a!) kernels: @@ -159,6 +237,30 @@ target_link_libraries(executorch_binary generated_lib) ``` +We also provide the ability to merge two yaml files, given a precedence. `merge_yaml(FUNCTIONS_YAML functions_yaml FALLBACK_YAML fallback_yaml OUTPUT_DIR out_dir)` merges functions_yaml and fallback_yaml into a single yaml, if there's duplicate entries in functions_yaml and fallback_yaml, this macro will always take the one in functions_yaml. + +Example: + +```yaml +# functions.yaml +- op: add.out + kernels: + - arg_meta: null + kernel_name: torch::executor::opt_add_out +``` + +And out fallback: + +```yaml +# fallback.yaml +- op: add.out + kernels: + - arg_meta: null + kernel_name: torch::executor::add_out +``` + +The merged yaml will have the entry in functions.yaml. + #### Buck2 `executorch_generated_lib` is the macro that takes the yaml files and depends on the selective build macro `et_operator_library`. For an example: diff --git a/docs/source/llm/getting-started.md b/docs/source/llm/getting-started.md index eff8fd52ffb..ae743e8e6d0 100644 --- a/docs/source/llm/getting-started.md +++ b/docs/source/llm/getting-started.md @@ -1,5 +1,18 @@ # Getting Started with LLMs via ExecuTorch +Welcome to LLM Manual! This manual is designed to provide a practical example to leverage +ExecuTorch in onboarding your own Large Language Models (LLMs). Our primary goal is to offer + a clear and concise guideline on how to integrate our system with your own LLMs. + +Please note that this project is intended as a demonstration and not as a fully functional +example with optimal performance. As such, certain components such as the sampler, tokenizer, +and others are provided in their bare minimum versions solely for demonstration purposes. +Consequently, the results produced by the model may vary and might not always be optimal. 
+ +We encourage users to use this project as a starting point and adapt it to their specific needs, +which includes creating your own versions of the tokenizer, sampler, acceleration backends, and +other components. We hope this project serves as a useful guide in your journey with LLMs and ExecuTorch. + ### Table Of Contents @@ -14,208 +27,490 @@ ## Prerequisites -Let’s start by getting an ExecuTorch environment: +To follow this guide, you'll need to clone the ExecuTorch repository and install dependencies. +ExecuTorch recommends Python 3.10 and the use of Conda to manage your environment. Conda is not +required, though be aware that you may need to replace the use of python/pip with python3/pip3 +depending on your environment. + +::::{tab-set} +:::{tab-item} conda +Instructions on installing miniconda can be [found here](https://docs.anaconda.com/free/miniconda). -1. Create a third-party folder (Keeps the file paths organized) -``` -mkdir third-party -cd third-party ``` -2. If you’re new to ExecuTorch follow [these steps](https://pytorch.org/executorch/main/getting-started-setup.html#set-up-your-environment) to set up your environment. +# Create a directory for this example. +mkdir et-nanogpt +cd et-nanogpt -## Instantiating and Executing an LLM +# Clone the ExecuTorch repository and submodules. +mkdir third-party +git clone -b release/0.2 https://github.com/pytorch/executorch.git third-party/executorch +cd third-party/executorch +git submodule update --init -We will use Karpathy’s [NanoGPT](https://github.com/karpathy/nanoGPT) but you can use another model if you prefer. +# Create a conda environment and install requirements. +conda create -yn executorch python=3.10.0 +conda activate executorch +pip install cmake zstd +./install_requirements.sh +cd ../.. +``` +::: +:::{tab-item} pyenv-virtualenv +Instructions on installing pyenv-virtualenv can be [found here](https://github.com/pyenv/pyenv-virtualenv?tab=readme-ov-file#installing-with-homebrew-for-macos-users). +Importantly, if installing pyenv through brew, it does not automatically enable pyenv in the terminal, leading to errors. Run the following commands to enable. +See the pyenv-virtualenv installation guide above on how to add this to your .bashrc or .zshrc to avoid needing to run these commands manually. +``` +eval "$(pyenv init -)" +eval "$(pyenv virtualenv-init -)" +``` -There are just 2 steps to this: +``` +# Create a directory for this example. +mkdir et-nanogpt +cd et-nanogpt -1. Export the LLM Model -2. Create a runtime to execute the model +pyenv install -s 3.10 +pyenv virtualenv 3.10 executorch +pyenv activate executorch +# Clone the ExecuTorch repository and submodules. +mkdir third-party +git clone -b release/0.2 https://github.com/pytorch/executorch.git third-party/executorch +cd third-party/executorch +git submodule update --init +# Install requirements. +pip install cmake zstd +PYTHON_EXECUTABLE=python ./install_requirements.sh +cd ../.. +``` +::: +:::: -Note: Reminder to exit out of the “third-party” directory, before proceeding. +For more information, see [Setting Up ExecuTorch](../getting-started-setup.md). -### Step 1. Export -[Exporting to ExecuTorch](https://pytorch.org/executorch/main/export-overview.html) simply describes taking an existing model and converting it to the ExecuTorch format. +## Running a Large Language Model Locally +This example uses Karpathy’s [nanoGPT](https://github.com/karpathy/nanoGPT), which is a minimal implementation of +GPT-2 124M. 
This guide is applicable to other language models, as ExecuTorch is model-invariant. +There are two steps to running a model with ExecuTorch: -To start, let’s retrieve our model: +1. Export the model. This step preprocesses it into a format suitable for runtime execution. +2. At runtime, load the model file and run with the ExecuTorch runtime. -`wget https://raw.githubusercontent.com/karpathy/nanoGPT/master/model.py` +
-Next, we’ll create a script (call it export.py) to generate the ExecuTorch Program (which gets dumped into an ExecuTorch Binary): +The export step happens ahead of time, typically as part of the application build or when the model changes. The resultant +.pte file is distributed with the application. At runtime, the application loads the .pte file and passes it to the +ExecuTorch runtime. +### Step 1. Exporting to ExecuTorch +Exporting takes a PyTorch model and converts it into a format that can run efficiently on consumer devices. -1. Create the model and example inputs -``` -import torch -from model import GPT +For this example, you will need the nanoGPT model and the corresponding tokenizer vocabulary. -model = GPT.from_pretrained('gpt2') -example_inputs = (torch.randint(0, 100, (1, 8), dtype=torch.long), ) +::::{tab-set} +:::{tab-item} curl +``` +curl https://raw.githubusercontent.com/karpathy/nanoGPT/master/model.py -O +curl https://huggingface.co/openai-community/gpt2/resolve/main/vocab.json -O +``` +::: +:::{tab-item} wget ``` +wget https://raw.githubusercontent.com/karpathy/nanoGPT/master/model.py +wget https://huggingface.co/openai-community/gpt2/resolve/main/vocab.json +``` +::: +:::: +To convert the model into a format optimized for standalone execution, there are two steps. First, use the PyTorch +`export` function to convert the PyTorch model into an intermediate, platform-independent intermediate representation. Then +use the ExecuTorch `to_edge` and `to_executorch` methods to prepare the model for on-device execution. This creates a .pte +file which can be loaded by a desktop or mobile application at runtime. +Create a file called export_nanogpt.py with the following contents: -2. Trace the model -Tracing extracts a cleaner representation of our model for conversion to ExecuTorch. -You can read more about tracing in [torch.export — PyTorch 2.2 documentation](https://pytorch.org/docs/stable/export.html). +```python +# export_nanogpt.py -``` -from torch.nn.attention import sdpa_kernel, SDPBackend +import torch + +from executorch.exir import EdgeCompileConfig, to_edge +from torch.nn.attention import sdpa_kernel, SDPBackend from torch._export import capture_pre_autograd_graph from torch.export import export -# Using a custom SDPA kernel for LLMs -with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad(): +from model import GPT -m = capture_pre_autograd_graph(model, example_inputs) +# Load the model. +model = GPT.from_pretrained('gpt2') + +# Create example inputs. This is used in the export process to provide +# hints on the expected shape of the model input. +example_inputs = (torch.randint(0, 100, (1, model.config.block_size), dtype=torch.long), ) + +# Set up dynamic shape configuration. This allows the sizes of the input tensors +# to differ from the sizes of the tensors in `example_inputs` during runtime, as +# long as they adhere to the rules specified in the dynamic shape configuration. +# Here we set the range of 0th model input's 1st dimension as +# [0, model.config.block_size]. +# See https://pytorch.org/executorch/main/concepts.html#dynamic-shapes +# for details about creating dynamic shapes. +dynamic_shape = ( + {1: torch.export.Dim("token_dim", max=model.config.block_size)}, +) -traced_model = export(m, example_inputs) -``` +# Trace the model, converting it to a portable intermediate representation. +# The torch.no_grad() call tells PyTorch to exclude training-specific logic. 
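+# Note: the math SDPA backend is selected here because it is implemented with
+# basic, decomposable operations that torch.export can trace; fused attention
+# kernels are generally not export-friendly.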
+with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad(): + m = capture_pre_autograd_graph(model, example_inputs, dynamic_shapes=dynamic_shape) + traced_model = export(m, example_inputs, dynamic_shapes=dynamic_shape) -3. Export the model to ExecuTorch -Exporting (or lowering) takes the model and creates a runnable ExecuTorch program, without delegate to any specific bakends for further acceleration. -``` -from executorch.exir import EdgeCompileConfig, to_edge +# Convert the model into a runnable ExecuTorch program. +edge_config = EdgeCompileConfig(_check_ir_validity=False) +edge_manager = to_edge(traced_model, compile_config=edge_config) +et_program = edge_manager.to_executorch() -edge_config = EdgeCompileConfig(_check_ir_validity=False) -edge_manager = to_edge(traced_model, compile_config=edge_config) -et_program = edge_manager.to_executorch() +# Save the ExecuTorch program to a file. +with open("nanogpt.pte", "wb") as file: + file.write(et_program.buffer) ``` -Also ExecuTorch provides different backend support for mobile acceleration. Simply call `to_backend()` with the specific backend partitioner on edge_manager during exportation. Take Xnnpack delegation as an example: +To export, run the script with `python export_nanogpt.py` (or python3, as appropriate for your environment). It will generate a `nanogpt.pte` file in the current directory. +For more information, see [Exporting to ExecuTorch](../tutorials/export-to-executorch-tutorial) and +[torch.export](https://pytorch.org/docs/stable/export.html). -``` -from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner -from executorch.backends.xnnpack.utils.configs import get_xnnpack_edge_compile_config -from executorch.exir import EdgeCompileConfig, to_edge +### Step 2. Invoking the Runtime -edge_config = edge_config = get_xnnpack_edge_compile_config() -edge_manager = to_edge(traced_model, compile_config=edge_config) -edge_manager = edge_manager.to_backend(XnnpackPartitioner()) +ExecuTorch provides a set of runtime APIs and types to load and run models. -et_program = edge_manager.to_executorch() -``` +Create a file called main.cpp with the following contents: -After that, we’re ready to run our model. Remember to save you model before proceeding: +```cpp +// main.cpp -``` -#Write the serialized ExecuTorch program to a file. -with open("nanogpt.pte", "wb") as file: -file.write(et_program.buffer) +#include +#include +#include +#include + +#include "basic_tokenizer.h" +#include "basic_sampler.h" +#include "managed_tensor.h" + +#include +#include +#include +#include +#include + +using namespace torch::executor; + +using SizesType = exec_aten::SizesType; +using DimOrderType = exec_aten::DimOrderType; +using StridesType = exec_aten::StridesType; ``` +The model inputs and outputs take the form of tensors. A tensor can be thought of as an multi-dimensional array. +The ExecuTorch `EValue` class provides a wrapper around tensors and other ExecuTorch data types. -Then run the script. -`python export.py` +Since the LLM generates one token at a time, the driver code needs to repeatedly invoke the model, building the +output token by token. Each generated token is passed as input for the next run. -### Step 2. Running the model -Running model stands for executing the exported model on ExecuTorch runtime platform. +```cpp +// main.cpp -Before running, we need to retrieve vocabulary file GPT2 used for tokenization: +// The value of the gpt2 `<|endoftext|>` token. 
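+// Generation stops early if the model produces this token (see the loop in
+// generate() below).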
+#define ENDOFTEXT_TOKEN 50256 -``` -wget https://huggingface.co/openai-community/gpt2/resolve/main/vocab.json -``` -1. Create the prompt: -Prompt here means the initial cue given to the model, which it uses as a starting point to generate following sentences. Here we use “Hello world!” as example: +std::string generate( + Module& llm_model, + std::string& prompt, + BasicTokenizer& tokenizer, + BasicSampler& sampler, + size_t max_input_length, + size_t max_output_length) { + // Convert the input text into a list of integers (tokens) that represents + // it, using the string-to-token mapping that the model was trained on. + // Each token is an integer that represents a word or part of a word. + std::vector input_tokens = tokenizer.encode(prompt); + std::vector output_tokens; -``` -string prompt = "Hello world!"; + for (auto i = 0u; i < max_output_length; i++) { + // Convert the input_tokens from a vector of int64_t to EValue. + // EValue is a unified data type in the ExecuTorch runtime. + ManagedTensor tensor_tokens( + input_tokens.data(), + {1, static_cast(input_tokens.size())}, + ScalarType::Long); + std::vector inputs = {tensor_tokens.get_tensor()}; + + // Run the model. It will return a tensor of logits (log-probabilities). + Result> logits_evalue = llm_model.forward(inputs); + + // Convert the output logits from EValue to std::vector, which is what + // the sampler expects. + Tensor logits_tensor = logits_evalue.get()[0].toTensor(); + std::vector logits(logits_tensor.data_ptr(), + logits_tensor.data_ptr() + logits_tensor.numel()); + + // Sample the next token from the logits. + int64_t next_token = sampler.sample(logits); + + // Break if we reached the end of the text. + if (next_token == ENDOFTEXT_TOKEN) { + break; + } + + // Add the next token to the output. + output_tokens.push_back(next_token); + + std::cout << tokenizer.decode({ next_token }); + std::cout.flush(); + + // Update next input. + input_tokens.push_back(next_token); + if (input_tokens.size() > max_input_length) { + input_tokens.erase(input_tokens.begin()); + } + } + + std::cout << std::endl; + + // Convert the output tokens into a human-readable string. + std::string output_string = tokenizer.decode(output_tokens); + return output_string; +} ``` +The `Module` class handles loading the .pte file and preparing for execution. -2. Load tokenizer and model -A Tokenizer is a crucial component among different Natural Language Processing (NLP) tasks. The primary functionalities are: +The tokenizer is responsible for converting from a human-readable string representation of the prompt to the +numerical form expected by the model. To do this, the tokenzier associates short substrings with a given token ID. +The tokens can be thought of as representing words or parts of words, though, in-practice, they may be arbitrary +sequences of characters. -- Encode: Convert text into structural and numerical representations by parsing text into smaller units.Each unit is replaced by a specific number for the NLP model to consume +The tokenizer loads the vocabulary from a file, which contains the mapping between each token ID and the text it +represents. Call `tokenizer.encode()` and `tokenizer.decode()` to convert between string and token representations. -- Decode: Convert the numerical representations back for human interpretation. +The sampler is responsible for selecting the next token, based on the logits, or log-probabilties, output by the +model. The LLM returns a logit value for each possible next token. 
The sampler chooses which token to use based +on some strategy. The simplest approach, used here, is to take the token with the highest logit value. +Samplers may provide configurable options, such as configurable amount of randomness to the outputs selection, +penalties for repeated tokens, and biases to prioritize or de-prioritize specific tokens. -In our NanoGPT example, we create a simple tokenizer called BasicTokenizer to demonstrate the function. You can use other implementations like [tiktoken](https://github.com/openai/tiktoken) or your own implementation to do that. +```cpp +// main.cpp -``` -#include "basic_tokenizer.h" -BasicTokenizer tokenizer("vocab.json"); -``` +int main() { + // Set up the prompt. This provides the seed text for the model to elaborate. + std::cout << "Enter model prompt: "; + std::string prompt; + std::getline(std::cin, prompt); + // The tokenizer is used to convert between tokens (used by the model) and + // human-readable strings. + BasicTokenizer tokenizer("vocab.json"); -To load the exported ExecuTorch model into runtime environment, we can use **Module** class: + // The sampler is used to sample the next token from the logits. + BasicSampler sampler = BasicSampler(); + // Load the exported nanoGPT program, which was generated via the previous steps. + Module model("nanogpt.pte", torch::executor::Module::MlockConfig::UseMlockIgnoreErrors); -``` -#include -Module llm_model("nanogpt.pte"); + const auto max_input_tokens = 1024; + const auto max_output_tokens = 30; + std::cout << prompt; + generate(model, prompt, tokenizer, sampler, max_input_tokens, max_output_tokens); +} ``` +Finally, download the following files into the same directory as main.h: -3. Tokenize the prompt ``` -vector tokens = tokenizer.encode(prompt); +curl -O https://raw.githubusercontent.com/pytorch/executorch/main/examples/llm_manual/basic_sampler.h +curl -O https://raw.githubusercontent.com/pytorch/executorch/main/examples/llm_manual/basic_tokenizer.h +curl -O https://raw.githubusercontent.com/pytorch/executorch/main/examples/llm_manual/managed_tensor.h ``` -4. Generate outputs -We use the loaded model to generate text based on tokenized prompt. Here we create a helper function to illustrate the pipeline: +To learn more, see [Running an ExecuTorch Model in C++](../running-a-model-cpp-tutorial.md) +and the [ExecuTorch Runtime API Reference](../executorch-runtime-api-reference.md). + +### Building and Running + +ExecuTorch uses the CMake build system. To compile and link against the ExecuTorch runtime, +include the ExecuTorch project via `add_directory` and link against `executorch` and additional +dependencies. + +Create a file named CMakeLists.txt with the following content: ``` -vector generate(Module& llm_model, vector& input_tokens, BasicSampler& sampler, size_t target_output_length) { - vector output_tokens; - for (int i = 0; i < target_output_length; i++) { - // Convert the input_tokens from a vector of int64_t to EValue. - // Evalue is a unified data type in the executorch runtime. - ManagedTensor tensor_tokens(input_tokens.data(), {1, 8}, ScalarType::Long); - vector inputs = {tensor_tokens.get_tensor()}; - // Run the model given the Evalue inputs. The model will also return a sequence of EValues as output. - Result> logits_evalue = llm_model.forward(inputs); - // Convert the output from EValue to a logits in float. 
- Tensor logits_tensor = logits_evalue.get()[0].toTensor(); - vector logits(logits_tensor.data_ptr(), logits_tensor.data_ptr() + logits_tensor.numel()); - // Sample the next token from the logits. - int64_t next_token = sampler.sample(logits); - // Record the next token - output_tokens.push_back(next_token); - // Update next input. - input_tokens.erase(input_tokens.begin()); - input_tokens.push_back(next_token); - } - return output_tokens; -} +# CMakeLists.txt + +cmake_minimum_required(VERSION 3.19) +project(nanogpt_runner) +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED True) + +# Set options for executorch build. +option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "" ON) +option(EXECUTORCH_BUILD_EXTENSION_MODULE "" ON) +option(EXECUTORCH_BUILD_OPTIMIZED "" ON) + +# Include the executorch subdirectory. +add_subdirectory( + ${CMAKE_CURRENT_SOURCE_DIR}/third-party/executorch + ${CMAKE_BINARY_DIR}/third-party/executorch) + +add_executable(nanogpt_runner main.cpp) +target_link_libraries( + nanogpt_runner + PRIVATE + executorch + extension_module_static # Provides the Module class + optimized_native_cpu_ops_lib) # Provides baseline cross-platform kernels +``` + +At this point, the working directory should contain the following files: + +- CMakeLists.txt +- main.cpp +- basic_tokenizer.h +- basic_sampler.h +- managed_tensor.h +- export_nanogpt.py +- model.py +- vocab.json +- nanogpt.pte + +If all of these are present, you can now build and run: +```bash +(rm -rf cmake-out && mkdir cmake-out && cd cmake-out && cmake ..) +cmake --build cmake-out -j10 +./cmake-out/nanogpt_runner ``` +You should see the message: -And in the main function, we leverage the function to generate the outputs. ``` -vector outputs = generate(llm_model, tokens, sampler, /*target_output_length*/20); +Enter model prompt: ``` -Notice that here outputs are tokens, rather than actual natural language. -5. Decode the output. -We convert the generated output tokens back to natural language for better understanding: +Type some seed text for the model and press enter. Here we use "Hello world!" as +an example prompt: ``` -string out_str = tokenizer.decode(outputs); -``` +Enter model prompt: Hello world! +Hello world! -6. Print the generated text +I'm not sure if you've heard of the "Curse of the Dragon" or not, but it's a very popular game in ``` -cout << "output: " << out_str << endl; + +At this point, it is likely to run very slowly. This is because ExecuTorch hasn't been told to optimize for +specific hardware (delegation), and because it is doing all of the calculations in 32-bit floating point (no quantization). + +## Delegation + +While ExecuTorch provides a portable, cross-platform implementation for all +operators, it also provides specialized backends for a number of different +targets. These include, but are not limited to, x86 and ARM CPU acceleration via +the XNNPACK backend, Apple acceleration via the Core ML backend and Metal +Performance Shader (MPS) backend, and GPU acceleration via the Vulkan backend. + +Because optimizations are specific to a given backend, each pte file is specific +to the backend(s) targeted at export. To support multiple devices, such as +XNNPACK acceleration for Android and Core ML for iOS, export a separate PTE file +for each backend. + +To delegate to a backend at export time, ExecuTorch provides the `to_backend()` +function in the `EdgeProgramManager` object, which takes a backend-specific +partitioner object. 
The partitioner is responsible for finding parts of the +computation graph that can be accelerated by the target backend,and +`to_backend()` function will delegate matched part to given backend for +acceleration and optimization. Any portions of the computation graph not +delegated will be executed by the ExecuTorch operator implementations. + +To delegate the exported model to a specific backend, we need to import its +partitioner as well as edge compile config from ExecuTorch codebase first, then +call `to_backend` with an instance of partitioner on the `EdgeProgramManager` +object `to_edge` function created. + +Here's an example of how to delegate nanoGPT to XNNPACK (if you're deploying to an Android phone for instance): + +```python +# export_nanogpt.py + +# Load partitioner for Xnnpack backend +from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner + +# Model to be delegated to specific backend should use specific edge compile config +from executorch.backends.xnnpack.utils.configs import get_xnnpack_edge_compile_config +from executorch.exir import EdgeCompileConfig, to_edge + +import torch +from torch.export import export +from torch.nn.attention import sdpa_kernel, SDPBackend +from torch._export import capture_pre_autograd_graph + +from model import GPT + +# Load the nanoGPT model. +model = GPT.from_pretrained('gpt2') + +# Create example inputs. This is used in the export process to provide +# hints on the expected shape of the model input. +example_inputs = ( + torch.randint(0, 100, (1, model.config.block_size - 1), dtype=torch.long), + ) + +# Set up dynamic shape configuration. This allows the sizes of the input tensors +# to differ from the sizes of the tensors in `example_inputs` during runtime, as +# long as they adhere to the rules specified in the dynamic shape configuration. +# Here we set the range of 0th model input's 1st dimension as +# [0, model.config.block_size]. +# See https://pytorch.org/executorch/main/concepts.html#dynamic-shapes +# for details about creating dynamic shapes. +dynamic_shape = ( + {1: torch.export.Dim("token_dim", max=model.config.block_size - 1)}, +) + +# Trace the model, converting it to a portable intermediate representation. +# The torch.no_grad() call tells PyTorch to exclude training-specific logic. +with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad(): + m = capture_pre_autograd_graph(model, example_inputs, dynamic_shapes=dynamic_shape) + traced_model = export(m, example_inputs, dynamic_shapes=dynamic_shape) + +# Convert the model into a runnable ExecuTorch program. +# To be further lowered to Xnnpack backend, `traced_model` needs xnnpack-specific edge compile config +edge_config = get_xnnpack_edge_compile_config() +edge_manager = to_edge(traced_model, compile_config=edge_config) + +# Delegate exported model to Xnnpack backend by invoking `to_backend` function with Xnnpack partitioner. +edge_manager = edge_manager.to_backend(XnnpackPartitioner()) +et_program = edge_manager.to_executorch() + +# Save the Xnnpack-delegated ExecuTorch program to a file. +with open("nanogpt.pte", "wb") as file: + file.write(et_program.buffer) + + ``` -### Build and Run -1. Create the Cmake file for build +Additionally, update CMakeLists.txt to build and link the XNNPACK backend to +ExecuTorch runner. + ``` cmake_minimum_required(VERSION 3.19) project(nanogpt_runner) @@ -223,71 +518,103 @@ project(nanogpt_runner) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED True) - # Set options for executorch build. 
option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "" ON)
option(EXECUTORCH_BUILD_EXTENSION_MODULE "" ON)
-option(EXECUTORCH_BUILD_XNNPACK "" ON)
-option(EXECUTORCH_BUILD_SDK "" ON) # Needed for etdump
+option(EXECUTORCH_BUILD_OPTIMIZED "" ON)
+option(EXECUTORCH_BUILD_XNNPACK "" ON) # Build with Xnnpack backend
# Include the executorch subdirectory.
add_subdirectory(
- ${CMAKE_CURRENT_SOURCE_DIR}/../executorch
+ ${CMAKE_CURRENT_SOURCE_DIR}/third-party/executorch
${CMAKE_BINARY_DIR}/executorch)
# include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src)
-add_executable(nanogpt_runner nanogpt_runner.cpp)
+add_executable(nanogpt_runner main.cpp)
target_link_libraries(
nanogpt_runner
PRIVATE
- etdump
- extension_module
- portable_ops_lib)
-
+ executorch
+ extension_module_static # Provides the Module class
+ optimized_native_cpu_ops_lib # Provides baseline cross-platform kernels
+ xnnpack_backend) # Provides the XNNPACK CPU acceleration backend
```
-This CMake file links the ExecuTorch codebase, along with the necessary extensions and XNNPACK modules, to the nanogpt runner.
+Keep the rest of the code the same. For more details, refer to [Exporting
+to ExecuTorch](#step-1-exporting-to-executorch) and [Invoking the
+Runtime](#step-2-invoking-the-runtime).
-2. Build the c++ environment for nanorunner
-```
-(rm -rf cmake-out \
- && mkdir cmake-out \
- && cd cmake-out \
- && cmake ..)
-```
+At this point, the working directory should contain the following files:
-3. With this CMake file as well as built environment iin place, you can build the nanogpt runner binary by executing the following command:
+- CMakeLists.txt
+- main.cpp
+- basic_tokenizer.h
+- basic_sampler.h
+- managed_tensor.h
+- export_nanogpt.py
+- model.py
+- vocab.json
+If all of these are present, you can now export the XNNPACK-delegated `.pte` model:
+```bash
+python export_nanogpt.py
```
-cmake --build cmake-out --target nanogpt_runner -j9
+
+It will generate `nanogpt.pte` in the same working directory.
+
+Then we can build and run the model:
+```bash
+(rm -rf cmake-out && mkdir cmake-out && cd cmake-out && cmake ..)
+cmake --build cmake-out -j10
+./cmake-out/nanogpt_runner
```
-4. After the build is complete, you can run the binary with this command:
+
+You should see the message:
+
```
-./cmake-out/nanogpt_runner
+Enter model prompt:
```
-If everything worked it should see something like this:
+
+Type some seed text for the model and press enter. Here we use "Hello world!" as
+an example prompt:
+
```
-prompt: Hello world!
-output: Hello world!
+Enter model prompt: Hello world!
+Hello world!

-I'm not sure if you've heard of the "Curse of the Dragon" or
+I'm not sure if you've heard of the "Curse of the Dragon" or not, but it's a very popular game in
```

-## Quantization (Optional)
+The delegated model should be noticeably faster compared to the non-delegated model.

-Quantization refers to a set of techniques for running calculations and storing tensors using lower precision types. Compared to 32-bit floating point, using 8-bit integers can provide both a significant speedup and reduction in memory usage. There are many approaches to quantizing a model, varying in amount of pre-processing required, data types used, and impact on model accuracy and performance.
+For more information regarding backend delegation, see the ExecuTorch guides
+for the [XNNPACK Backend](../tutorial-xnnpack-delegate-lowering.md) and [Core ML
+Backend](../build-run-coreml.md).
-Because compute and memory are highly constrained on mobile devices, some form of quantization is necessary to ship large models on consumer electronics. In particular, large language models, such as Llama2, may require quantizing model weights to 4 bits or less. +## Quantization -Leveraging quantization requires transforming the model before export. PyTorch provides multiple quantization flows. Because we are quantizing a model for export, we need to use the PyTorch 2.0 export (pt2e) quantization API. +Quantization refers to a set of techniques for running calculations and storing tensors using lower precision types. +Compared to 32-bit floating point, using 8-bit integers can provide both a significant speedup and reduction in +memory usage. There are many approaches to quantizing a model, varying in amount of pre-processing required, data +types used, and impact on model accuracy and performance. -This example targets CPU acceleration using the XNNPACK delegate. As such, we need to use the XNNPACK-specific quantizer. Targeting a different backend will require use of the corresponding quantizer. +Because compute and memory are highly constrained on mobile devices, some form of quantization is necessary to ship +large models on consumer electronics. In particular, large language models, such as Llama2, may require quantizing +model weights to 4 bits or less. -To use 8-bit integer dynamic quantization with the XNNPACK delegate, perform the following calls prior to calling export. This will update and annotate the computational graph to use quantized operators, where available. +Leveraging quantization requires transforming the model before export. PyTorch provides the pt2e (PyTorch 2 Export) +API for this purpose. This example targets CPU acceleration using the XNNPACK delegate. As such, it needs to use the + XNNPACK-specific quantizer. Targeting a different backend will require use of the corresponding quantizer. + +To use 8-bit integer dynamic quantization with the XNNPACK delegate, call `prepare_pt2e`, calibrate the model by +running with a representative input, and then call `convert_pt2e`. This updates the computational graph to use +quantized operators where available. + +```python +# export_nanogpt.py -``` from executorch.backends.transforms.duplicate_dynamic_quant_chain import ( DuplicateDynamicQuantChainPass, ) @@ -296,7 +623,9 @@ from torch.ao.quantization.quantizer.xnnpack_quantizer import ( XNNPACKQuantizer, ) from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e +``` +```python # Use dynamic, per-channel quantization. xnnpack_quant_config = get_symmetric_quantization_config( is_per_channel=True, is_dynamic=True @@ -318,48 +647,53 @@ m = convert_pt2e(m, fold_quantize=False) DuplicateDynamicQuantChainPass()(m) traced_model = export(m, example_inputs) - ``` -Additionally, add or update the to_backend() call to use XnnpackDynamicallyQuantizedPartitioner. This will instruct the lowering logic to emit the correct quantized operators. +Additionally, add or update the `to_backend()` call to use `XnnpackPartitioner`. This instructs ExecuTorch to +optimize the model for CPU execution via the XNNPACK backend. -``` +```python from executorch.backends.xnnpack.partition.xnnpack_partitioner import ( - XnnpackDynamicallyQuantizedPartitioner, + XnnpackPartitioner, ) +``` +```python edge_manager = to_edge(traced_model, compile_config=edge_config) - -# Lower to XNNPACK using the appropriate quantized partitioner. 
-edge_manager = edge_manager.to_backend(XnnpackDynamicallyQuantizedPartitioner()) - +edge_manager = edge_manager.to_backend(XnnpackPartitioner()) # Lower to XNNPACK. et_program = edge_manager.to_executorch() ``` -Finally, update the CMakeLists.txt to link the XNNPACK backend with the runner. + +Finally, ensure that the runner links against the `xnnpack_backend` target in CMakeLists.txt. ``` -add_executable(nanogpt_runner nanogpt_runner.cpp) +add_executable(nanogpt_runner main.cpp) target_link_libraries( nanogpt_runner PRIVATE - etdump - extension_module - portable_ops_lib - xnnpack_backend) # Link the XNNPACK backend + executorch + extension_module_static # Provides the Module class + optimized_native_cpu_ops_lib # Provides baseline cross-platform kernels + xnnpack_backend) # Provides the XNNPACK CPU acceleration backend ``` -## Debugging and Profiling -After lowering a model by calling to_backend(), you might want to see what got delegated and what didn’t. We provide util functions to help you get insight on the delegation, and with such information, you can debug and maybe improve the delegation. +For more information, see [Quantization in ExecuTorch](../quantization-overview.md). -### Debug the Delegation +## Profiling and Debugging +After lowering a model by calling `to_backend()`, you may want to see what got delegated and what didn’t. ExecuTorch +provides utility methods to give insight on the delegation. You can use this information to gain visibility into +the underlying computation and diagnose potential performance issues. Model authors can use this information to +structure the model in a way that is compatible with the target backend. -1. Get high level information -get_delegation_info gives you a summary of what happened to the model after the to_backend() call: +### Visualizing the Delegation -``` +The `get_delegation_info()` method provides a summary of what happened to the model after the `to_backend()` call: + +```python from executorch.exir.backend.utils import get_delegation_info from tabulate import tabulate +# ... 
After call to to_backend(), but before to_executorch() graph_module = edge_manager.exported_program().graph_module delegation_info = get_delegation_info(graph_module) print(delegation_info.get_summary()) @@ -367,8 +701,7 @@ df = delegation_info.get_operator_delegation_dataframe() print(tabulate(df, headers="keys", tablefmt="fancy_grid")) ``` - -Take NanoGPT lowered to XNNPACK as an example: +For nanoGPT targeting the XNNPACK backend, you might see the following: ``` Total delegated subgraphs: 86 Number of delegated nodes: 473 @@ -376,121 +709,122 @@ Number of non-delegated nodes: 430 ``` -| | op_type | occurrences_in_delegated_graphs | occurrences_in_non_delegated_graphs | +| | op_type | # in_delegated_graphs | # in_non_delegated_graphs | |----|---------------------------------|------- |-----| | 0 | aten__softmax_default | 12 | 0 | | 1 | aten_add_tensor | 37 | 0 | | 2 | aten_addmm_default | 48 | 0 | | 3 | aten_arange_start_step | 0 | 25 | -| 4 | aten_bmm_default | 24 | 0 | -| 5 | aten_clone_default | 0 | 38 | -| 6 | aten_embedding_default | 0 | 2 | -| 7 | aten_expand_copy_default | 48 | 0 | -| 8 | aten_full_default | 0 | 12 | -| 9 | aten_full_like_default | 0 | 12 | -| 10 | aten_gelu_default | 0 | 12 | -| 11 | aten_index_tensor | 0 | 1 | -| 12 | aten_le_scalar | 0 | 12 | -| 13 | aten_logical_and_default | 0 | 12 | -| 14 | aten_logical_not_default | 0 | 12 | -| 15 | aten_mm_default | 1 | 0 | -| 16 | aten_mul_scalar | 24 | 0 | -| 17 | aten_native_layer_norm_default | 0 | 25 | -| 18 | aten_permute_copy_default | 109 | 0 | -| 19 | aten_scalar_tensor_default | 0 | 12 | -| 20 | aten_split_with_sizes_copy_default | 0 | 12 | -| 21 | aten_sub_tensor | 0 | 12 | -| 22 | aten_unsqueeze_copy_default | 0 | 24 | +| | ... | | | | 23 | aten_view_copy_default | 170 | 48 | -| 24 | aten_where_self | 0 | 12 | -| 25 | getitem | 0 | 147 | +| | ... | | | | 26 | Total | 473 | 430 | -In the table, we see that op type aten_view_copy_default appears 170 times in delegate graphs and 48 times in non-delegated graphs. - -| 23 | aten_view_copy_default | 170 | 48 | - -From here, we might want to know in which part of the graph it wasn’t delegated. For that, you can use the `print_delegated_graph` util function to see a printout of the whole graph with highlighted lowered graphs. +From the table, the operator `aten_view_copy_default` appears 170 times in delegate graphs and 48 times in non-delegated graphs. +To see a more detailed view, use the `print_delegated_graph()` method to display a printout of the whole graph. -2. Print graph module -Call this function right after you call `to_backend()` - -``` +```python from executorch.exir.backend.utils import print_delegated_graph -graph_module = self.edge_manager.exported_program().graph_module +graph_module = edge_manager.exported_program().graph_module print(print_delegated_graph(graph_module)) ``` +This may generate a large amount of output for large models. Consider using "Control+F" or "Command+F" to locate the operator you’re interested in +(e.g. “aten_view_copy_default”). Observe which instances are not under lowered graphs. -On the printed graph, you can do "Control+F" (or "Command+F" on a Mac) on the operator type you’re interested in (e.g. “aten_view_copy_default”) and observe which ones of them are not under “lowered graph()”s. +In the fragment of the output for nanoGPT below, observe that embedding and add operators are delegated to XNNPACK while the sub operator is not. 
-### Performance Analysis (Optional) +``` +%aten_unsqueeze_copy_default_22 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.unsqueeze_copy.default](args = (%aten_arange_start_step_23, -2), kwargs = {}) + %aten_unsqueeze_copy_default_23 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.unsqueeze_copy.default](args = (%aten_arange_start_step_24, -1), kwargs = {}) + %lowered_module_0 : [num_users=1] = get_attr[target=lowered_module_0] + backend_id: XnnpackBackend + lowered graph(): + %aten_embedding_default : [num_users=1] = placeholder[target=aten_embedding_default] + %aten_embedding_default_1 : [num_users=1] = placeholder[target=aten_embedding_default_1] + %aten_add_tensor : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.add.Tensor](args = (%aten_embedding_default, %aten_embedding_default_1), kwargs = {}) + return (aten_add_tensor,) + %executorch_call_delegate : [num_users=1] = call_function[target=torch.ops.higher_order.executorch_call_delegate](args = (%lowered_module_0, %aten_embedding_default, %aten_embedding_default_1), kwargs = {}) + %aten_sub_tensor : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.sub.Tensor](args = (%aten_unsqueeze_copy_default, %aten_unsqueeze_copy_default_1), kwargs = {}) +``` + +### Performance Analysis -Through the ExecuTorch SDK, users are able to profile a model and inspect its latency performance. +Through the ExecuTorch SDK, users are able to profile model execution, giving timing information for each operator in the model. #### Prerequisites ##### ETRecord generation (Optional) -ETRecord contains model graphs and metadata for linking runtime results (such as profiling) to the eager model. You will be able to view all profiling events with just ETDump (see next section), but with ETRecord, you will also be able to link each event to the types of operators being executed, module hierarchy, and stack traces of the original PyTorch source code. For more information, see [https://pytorch.org/executorch/main/sdk-etrecord.html](https://pytorch.org/executorch/main/sdk-etrecord.html) +An ETRecord is an artifact generated at the time of export that contains model graphs and source-level metadata linking the ExecuTorch program to the original PyTorch model. You can view all profiling events without an ETRecord, though with an ETRecord, you will also be able to link each event to the types of operators being executed, module hierarchy, and stack traces of the original PyTorch source code. For more information, see [the ETRecord docs](../sdk-etrecord.md). - -**Steps for enablement:** -ETRecord is created during export. In your export script, you just called `to_edge() `and it returned edge_program_manager +In your export script, after calling `to_edge()` and `to_executorch()`, call `generate_etrecord()` with the `EdgeProgramManager` from `to_edge()` and the `ExecuTorchProgramManager` from `to_executorch()`. Make sure to copy the `EdgeProgramManager`, as the call to `to_backend()` mutates the graph in-place. ``` import copy +from executorch.sdk import generate_etrecord -# Make the deep copy right after your call to to_edge() -edge_program_manager_copy = copy.deepcopy(edge_program_manager) +# Make the deep copy immediately after to to_edge() +edge_manager_copy = copy.deepcopy(edge_manager) # ... 
-# Then generate ETRecord right after your call to to_executorch() -etrecord_path = "etrecord.bin" -generate_etrecord(etrecord_path, edge_program_manager_copy, et_program_manager) +# Generate ETRecord right after to_executorch() +etrecord_path = "etrecord.bin" +generate_etrecord(etrecord_path, edge_manager_copy, et_program) ``` -Run the export script, then the ETRecord should be generated under path ./etrecord.bin. - -##### ETDump generation -ETDump contains runtime results from executing an ExecuTorch model. For more information, see [https://pytorch.org/executorch/main/sdk-etdump.html](https://pytorch.org/executorch/main/sdk-etdump.html) +Run the export script and the ETRecord will be generated as `etrecord.bin`. +##### ETDump generation +An ETDump is an artifact generated at runtime containing a trace of the model execution. For more information, see [the ETDump docs](../sdk-etdump.md). -**Steps for enablement:** -You need to enable ETDump generation in your nanogpt_runner.cpp. +Include the ETDump header in your code. +```cpp +// main.cpp -1. Include the ETDump header in your code. -``` -#include +#include ``` -2. Create an Instance of the ETDumpGen class and pass it into the Module constructor -``` +Create an Instance of the ETDumpGen class and pass it to the Module constructor. +```cpp std::unique_ptr etdump_gen_ = std::make_unique(); -Module llm_model("nanogpt.pte", Module::MlockConfig::UseMlock, std::move(etdump_gen_)); +Module model("nanogpt.pte", torch::executor::Module::MlockConfig::UseMlockIgnoreErrors, std::move(etdump_gen_)); ``` -3. Dump out the ETDump buffer after call to generate() -``` +After calling `generate()`, save the ETDump to a file. You can capture multiple +model runs in a single trace, if desired. +```cpp torch::executor::ETDumpGen* etdump_gen = -static_cast(llm_model.event_tracer()); + static_cast(model.event_tracer()); ET_LOG(Info, "ETDump size: %zu blocks", etdump_gen->get_num_blocks()); etdump_result result = etdump_gen->get_etdump_data(); if (result.buf != nullptr && result.size > 0) { -// On a device with a file system users can just write it out -// to the file-system. -FILE* f = fopen("etdump.etdp", "w+"); -fwrite((uint8_t*)result.buf, 1, result.size, f); -fclose(f); -free(result.buf); + // On a device with a file system, users can just write it to a file. + FILE* f = fopen("etdump.etdp", "w+"); + fwrite((uint8_t*)result.buf, 1, result.size, f); + fclose(f); + free(result.buf); } ``` -4. Compile your binary with the `ET_EVENT_TRACER_ENABLED` pre-processor flag to enable events to be traced and logged into ETDump inside the ExecuTorch runtime. Add these to your CMakeLists.txt +Additionally, update CMakeLists.txt to build with SDK and enable events to be traced and logged into ETDump: ``` +option(EXECUTORCH_BUILD_SDK "" ON) + +# ... + +target_link_libraries( + nanogpt_runner + PRIVATE + executorch + extension_module_static # Provides the Module class + optimized_native_cpu_ops_lib # Provides baseline cross-platform kernels + xnnpack_backend # Provides the XNNPACK CPU acceleration backend + etdump) # Provides event tracing and logging + target_compile_options(executorch PUBLIC -DET_EVENT_TRACER_ENABLED) target_compile_options(portable_ops_lib PUBLIC -DET_EVENT_TRACER_ENABLED) ``` @@ -498,45 +832,35 @@ Run the runner, you will see “etdump.etdp” generated. #### Analyze with Inspector APIs -Once you’ve collected debug artifacts ETDump (and the optional ETRecord), you can feed them into Inspector APIs in order to get performance details. 
+Once you’ve collected debug artifacts ETDump (and optionally an ETRecord), you can use the Inspector API to view performance information. -##### Creating an Inspector -``` +```python from executorch.sdk import Inspector -inspector = Inspector(etdump_path="etdump.etdp", etrecord="etrecord.bin") -# If you did not generate an ETRecord, then just pass in ETDump: `inspector = Inspector(etdump_path="etdump.etdp")` -``` +inspector = Inspector(etdump_path="etdump.etdp") +# If you also generated an ETRecord, then pass that in as well: `inspector = Inspector(etdump_path="etdump.etdp", etrecord="etrecord.bin")` -Using an Inspector -``` -with open("inspector_out.txt", "w") as file: +with open("inspector_out.txt", "w") as file: inspector.print_data_tabular(file) ``` -This saves the performance data in a tabular format in “inspector_out.txt”, with each row being a profiling event. Top rows: - -| | event_block_name | event_name | p10 (ms) | p50 (ms) | p90 (ms) | avg (ms) | min (ms) | max (ms) | op_types | is_delegated_op | delegate_backend_name | -|---|----------------------|------------------|-----------|---------------|--------------|-------------|-------------|--------------|-------------|---------------------------|----------| -| 0 | Default | Method::init | 60.502 | 60.502 | 60.502 | 60.502 | 60.502 | 60.502 | [] | False | | -| 1 | Default | Program::load_method | 60.5114 | 60.5114 | 60.5114 | 60.5114 | 60.5114 | 60.5114 | [] | False | | -| 2 | Execute | native_call_arange.start_out | 0.029583 | 0.029583 | 0.029583 | 0.029583 | 0.029583 | 0.029583 | [] | False | | -| 3 | Execute | native_call_embedding.out | 0.022916 | 0.022916 | 0.022916 | 0.022916 | 0.022916 | 0.022916 | [] | False | | -| 4 | Execute | native_call_embedding.out | 0.001084 | 0.001084 | 0.001084 | 0.001084 | 0.001084 | 0.001084 | [] | False | | +This prints the performance data in a tabular format in “inspector_out.txt”, with each row being a profiling event. Top rows look like this: +![](../_static/img/llm_manual_print_data_tabular.png) +View in full size -For more information about Inspector APIs and the rich functionality it provides, see [https://pytorch.org/executorch/main/sdk-inspector.html](https://pytorch.org/executorch/main/sdk-inspector.html). +To learn more about the Inspector and the rich functionality it provides, see the [Inspector API Reference](../sdk-inspector.md). -## How to use custom kernels -With our new custom op APIs, custom op/kernel authors can easily bring in their op/kernel into PyTorch/ExecuTorch and the process is streamlined. +## Custom Kernels +With the ExecuTorch custom operator APIs, custom operator and kernel authors can easily bring in their kernel into PyTorch/ExecuTorch. There are three steps to use custom kernels in ExecuTorch: -1. Prepare the kernel implementation using ExecuTorch types. -2. Compile and link the custom kernel to both AOT Python environment as well as the runner binary. +1. Write the custom kernel using ExecuTorch types. +2. Compile and link the custom kernel to both AOT Python environment as well as the runtime binary. 3. Source-to-source transformation to swap an operator with a custom op. -### Prepare custom kernel implementation +### Writing a Custom Kernel -Define your custom operator schema for both functional variant (used in AOT compilation) and out variant (used in ExecuTorch runtime). The schema needs to follow PyTorch ATen convention (see [native_functions.yaml](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/native_functions.yaml)). 
For example: +Define your custom operator schema for both functional variant (used in AOT compilation) and out variant (used in ExecuTorch runtime). The schema needs to follow PyTorch ATen convention (see [native_functions.yaml](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/native_functions.yaml)). ``` custom_linear(Tensor weight, Tensor input, Tensor(?) bias) -> Tensor @@ -544,89 +868,87 @@ custom_linear(Tensor weight, Tensor input, Tensor(?) bias) -> Tensor custom_linear.out(Tensor weight, Tensor input, Tensor(?) bias, *, Tensor(a!) out) -> Tensor(a!) ``` -Then write your custom kernel according to the schema using ExecuTorch types, along with APIs to register to ExecuTorch runtime: -``` -// custom_linear.h/custom_linear.cpp +Write your custom kernel according to the schema defined above. Use the `EXECUTORCH_LIBRARY` macro to make the kernel available to the ExecuTorch runtime. + +```cpp +// custom_linear.h / custom_linear.cpp #include Tensor& custom_linear_out(const Tensor& weight, const Tensor& input, optional bias, Tensor& out) { - -// calculation -return out; + // calculation + return out; } -// opset namespace myop +// Register as myop::custom_linear.out EXECUTORCH_LIBRARY(myop, "custom_linear.out", custom_linear_out); ``` -Now we need to write some wrapper for this op to show up in PyTorch, but don’t worry we don’t need to rewrite the kernel. Create a separate .cpp for this purpose: +To make this operator available in PyTorch, you can define a wrapper around the ExecuTorch custom kernel. Note that the ExecuTorch +implementation uses ExecuTorch tensor types, while the PyTorch wrapper uses ATen tensors. -``` +```cpp // custom_linear_pytorch.cpp + #include "custom_linear.h" #include at::Tensor custom_linear(const at::Tensor& weight, const at::Tensor& input, std::optional bias) { -// initialize out -at::Tensor out = at::empty({weight.size(1), input.size(1)}); + // initialize out + at::Tensor out = at::empty({weight.size(1), input.size(1)}); -// wrap kernel in custom_linear.cpp into ATen kernel -WRAP_TO_ATEN(custom_linear_out, 3)(weight, input, bias, out); + // wrap kernel in custom_linear.cpp into ATen kernel + WRAP_TO_ATEN(custom_linear_out, 3)(weight, input, bias, out); -return out; + return out; } -// standard API to register ops into PyTorch +// Register the operator with PyTorch. TORCH_LIBRARY(myop, m) { - -m.def("custom_linear(Tensor weight, Tensor input, Tensor(?) bias) -> Tensor", custom_linear); - -m.def("custom_linear.out(Tensor weight, Tensor input, Tensor(?) bias, *, Tensor(a!) out) -> Tensor(a!)", WRAP_TO_ATEN(custom_linear_out, 3)); + m.def("custom_linear(Tensor weight, Tensor input, Tensor(?) bias) -> Tensor", custom_linear); + m.def("custom_linear.out(Tensor weight, Tensor input, Tensor(?) bias, *, Tensor(a!) out) -> Tensor(a!)", WRAP_TO_ATEN(custom_linear_out, 3)); } ``` -### Compile and link the custom kernel - -Link it into ExecuTorch runtime: In our runner CMakeLists.txt we just need to add custom_linear.h/cpp into the binary target. We can build a dynamically loaded library (.so or .dylib) and link it as well. - +### Compile and Link the Custom Kernel +To make it available to the ExecuTorch runtime, compile custom_linear.h/cpp into the binary target. You can also build the kernel as a dynamically loaded library (.so or .dylib) and link it as well. 
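The swap example later in this section constructs a `CustomLinear` module, which is not defined in this document. A minimal sketch of such an eager-mode wrapper, assuming the `myop::custom_linear` schema above (the class name, parameter shapes, and bias handling are illustrative assumptions), might look like this:

```python
import torch
import torch.nn as nn


class CustomLinear(nn.Module):
    """Eager-mode wrapper that dispatches to the registered custom op.

    Requires the library exposing myop::custom_linear to be loaded into
    Python first (see the next section).
    """

    def __init__(self, in_features, out_features, bias=None):
        super().__init__()
        # Weight layout is a placeholder; match it to your real kernel.
        self.weight = nn.Parameter(torch.randn(out_features, in_features))
        self.bias = bias

    def forward(self, x):
        return torch.ops.myop.custom_linear(self.weight, x, self.bias)
```

Exporting a model that calls the op may also require registering a meta (fake) kernel so that tracing can compute output shapes; see the kernel registration docs linked below.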
-Link it into PyTorch runtime: We need to package custom_linear.h, custom_linear.cpp and custom_linear_pytorch.cpp into a dynamically loaded library (.so or .dylib) and load it into our python environment. One way of doing this is: +To make it available to PyTorch, package custom_linear.h, custom_linear.cpp and custom_linear_pytorch.cpp into a dynamically loaded library (.so or .dylib) and load it into the python environment. +This is needed to make PyTorch aware of the custom operator at the time of export. -``` +```python import torch -torch.ops.load_library("libcustom_linear.so/dylib") +torch.ops.load_library("libcustom_linear.so") ``` +Once loaded, you can use the custom operator in PyTorch code. -Once loaded we can perform the next step, of introducing the custom op into PyTorch environment. - -### Source-to-source transformation to introduce the custom op +For more information, see [PyTorch Custom Operators](https://pytorch.org/tutorials/advanced/torch_script_custom_ops.html) and +and [ExecuTorch Kernel Registration](../kernel-library-custom-aten-kernel.md). -Easier way to introduce our customized linear is by rewriting the eager model. However, that may miss some occurrences of torch.nn.Linear in our example. A safer option is to walk through all the modules in the module hierarchy and perform the swapping. +### Using a Custom Operator in a Model -For example, we can do the following to swap torch.nn.Linear with our custom linear op: +The custom operator can explicitly used in the PyTorch model, or you can write a transformation to replace instances of a core operator with the custom variant. For this example, you could find +all instances of `torch.nn.Linear` and replace them with `CustomLinear`. -``` +```python def replace_linear_with_custom_linear(module): - for name, child in module.named_children(): - if isinstance(child, nn.Linear): + for name, child in module.named_children(): + if isinstance(child, nn.Linear): setattr( module, name, CustomLinear(child.in_features, child.out_features, child.bias), ) - else: - replace_linear_with_custom_linear(child) + else: + replace_linear_with_custom_linear(child) ``` -The rest of the steps will be the same as the normal flow. Now you can run this module in eager as well as export it to ExecuTorch and run on the runner. - -## How to build Mobile Apps -You can also execute an LLM using ExecuTorch on iOS and Android - -**For iOS details see the [iOS Sample App](https://github.com/pytorch/executorch/tree/main/examples/demo-apps/apple_ios).** +The remaining steps are the same as the normal flow. Now you can run this module in eager mode as well as export to ExecuTorch. +## How to Build Mobile Apps +See the instructions for building and running LLMs using ExecuTorch on iOS and Android. 
-**For Android see the [Android Instructions](https://pytorch.org/executorch/main/llm/llama-demo-android.html).** +* **[iOS ExecuTorch LLaMA Demo App](llama-demo-ios.md)** +* **[Android ExecuTorch LLaMA Demo App](llama-demo-android.md)** diff --git a/docs/source/llm/llama-demo-ios.md b/docs/source/llm/llama-demo-ios.md new file mode 100644 index 00000000000..cc25a24f335 --- /dev/null +++ b/docs/source/llm/llama-demo-ios.md @@ -0,0 +1,2 @@ +```{include} ../../../examples/demo-apps/apple_ios/LLaMA/README.md +``` \ No newline at end of file diff --git a/docs/source/native-delegates-executorch-vulkan-delegate.md b/docs/source/native-delegates-executorch-vulkan-delegate.md new file mode 100644 index 00000000000..2c83c7f899c --- /dev/null +++ b/docs/source/native-delegates-executorch-vulkan-delegate.md @@ -0,0 +1 @@ +```{include} ../../backends/vulkan/README.md diff --git a/docs/source/native-delegates-executorch-xnnpack-delegate.md b/docs/source/native-delegates-executorch-xnnpack-delegate.md index 12b2e9c2ba7..1d12daef9d8 100644 --- a/docs/source/native-delegates-executorch-xnnpack-delegate.md +++ b/docs/source/native-delegates-executorch-xnnpack-delegate.md @@ -74,16 +74,8 @@ Since weight packing creates an extra copy of the weights inside XNNPACK, We fre When executing the XNNPACK subgraphs, we prepare the tensor inputs and outputs and feed them to the XNNPACK runtime graph. After executing the runtime graph, the output pointers are filled with the computed tensors. #### **Profiling** -We have enabled basic profiling for XNNPACK delegate that can be enabled with the following compiler flag `-DENABLE_XNNPACK_PROFILING`. After running the model it will produce basic per-op and total timings. We provide an example of the profiling below. The timings listed are the average across runs, and the units are in microseconds. +We have enabled basic profiling for XNNPACK delegate that can be enabled with the following compiler flag `-DENABLE_XNNPACK_PROFILING`. With ExecuTorch's SDK integration, you can also now use the SDK tools to profile the model. You can follow the steps in [Using the ExecuTorch SDK to Profile a Model](./tutorials/sdk-integration-tutorial) on how to profile ExecuTorch models and use SDK's Inspector API to view XNNPACK's internal profiling information. -``` -Fully Connected (NC, F32) GEMM: 109.510002 -Total Time: 109.510002 -``` - -::::{note} -Profiling is a work in progress, and is planned to be integrated with [SDK Tools](sdk-delegate-integration.md) and Tensorboard. -:::: [comment]: <> (TODO: Refactor quantizer to a more official quantization doc) ## Quantization diff --git a/docs/source/sdk-bundled-io.md b/docs/source/sdk-bundled-io.md index 2ed256d2aeb..33deae3904b 100644 --- a/docs/source/sdk-bundled-io.md +++ b/docs/source/sdk-bundled-io.md @@ -23,6 +23,8 @@ ExecuTorch Program can be emitted from user's model by using ExecuTorch APIs. Fo In `BundledProgram`, we create two new classes, `MethodTestCase` and `MethodTestSuite`, to hold essential info for ExecuTorch program verification. +`MethodTestCase` represents a single testcase. Each `MethodTestCase` contains inputs and expected outputs for a single execution. 
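For example, a minimal sketch of a single test case, using a stand-in eager model (the model and shapes here are illustrative and not from this document):

```python
import torch

from executorch.sdk.bundled_program.config import MethodTestCase

model = torch.nn.Linear(4, 2)       # stand-in eager model
test_inputs = [torch.randn(1, 4)]   # one input set for a single execution

test_case = MethodTestCase(
    inputs=test_inputs,                       # inputs for this run
    expected_outputs=(model(*test_inputs),),  # reference output from eager mode
)
```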
+ :::{dropdown} `MethodTestCase` ```{eval-rst} @@ -31,6 +33,8 @@ In `BundledProgram`, we create two new classes, `MethodTestCase` and `MethodTest ``` ::: +`MethodTestSuite` contains all testing info for single method, including a str representing method name, and a `List[MethodTestCase]` for all testcases: + :::{dropdown} `MethodTestSuite` ```{eval-rst} @@ -44,18 +48,18 @@ Since each model may have multiple inference methods, we need to generate `List[ ### Step 3: Generate `BundledProgram` -We provide `create_bundled_program` API under `executorch/sdk/bundled_program/core.py` to generate `BundledProgram` by bundling the emitted ExecuTorch program with the `List[MethodTestSuite]`: +We provide `BundledProgram` class under `executorch/sdk/bundled_program/core.py` to bundled the `ExecutorchProgram`-like variable, including + `ExecutorchProgram`, `MultiMethodExecutorchProgram` or `ExecutorchProgramManager`, with the `List[MethodTestSuite]`: :::{dropdown} `BundledProgram` ```{eval-rst} -.. currentmodule:: executorch.sdk.bundled_program.core -.. autofunction:: create_bundled_program +.. autofunction:: executorch.sdk.bundled_program.core.BundledProgram.__init__ :noindex: ``` ::: -`create_bundled_program` will do sannity check internally to see if the given `List[MethodTestSuite]` matches the given Program's requirements. Specifically: +Construtor of `BundledProgram `will do sannity check internally to see if the given `List[MethodTestSuite]` matches the given Program's requirements. Specifically: 1. The method_names of each `MethodTestSuite` in `List[MethodTestSuite]` for should be also in program. Please notice that it is no need to set testcases for every method in the Program. 2. The metadata of each testcase should meet the requirement of the coresponding inference methods input. @@ -83,20 +87,20 @@ To serialize `BundledProgram` to make runtime APIs use it, we provide two APIs, Here is a flow highlighting how to generate a `BundledProgram` given a PyTorch model and the representative inputs we want to test it along with. ```python - import torch +from executorch.exir import to_edge +from executorch.sdk import BundledProgram + from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite -from executorch.sdk.bundled_program.core import create_bundled_program from executorch.sdk.bundled_program.serialize import ( serialize_from_bundled_program_to_flatbuffer, ) - -from executorch.exir import to_edge +from torch._export import capture_pre_autograd_graph from torch.export import export -# Step 1: ExecuTorch Program Export +# Step 1: ExecuTorch Program Export class SampleModel(torch.nn.Module): """An example model with multi-methods. Each method has multiple input and single output""" @@ -105,7 +109,7 @@ class SampleModel(torch.nn.Module): self.a: torch.Tensor = 3 * torch.ones(2, 2, dtype=torch.int32) self.b: torch.Tensor = 2 * torch.ones(2, 2, dtype=torch.int32) - def encode(self, x: torch.Tensor, q: torch.Tensor) -> torch.Tensor: + def forward(self, x: torch.Tensor, q: torch.Tensor) -> torch.Tensor: z = x.clone() torch.mul(self.a, x, out=z) y = x.clone() @@ -113,74 +117,62 @@ class SampleModel(torch.nn.Module): torch.add(y, q, out=y) return y - def decode(self, x: torch.Tensor, q: torch.Tensor) -> torch.Tensor: - y = x * q - torch.add(y, self.b, out=y) - return y -# Inference method names of SampleModel we want to bundle testcases to. +# Inference method name of SampleModel we want to bundle testcases to. 
# Notices that we do not need to bundle testcases for every inference methods. -method_names = ["encode", "decode"] +method_name = "forward" model = SampleModel() -capture_inputs = { - m_name: ( - (torch.rand(2, 2) - 0.5).to(dtype=torch.int32), - (torch.rand(2, 2) - 0.5).to(dtype=torch.int32), - ) - for m_name in method_names -} +# Inputs for graph capture. +capture_input = ( + (torch.rand(2, 2) - 0.5).to(dtype=torch.int32), + (torch.rand(2, 2) - 0.5).to(dtype=torch.int32), +) -# Find each method of model needs to be traced my its name, export its FX Graph. -method_graphs = { - m_name: export(getattr(model, m_name), capture_inputs[m_name]) - for m_name in method_names -} +# Export method's FX Graph. +method_graph = export( + capture_pre_autograd_graph(model, capture_input), + capture_input, +) -# Emit the traced methods into ET Program. -program = to_edge(method_graphs).to_executorch().executorch_program + +# Emit the traced method into ET Program. +et_program = to_edge(method_graph).to_executorch() # Step 2: Construct MethodTestSuite for Each Method # Prepare the Test Inputs. -# number of input sets to be verified +# Number of input sets to be verified n_input = 10 -# Input sets to be verified for each inference methods. -# To simplify, here we create same inputs for all methods. -inputs = { - # Inference method name corresponding to its test cases. - m_name: [ - # Each list below is a individual input set. - # The number of inputs, dtype and size of each input follow Program's spec. - [ - (torch.rand(2, 2) - 0.5).to(dtype=torch.int32), - (torch.rand(2, 2) - 0.5).to(dtype=torch.int32), - ] - for _ in range(n_input) +# Input sets to be verified. +inputs = [ + # Each list below is a individual input set. + # The number of inputs, dtype and size of each input follow Program's spec. + [ + (torch.rand(2, 2) - 0.5).to(dtype=torch.int32), + (torch.rand(2, 2) - 0.5).to(dtype=torch.int32), ] - for m_name in method_names -} + for _ in range(n_input) +] # Generate Test Suites method_test_suites = [ MethodTestSuite( - method_name=m_name, + method_name=method_name, test_cases=[ MethodTestCase( inputs=input, - expected_outputs=getattr(model, m_name)(*input), + expected_outputs=(getattr(model, method_name)(*input), ), ) - for input in inputs[m_name] + for input in inputs ], - ) - for m_name in method_names + ), ] # Step 3: Generate BundledProgram - -bundled_program = create_bundled_program(program, method_test_suites) +bundled_program = BundledProgram(et_program, method_test_suites) # Step 4: Serialize BundledProgram to flatbuffer. serialized_bundled_program = serialize_from_bundled_program_to_flatbuffer( @@ -320,10 +312,10 @@ Here's the example of the dtype of test input not meet model's requirement: ```python import torch -from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite -from executorch.sdk.bundled_program.core import create_bundled_program - from executorch.exir import to_edge +from executorch.sdk import BundledProgram + +from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite from torch.export import export @@ -344,15 +336,16 @@ class Module(torch.nn.Module): model = Module() method_names = ["forward"] -inputs = torch.ones(2, 2, dtype=torch.float) +inputs = (torch.ones(2, 2, dtype=torch.float), ) # Find each method of model needs to be traced my its name, export its FX Graph. 
-method_graphs = { - m_name: export(getattr(model, m_name), (inputs,)) for m_name in method_names -} +method_graph = export( + capture_pre_autograd_graph(model, inputs), + inputs, +) # Emit the traced methods into ET Program. -program = to_edge(method_graphs).to_executorch().executorch_program +et_program = to_edge(method_graph).to_executorch() # number of input sets to be verified n_input = 10 @@ -378,7 +371,7 @@ method_test_suites = [ test_cases=[ MethodTestCase( inputs=input, - expected_outputs=getattr(model, m_name)(*input), + expected_outputs=(getattr(model, m_name)(*input),), ) for input in inputs[m_name] ], @@ -388,7 +381,7 @@ method_test_suites = [ # Generate BundledProgram -bundled_program = create_bundled_program(program, method_test_suites) +bundled_program = BundledProgram(et_program, method_test_suites) ``` :::{dropdown} Raised Error @@ -455,10 +448,10 @@ Another common error would be the method name in any `MethodTestSuite` does not ```python import torch -from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite -from executorch.sdk.bundled_program.core import create_bundled_program - from executorch.exir import to_edge +from executorch.sdk import BundledProgram + +from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite from torch.export import export @@ -477,18 +470,18 @@ class Module(torch.nn.Module): model = Module() - method_names = ["forward"] -inputs = torch.ones(2, 2, dtype=torch.float) +inputs = (torch.ones(2, 2, dtype=torch.float),) # Find each method of model needs to be traced my its name, export its FX Graph. -method_graphs = { - m_name: export(getattr(model, m_name), (inputs,)) for m_name in method_names -} +method_graph = export( + capture_pre_autograd_graph(model, inputs), + inputs, +) # Emit the traced methods into ET Program. -program = to_edge(method_graphs).to_executorch().executorch_program +et_program = to_edge(method_graph).to_executorch() # number of input sets to be verified n_input = 10 @@ -513,7 +506,7 @@ method_test_suites = [ test_cases=[ MethodTestCase( inputs=input, - expected_outputs=getattr(model, m_name)(*input), + expected_outputs=(getattr(model, m_name)(*input),), ) for input in inputs[m_name] ], @@ -525,7 +518,7 @@ method_test_suites = [ method_test_suites[0].method_name = "MISSING_METHOD_NAME" # Generate BundledProgram -bundled_program = create_bundled_program(program, method_test_suites) +bundled_program = BundledProgram(et_program, method_test_suites) ``` diff --git a/docs/source/sdk-debugging.md b/docs/source/sdk-debugging.md index 1563038eb52..45e50b44e87 100644 --- a/docs/source/sdk-debugging.md +++ b/docs/source/sdk-debugging.md @@ -20,7 +20,7 @@ For a real example reflecting the steps below, please refer to [sdk_example_runn Span buffer((uint8_t*)debug_buffer, debug_buffer_size); etdump_gen.set_debug_buffer(buffer); etdump_gen.set_event_tracer_debug_level( - EventTracerDebugLogLevel::kIntermediateOutputs); + EventTracerDebugLogLevel::kProgramOutputs); ``` - Intermediate outputs of executed (non-delegated) operations (will include the program level outputs too) @@ -28,7 +28,7 @@ For a real example reflecting the steps below, please refer to [sdk_example_runn Span buffer((uint8_t*)debug_buffer, debug_buffer_size); etdump_gen.set_debug_buffer(buffer); etdump_gen.set_event_tracer_debug_level( - EventTracerDebugLogLevel::kProgramOutputs); + EventTracerDebugLogLevel::kIntermediateOutputs); ``` 3. Build the runtime with the pre-processor flag that enables tracking of debug events. 
Instructions are in the [ETDump documentation](./sdk-etdump.md). 4. Run your model and dump out the ETDump buffer as described [here](./sdk-etdump.md). (Do so similarly for the debug buffer if configured above) diff --git a/docs/source/sdk-delegate-integration.md b/docs/source/sdk-delegate-integration.md index 7f8c61af8c5..80033711552 100644 --- a/docs/source/sdk-delegate-integration.md +++ b/docs/source/sdk-delegate-integration.md @@ -20,7 +20,7 @@ Delegate authors propagate what transformations occur in a lowered backend by re For example: - **{ 0: (10, 11), 1: (11, 12) }:** Identifiers 0 and 1 in the runtime correspond to operators with the debug handles (10, 11) and (11, 12) respectively. -- **{ “Fancy Fusion”: (11, 12, 15) }**: Identifier “Fancy Fusion” in the runtime corresponds to operators with debug handles (11, 12, 15). +- **{ “fused_op_1_2_3”: (11, 12, 15) }**: Identifier “fused_op_1_2_3” in the runtime corresponds to operators with debug handles (11, 12, 15), and 11, 12, 15 corresponds to the op 1, op 2 and op 3. ```{Note} Identifiers are a means of connecting runtime results to the model graph; the interpretation of the identifiers is defined by the delegate author. diff --git a/docs/source/sdk-etdump.md b/docs/source/sdk-etdump.md index 8937ea5a777..4eacb18b14c 100644 --- a/docs/source/sdk-etdump.md +++ b/docs/source/sdk-etdump.md @@ -34,31 +34,11 @@ if (result.buf != nullptr && result.size > 0) { } ``` -4. ***Compile*** your binary with the `ET_EVENT_TRACER_ENABLED` pre-processor flag to enable events to be traced and logged into ETDump inside the ExecuTorch runtime. - - i). ***Buck*** - - In Buck, users simply depend on the etdump target which is: - ``` - //executorch/sdk/etdump:etdump_flatcc - ``` - When compiling their binary through Buck, users can pass in this buck config to enable the pre-processor flag. For example, when compiling `sdk_example_runner` to enable ETDump generation, users compile using the following command: - ``` - buck2 build -c executorch.event_tracer_enabled=true examples/sdk/sdk_example_runner:sdk_example_runner - ``` - - ii). ***CMake*** - - In CMake, users add this to their compile flags: - ``` - -DET_EVENT_TRACER_ENABLED - ``` - - This flag needs to be added to the ExecuTorch library and any operator library that the users are compiling into their binary. For reference, users can take a look at `examples/sdk/CMakeLists.txt`. The lines of of interest are: - ``` - target_compile_options(executorch PUBLIC -DET_EVENT_TRACER_ENABLED) - target_compile_options(portable_ops_lib PUBLIC -DET_EVENT_TRACER_ENABLED) - ``` +4. ***Compile*** your binary using CMake with the `ET_EVENT_TRACER_ENABLED` pre-processor flag to enable events to be traced and logged into ETDump inside the ExecuTorch runtime. This flag needs to be added to the ExecuTorch library and any operator library that you are compiling into your binary. For reference, you can take a look at `examples/sdk/CMakeLists.txt`. The lines of interest are: +``` +target_compile_options(executorch INTERFACE -DET_EVENT_TRACER_ENABLED) +target_compile_options(portable_ops_lib INTERFACE -DET_EVENT_TRACER_ENABLED) +``` ## Using an ETDump -1. Pass this ETDump into the [Inspector API](./sdk-inspector.rst) to access this data and do post-run analysis. +Pass this ETDump into the [Inspector API](./sdk-inspector.rst) to access this data and do post-run analysis. 
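As a concrete sketch of that step, reusing the file names from the earlier examples:

```python
from executorch.sdk import Inspector

# Load the runtime trace; the ETRecord is optional but links events back to
# the original model's operators, module hierarchy, and source code.
inspector = Inspector(etdump_path="etdump.etdp", etrecord="etrecord.bin")

# Print one row per profiling event.
inspector.print_data_tabular()
```

If no ETRecord was generated, omit the `etrecord` argument.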
diff --git a/docs/source/sdk-etrecord.rst b/docs/source/sdk-etrecord.rst index e9eeb52b4f5..43ed5095c64 100644 --- a/docs/source/sdk-etrecord.rst +++ b/docs/source/sdk-etrecord.rst @@ -29,7 +29,7 @@ the ExecuTorch program (returned by the call to ``to_executorch()``), and option they are interested in working with via our tooling. .. warning:: - Users should do a deepcopy of the output of to_edge() and pass in the deepcopy to the generate_etrecord API. This is needed because the subsequent call, to_executorch(), does an in-place mutation and will lose debug data in the process. + Users should do a deepcopy of the output of ``to_edge()`` and pass in the deepcopy to the ``generate_etrecord`` API. This is needed because the subsequent call, ``to_executorch()``, does an in-place mutation and will lose debug data in the process. .. currentmodule:: executorch.sdk.etrecord._etrecord .. autofunction:: generate_etrecord diff --git a/docs/source/sdk-inspector.rst b/docs/source/sdk-inspector.rst index 23c529cb9d2..e15c1f2a395 100644 --- a/docs/source/sdk-inspector.rst +++ b/docs/source/sdk-inspector.rst @@ -56,6 +56,7 @@ print_data_tabular inspector.print_data_tabular() .. image:: _static/img/print_data_tabular.png +Note that the unit of delegate profiling events is "cycles". We're working on providing a way to set different units in the future. find_total_for_module diff --git a/docs/source/sdk-overview.md b/docs/source/sdk-overview.md index 85270a44bcf..53f7d88613a 100644 --- a/docs/source/sdk-overview.md +++ b/docs/source/sdk-overview.md @@ -14,7 +14,7 @@ The ExecuTorch SDK supports the following features: - Model loading and execution time - **Delegate Integration** - Surfacing performance details from delegate backends - Link back delegate operator execution to the nodes they represent in the edge dialect graph (and subsequently linking back to source code and module hierarchy) -- **Debugging** (Intermediate outputs and output quality analysis) - Coming soon +- **Debugging** - Intermediate outputs and output quality analysis - **Visualization** - Coming soon ## Fundamental components of the SDK diff --git a/docs/source/tutorials_source/sdk-integration-tutorial.py b/docs/source/tutorials_source/sdk-integration-tutorial.py index 8cf186a8cd9..27474c2251e 100644 --- a/docs/source/tutorials_source/sdk-integration-tutorial.py +++ b/docs/source/tutorials_source/sdk-integration-tutorial.py @@ -20,7 +20,7 @@ # This tutorial will show a full end-to-end flow of how to utilize the SDK. # Specifically, it will: # -# 1. Generate the artifacts consumed by the SDK (`ETRecord <../sdk-etrecord>`__, `ETDump <../sdk-etdump.html>`__). +# 1. Generate the artifacts consumed by the SDK (`ETRecord <../sdk-etrecord.html>`__, `ETDump <../sdk-etdump.html>`__). # 2. Create an Inspector class consuming these artifacts. # 3. Utilize the Inspector class to analyze the model. @@ -42,7 +42,7 @@ # # ``executorch.sdk.generate_etrecord`` takes in an output file path (str), the # edge dialect model (``EdgeProgramManager``), the ExecuTorch dialect model -# (``ExecutorchProgramManager``), and an optional dictionary of additional models +# (``ExecutorchProgramManager``), and an optional dictionary of additional models. # # In this tutorial, an example model (shown below) is used to demonstrate. @@ -113,9 +113,9 @@ def forward(self, x): ###################################################################### # # .. warning:: -# Users should do a deepcopy of the output of to_edge() and pass in the -# deepcopy to the generate_etrecord API. 
This is needed because the -# subsequent call, to_executorch(), does an in-place mutation and will +# Users should do a deepcopy of the output of ``to_edge()`` and pass in the +# deepcopy to the ``generate_etrecord`` API. This is needed because the +# subsequent call, ``to_executorch()``, does an in-place mutation and will # lose debug data in the process. # @@ -169,21 +169,10 @@ def forward(self, x): f.write(serialized_bundled_program) ###################################################################### -# We provide 2 ways of executing the Bundled Model to generate the ``ETDump``: -# -# **Option 1:** -# -# Use Buck (follow `these instructions <../getting-started-setup.html#building-a-runtime>`__ to set up buck):: -# -# cd executorch -# buck2 run -c executorch.event_tracer_enabled=true examples/sdk/sdk_example_runner:sdk_example_runner -- --bundled_program_path -# -# **Option 2:** -# -# Use CMake (follow `these instructions <../runtime-build-and-cross-compilation.html#configure-the-cmake-build>`__ to set up cmake):: +# Use CMake (follow `these instructions <../runtime-build-and-cross-compilation.html#configure-the-cmake-build>`__ to set up cmake) to execute the Bundled Program to generate the ``ETDump``:: # # cd executorch -# rm -rf cmake-out && mkdir cmake-out && cd cmake-out && cmake -DBUCK2=buck2 -DEXECUTORCH_BUILD_SDK=1 -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=1 .. +# rm -rf cmake-out && mkdir cmake-out && cd cmake-out && cmake -DEXECUTORCH_BUILD_SDK=1 -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=1 .. # cd .. # cmake --build cmake-out -j8 -t sdk_example_runner # ./cmake-out/examples/sdk/sdk_example_runner --bundled_program_path @@ -308,6 +297,6 @@ def forward(self, x): # ^^^^^^^^^^^^^^^ # # - `ExecuTorch SDK <../sdk-overview.html>`__ -# - `ETRecord <../sdk-etrecord>`__ +# - `ETRecord <../sdk-etrecord.html>`__ # - `ETDump <../sdk-etdump.html>`__ # - `Inspector <../sdk-inspector.html>`__ diff --git a/examples/README.md b/examples/README.md index bce3e08b58f..6865a5c35ac 100644 --- a/examples/README.md +++ b/examples/README.md @@ -30,6 +30,9 @@ examples A user's journey may commence by exploring the demos located in the [`portable/`](./portable) directory. Here, you will gain insights into the fundamental end-to-end workflow to generate a binary file from a ML model in [portable mode](../docs/source/concepts.md##portable-mode-lean-mode) and run it on the ExecuTorch runtime. +## Demo of Llama2 + +[This page](./models/llama2/README.md) demonstrates how to run a Llama 2 7B model on mobile via ExecuTorch. We use XNNPACK to accelerate the performance and 4-bit groupwise PTQ quantization to fit the model on Android and iOS mobile phones. ## Demo of Selective Build @@ -37,7 +40,7 @@ To understand how to deploy the ExecuTorch runtime with optimization for binary ## Demo of ExecuTorch SDK -You will find demos of [ExecuTorch SDK](./sdk/) in the [`sdk/`](./sdk/) directory. The examples focuses on exporting and executing BundledProgram for ExecuTorch model verification, and ETDump generation. +You will find demos of [ExecuTorch SDK](./sdk/) in the [`sdk/`](./sdk/) directory. The examples focuses on exporting and executing BundledProgram for ExecuTorch model verification and ETDump for collecting profiling and debug data. ## Demo Apps @@ -63,11 +66,6 @@ You will find demos of [ExecuTorch QNN Backend](./qualcomm) in the [`qualcomm/`] The [`xtensa/`](./xtensa) directory hosts a demo that showcases the process of exporting and executing a model on Xtensa Hifi4 DSP. 
You can utilize [this tutorial](../docs/source/build-run-xtensa.md) to guide you in configuring the demo and running it. - -## Demo of ExecuTorch SDK - -You will find demos of [ExecuTorch SDK](./sdk/) in the [`sdk/`](./sdk/) directory. The examples focuses on exporting and executing BundledProgram for ExecuTorch model verification and ETDump for collecting profiling and debug data. - ## Dependencies Various models and workflows listed in this directory have dependencies on some other packages. You need to follow the setup guide in [Setting up ExecuTorch from GitHub](https://pytorch.org/executorch/stable/getting-started-setup) to have appropriate packages installed. diff --git a/examples/apple/coreml/README.md b/examples/apple/coreml/README.md index a10f3efcc95..f4270956b2c 100644 --- a/examples/apple/coreml/README.md +++ b/examples/apple/coreml/README.md @@ -1,6 +1,6 @@ # Examples -This directory contains scripts and other helper utilities to illustrate an end-to-end workflow to run a **Core ML** delegated `torch.nn.module` with the **ExecuTorch** runtime. +This directory contains scripts and other helper utilities to illustrate an end-to-end workflow to run a Core ML delegated `torch.nn.module` with the ExecuTorch runtime. ## Directory structure @@ -13,7 +13,7 @@ coreml ## Using the examples -We will walk through an example model to generate a **Core ML** delegated binary file from a python `torch.nn.module` then we will use the `coreml/executor_runner` to run the exported binary file. +We will walk through an example model to generate a Core ML delegated binary file from a python `torch.nn.module` then we will use the `coreml_executor_runner` to run the exported binary file. 1. Following the setup guide in [Setting Up ExecuTorch](https://pytorch.org/executorch/stable/getting-started-setup) you should be able to get the basic development environment for ExecuTorch working. @@ -27,7 +27,7 @@ cd executorch ``` -3. Run the export script to generate a **Core ML** delegated binary file. +3. Run the export script to generate a Core ML delegated binary file. ```bash cd executorch @@ -35,11 +35,11 @@ cd executorch # To get a list of example models python3 -m examples.portable.scripts.export -h -# Generates ./add_coreml_all.pte file if successful. +# Generates add_coreml_all.pte file if successful. python3 -m examples.apple.coreml.scripts.export --model_name add ``` -4. Once we have the **Core ML** delegated model binary (pte) file, then let's run it with the **ExecuTorch** runtime using the `coreml_executor_runner`. +4. Run the binary file using the `coreml_executor_runner`. ```bash cd executorch @@ -47,20 +47,30 @@ cd executorch # Builds the Core ML executor runner. Generates ./coreml_executor_runner if successful. ./examples/apple/coreml/scripts/build_executor_runner.sh -# Run the Core ML delegate model. +# Run the delegated model. ./coreml_executor_runner --model_path add_coreml_all.pte ``` ## Frequently encountered errors and resolution. -- The `examples.apple.coreml.scripts.export` could fail if the model is not supported by the **Core ML** backend. The following models from the examples models list (` python3 -m examples.portable.scripts.export -h`)are currently supported by the **Core ML** backend. +- The `examples.apple.coreml.scripts.export` could fail if the model is not supported by the Core ML backend. The following models from the examples models list (` python3 -m examples.portable.scripts.export -h`) are currently supported by the Core ML backend. 
-``` +```text add add_mul +dl3 +edsr +emformer_join +emformer_predict +emformer_transcribe +ic3 ic4 linear +llama2 +llava_encoder +mobilebert mul mv2 +mv2_untrained mv3 resnet18 resnet50 diff --git a/examples/apple/coreml/executor_runner/coreml_executor_runner.xcodeproj/project.pbxproj b/examples/apple/coreml/executor_runner/coreml_executor_runner.xcodeproj/project.pbxproj index 66c0b182cd5..16e9e590027 100644 --- a/examples/apple/coreml/executor_runner/coreml_executor_runner.xcodeproj/project.pbxproj +++ b/examples/apple/coreml/executor_runner/coreml_executor_runner.xcodeproj/project.pbxproj @@ -15,7 +15,10 @@ C94D51642ACFCBC500AF47FD /* CoreML.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = C94D51632ACFCBC500AF47FD /* CoreML.framework */; }; C94D51662ACFCBCB00AF47FD /* Accelerate.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = C94D51652ACFCBCB00AF47FD /* Accelerate.framework */; }; C94D51682ACFCC7100AF47FD /* libcoremldelegate.a in Frameworks */ = {isa = PBXBuildFile; fileRef = C94D51672ACFCC7100AF47FD /* libcoremldelegate.a */; }; + C97BFFA42BC0C17300F55BAC /* libportable_kernels.a in Frameworks */ = {isa = PBXBuildFile; fileRef = C97BFFA32BC0C17300F55BAC /* libportable_kernels.a */; }; + C97BFFA62BC0C1F200F55BAC /* libportable_ops_lib.a in Frameworks */ = {isa = PBXBuildFile; fileRef = C97BFFA52BC0C1F200F55BAC /* libportable_ops_lib.a */; }; C988D69D2B998CDE00979CF6 /* libprotobuf-lite.a in Frameworks */ = {isa = PBXBuildFile; fileRef = C988D69C2B998CD700979CF6 /* libprotobuf-lite.a */; }; + F24817E72BC65B2000E80D98 /* libexecutorch_no_prim_ops.a in Frameworks */ = {isa = PBXBuildFile; fileRef = F24817E62BC65B2000E80D98 /* libexecutorch_no_prim_ops.a */; }; /* End PBXBuildFile section */ /* Begin PBXCopyFilesBuildPhase section */ @@ -40,7 +43,10 @@ C94D51632ACFCBC500AF47FD /* CoreML.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = CoreML.framework; path = System/Library/Frameworks/CoreML.framework; sourceTree = SDKROOT; }; C94D51652ACFCBCB00AF47FD /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; }; C94D51672ACFCC7100AF47FD /* libcoremldelegate.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libcoremldelegate.a; path = libraries/libcoremldelegate.a; sourceTree = ""; }; + C97BFFA32BC0C17300F55BAC /* libportable_kernels.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libportable_kernels.a; path = libraries/libportable_kernels.a; sourceTree = ""; }; + C97BFFA52BC0C1F200F55BAC /* libportable_ops_lib.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libportable_ops_lib.a; path = libraries/libportable_ops_lib.a; sourceTree = ""; }; C988D69C2B998CD700979CF6 /* libprotobuf-lite.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = "libprotobuf-lite.a"; path = "libraries/libprotobuf-lite.a"; sourceTree = ""; }; + F24817E62BC65B2000E80D98 /* libexecutorch_no_prim_ops.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libexecutorch_no_prim_ops.a; path = libraries/libexecutorch_no_prim_ops.a; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -49,11 +55,14 @@ buildActionMask = 2147483647; files = ( 38626BB52B225A890059413D /* libetdump.a in Frameworks */, + F24817E72BC65B2000E80D98 /* libexecutorch_no_prim_ops.a in Frameworks */, 
38626BB42B225A560059413D /* libflatccrt.a in Frameworks */, C94D51682ACFCC7100AF47FD /* libcoremldelegate.a in Frameworks */, C94D51662ACFCBCB00AF47FD /* Accelerate.framework in Frameworks */, C988D69D2B998CDE00979CF6 /* libprotobuf-lite.a in Frameworks */, + C97BFFA62BC0C1F200F55BAC /* libportable_ops_lib.a in Frameworks */, C94D51642ACFCBC500AF47FD /* CoreML.framework in Frameworks */, + C97BFFA42BC0C17300F55BAC /* libportable_kernels.a in Frameworks */, C94D51622ACFCBBA00AF47FD /* libsqlite3.tbd in Frameworks */, C94D515E2ACFCBA000AF47FD /* libexecutorch.a in Frameworks */, ); @@ -90,6 +99,9 @@ C94D515C2ACFCBA000AF47FD /* libexecutorch.a */, C94D51612ACFCBBA00AF47FD /* libsqlite3.tbd */, C94D51672ACFCC7100AF47FD /* libcoremldelegate.a */, + F24817E62BC65B2000E80D98 /* libexecutorch_no_prim_ops.a */, + C97BFFA32BC0C17300F55BAC /* libportable_kernels.a */, + C97BFFA52BC0C1F200F55BAC /* libportable_ops_lib.a */, ); name = Frameworks; sourceTree = ""; diff --git a/examples/apple/coreml/scripts/build_executor_runner.sh b/examples/apple/coreml/scripts/build_executor_runner.sh index ad63d2a942c..347f3b4474f 100755 --- a/examples/apple/coreml/scripts/build_executor_runner.sh +++ b/examples/apple/coreml/scripts/build_executor_runner.sh @@ -37,6 +37,7 @@ cmake "$EXECUTORCH_ROOT_PATH" -B"$CMAKE_BUILD_DIR_PATH" \ -DEXECUTORCH_BUILD_XNNPACK=OFF \ -DEXECUTORCH_BUILD_SDK=ON \ -DEXECUTORCH_BUILD_COREML=ON \ +-DCOREML_BUILD_EXECUTOR_RUNNER=ON \ -Dprotobuf_BUILD_TESTS=OFF \ -Dprotobuf_BUILD_EXAMPLES=OFF \ -DCMAKE_MACOSX_BUNDLE=OFF \ @@ -60,12 +61,15 @@ cp -rf "$COREML_DIR_PATH/runtime/include/" "$INCLUDE_DIR_PATH" # Copy required libraries echo "ExecuTorch: Copying libraries" mkdir "$LIBRARIES_DIR_PATH" -find "$CMAKE_BUILD_DIR_PATH/" -name 'libexecutorch.a' -exec cp -f "{}" "$LIBRARIES_DIR_PATH" \; -find "$CMAKE_BUILD_DIR_PATH/" -name 'libetdump.a' -exec cp -f "{}" "$LIBRARIES_DIR_PATH" \; -find "$CMAKE_BUILD_DIR_PATH/" -name 'libcoremldelegate.a' -exec cp -f "{}" "$LIBRARIES_DIR_PATH" \; -find "$CMAKE_BUILD_DIR_PATH/" -name 'libprotobuf-lite.a' -exec cp -f "{}" "$LIBRARIES_DIR_PATH" \; +find "$CMAKE_BUILD_DIR_PATH/" -name 'libexecutorch.a' -exec cp -f "{}" "$LIBRARIES_DIR_PATH/libexecutorch.a" \; +find "$CMAKE_BUILD_DIR_PATH/" -name 'libexecutorch_no_prim_ops.a' -exec cp -f "{}" "$LIBRARIES_DIR_PATH/libexecutorch_no_prim_ops.a" \; +find "$CMAKE_BUILD_DIR_PATH/" -name 'libprotobuf-lite.a' -exec cp -f "{}" "$LIBRARIES_DIR_PATH/libprotobuf-lite.a" \; find "$CMAKE_BUILD_DIR_PATH/" -name 'libprotobuf-lited.a' -exec cp -f "{}" "$LIBRARIES_DIR_PATH/libprotobuf-lite.a" \; -cp -f "$EXECUTORCH_ROOT_PATH/third-party/flatcc/lib/libflatccrt.a" "$LIBRARIES_DIR_PATH" +find "$CMAKE_BUILD_DIR_PATH/" -name 'libetdump.a' -exec cp -f "{}" "$LIBRARIES_DIR_PATH/libetdump.a" \; +find "$CMAKE_BUILD_DIR_PATH/" -name 'libcoremldelegate.a' -exec cp -f "{}" "$LIBRARIES_DIR_PATH/libcoremldelegate.a" \; +find "$CMAKE_BUILD_DIR_PATH/" -name 'libportable_ops_lib.a' -exec cp -f "{}" "$LIBRARIES_DIR_PATH/libportable_ops_lib.a" \; +find "$CMAKE_BUILD_DIR_PATH/" -name 'libportable_kernels.a' -exec cp -f "{}" "$LIBRARIES_DIR_PATH/libportable_kernels.a" \; +cp -f "$EXECUTORCH_ROOT_PATH/third-party/flatcc/lib/libflatccrt.a" "$LIBRARIES_DIR_PATH/libflatccrt.a" # Build the runner echo "ExecuTorch: Building runner" diff --git a/examples/apple/coreml/scripts/extract_coreml_models.py b/examples/apple/coreml/scripts/extract_coreml_models.py index 32c750196dd..6317b0f3d3f 100644 --- a/examples/apple/coreml/scripts/extract_coreml_models.py +++ 
b/examples/apple/coreml/scripts/extract_coreml_models.py @@ -1,5 +1,3 @@ -#!/usr/bin/env bash -# # Copyright © 2024 Apple Inc. All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -55,7 +53,8 @@ def extract_coreml_models(pte_data: bytes): if executorchcoreml.unflatten_directory_contents( coreml_processed_bytes, str(model_path.absolute()) ): - print(f"CoreML model is extracted and saved to path = {model_path}") + print(f"Core ML models are extracted and saved to path = {model_path}") + model_index += 1 if len(coreml_delegates) == 0: print("The model isn't delegated to CoreML.") @@ -63,7 +62,7 @@ def extract_coreml_models(pte_data: bytes): if __name__ == "__main__": """ - Extracts the CoreML models embedded in the ``.pte`` file and saves them to the + Extracts the Core ML models embedded in the ``.pte`` file and saves them to the file system. """ parser = argparse.ArgumentParser() diff --git a/examples/apple/coreml/scripts/inspector_cli.py b/examples/apple/coreml/scripts/inspector_cli.py index 3f8990bdab6..077c8c26ef7 100644 --- a/examples/apple/coreml/scripts/inspector_cli.py +++ b/examples/apple/coreml/scripts/inspector_cli.py @@ -7,7 +7,7 @@ import argparse import json -from typing import Any, Dict, Final, List, Tuple +from typing import Any, Dict, Final, List, Tuple, Union from executorch.sdk import Inspector from executorch.sdk.inspector._inspector_utils import compare_results @@ -34,6 +34,12 @@ def parse_coreml_delegate_metadata(delegate_metadatas: List[str]) -> Dict[str, A return {} +def convert_coreml_delegate_time( + event_name: Union[str, int], input_time: Union[int, float] +) -> Union[int, float]: + return input_time / (1000 * 1000) + + def main() -> None: parser = argparse.ArgumentParser() parser.add_argument( @@ -60,6 +66,7 @@ def main() -> None: etrecord=args.etrecord_path, debug_buffer_path=args.debug_buffer_path, delegate_metadata_parser=parse_coreml_delegate_metadata, + delegate_time_scale_converter=convert_coreml_delegate_time, ) inspector.print_data_tabular(include_delegate_debug_data=True) if args.compare_results: diff --git a/examples/apple/mps/CMakeLists.txt b/examples/apple/mps/CMakeLists.txt index 89c2b141b01..976ecebc979 100644 --- a/examples/apple/mps/CMakeLists.txt +++ b/examples/apple/mps/CMakeLists.txt @@ -42,7 +42,7 @@ add_compile_options("-Wall" "-Werror") include(${EXECUTORCH_ROOT}/build/Utils.cmake) -set(_common_compile_options -Wno-deprecated-declarations -fPIC) +set(_common_compile_options -Wno-deprecated-declarations -fPIC -DET_EVENT_TRACER_ENABLED) # Let files say "include ". set(_common_include_directories ${EXECUTORCH_ROOT}/..) @@ -51,7 +51,7 @@ set(_common_include_directories ${EXECUTORCH_ROOT}/..) # portable_ops_lib, etdump, bundled_program. 
find_package(executorch CONFIG REQUIRED) target_include_directories(executorch INTERFACE ${_common_include_directories}) -target_compile_options(executorch INTERFACE -DET_EVENT_TRACER_ENABLED) +target_compile_options(executorch INTERFACE ${_common_compile_options}) find_package( gflags REQUIRED PATHS ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party @@ -73,7 +73,7 @@ generate_bindings_for_kernels( FUNCTIONS_YAML ${EXECUTORCH_ROOT}/kernels/portable/functions.yaml ) gen_operators_lib( - "portable_ops_lib" + "mps_portable_ops_lib" KERNEL_LIBS portable_kernels DEPS executorch) @@ -107,9 +107,9 @@ list(TRANSFORM _mps_executor_runner__srcs PREPEND "${EXECUTORCH_ROOT}/") add_executable(mps_executor_runner ${_mps_executor_runner__srcs}) if(CMAKE_BUILD_TYPE MATCHES "Debug") - set(FLATCC_LIB flatcc_d) + set(FLATCC_LIB flatccrt_d) else() - set(FLATCC_LIB flatcc) + set(FLATCC_LIB flatccrt) endif() target_link_libraries(mps_executor_runner bundled_program @@ -117,7 +117,7 @@ target_link_libraries(mps_executor_runner bundled_program etdump ${FLATCC_LIB} mpsdelegate - portable_ops_lib + mps_portable_ops_lib ${mps_executor_runner_libs}) target_compile_options(mps_executor_runner PUBLIC ${_common_compile_options}) endif() diff --git a/examples/apple/mps/scripts/bench_utils.py b/examples/apple/mps/scripts/bench_utils.py new file mode 100644 index 00000000000..c00738987ab --- /dev/null +++ b/examples/apple/mps/scripts/bench_utils.py @@ -0,0 +1,117 @@ +# +# Copyright (c) 2024 Apple Inc. All rights reserved. +# Provided subject to the LICENSE file in the top level directory. +# + +import logging +import time + +import torch +from torch._export.exported_program import ExportedProgram + + +def assert_outputs_equal(model_output, ref_output): + """ + Helper testing function that asserts that the model output and the reference output + are equal with some tolerance. Due to numerical differences between eager mode and + the MPS's backend, we relax the detal such that absolute tolerance is 1e-3. and + relative tolerance is 1e-3. + """ + + # Compare the result from executor and eager mode direclty + if isinstance(ref_output, tuple) or isinstance(ref_output, list): + # Multiple outputs executor always returns tuple, even if there is one output + assert len(ref_output) == len( + model_output + ), "Length of outputs is not matching!" + for i in range(len(ref_output)): + assert torch.allclose( + model_output[i], ref_output[i], atol=1e-03, rtol=1e-03 + ) + else: + # If one output, eager returns tensor while executor tuple of size 1 + assert torch.allclose( + model_output[0], ref_output, atol=1e-03, rtol=1e-03 + ), "Outputs are not matching!" 
+ + +def bench_forward(func, *args): + # warmup + for _ in range(10): + func(*args) + + start = time.time() + for _ in range(100): + func(*args) + end = time.time() + return end - start + + +def executorch_forward_pass(model, inputs): + for _ in range(10): + model.forward(inputs) + + +def synchronize(): + torch.mps.synchronize() + + +def pytorch_forward_pass(model, inputs): + for _ in range(10): + model(*inputs) + synchronize() + + +def get_mps_inputs(inputs): + inputs_mps = [] + for tensor in inputs: + inputs_mps.append(tensor.to("mps")) + inputs_mps = tuple(inputs_mps) + return inputs_mps + + +def get_executorch_model(executorch_program: ExportedProgram): + try: + from executorch.extension.pybindings.portable_lib import ( # @manual + _load_for_executorch_from_buffer, + ) + + return _load_for_executorch_from_buffer(executorch_program.buffer) + except ImportError: + logging.info( + "ExecuTorch MPS delegate was built without pybind support (not possible to run forward pass within python)" + ) + return None + + +def bench_torch(executorch_program: ExportedProgram, model, inputs, model_name): + model = model.to("mps") + inputs_mps = get_mps_inputs(inputs) + + executorch_model = get_executorch_model(executorch_program) + if executorch_model is not None: + t_pytorch = bench_forward(pytorch_forward_pass, model, inputs_mps) + t_executorch = bench_forward(executorch_forward_pass, executorch_model, inputs) + + logging.info(f"Model name: {model_name}") + logging.info(f"Pytorch MPS forward pass: {t_pytorch} seconds") + logging.info(f"ExecuTorch MPS forward pass: {t_executorch} seconds") + logging.info( + f"ExecuTorch speedup: {((t_pytorch - t_executorch) / t_pytorch) * 100}%" + ) + + +def compare_outputs(executorch_program: ExportedProgram, model, inputs, model_name): + inputs_copy = [] + for t in inputs: + inputs_copy.append(t.detach().clone()) + inputs_copy = tuple(inputs_copy) + + pytorch_results = model(*inputs) + executorch_model = get_executorch_model(executorch_program) + if executorch_model is not None: + executorch_results = executorch_model.forward(inputs_copy) + assert_outputs_equal(executorch_results, pytorch_results) + logging.info( + f"Results between ExecuTorch forward pass with MPS backend and PyTorch forward pass for {model_name} are matching!" + ) diff --git a/examples/apple/mps/scripts/build_mps_executor_runner.sh b/examples/apple/mps/scripts/build_mps_executor_runner.sh new file mode 100755 index 00000000000..16754588b67 --- /dev/null +++ b/examples/apple/mps/scripts/build_mps_executor_runner.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash +# Copyright (c) 2024 Apple Inc. All rights reserved. +# Provided subject to the LICENSE file in the top level directory. + +set -e + +MODE="Release" +OUTPUT="cmake-out" + +usage() { + echo "Usage: $0 [OPTIONS]" + echo "Build frameworks for Apple platforms." + echo "SOURCE_ROOT_DIR defaults to the current directory if not provided." + echo + echo "Options:" + echo " --output=DIR Output directory. Default: 'cmake-out'" + echo " --Debug Use Debug build mode. 
Default: 'Release'" + echo "Example:" + echo " $0 --output=cmake-out --Debug" + exit 0 +} + +for arg in "$@"; do + case $arg in + -h|--help) usage ;; + --output=*) OUTPUT="${arg#*=}" ;; + --Debug) MODE="Debug" ;; + *) + if [[ -z "$SOURCE_ROOT_DIR" ]]; then + SOURCE_ROOT_DIR="$arg" + else + echo "Invalid argument: $arg" + exit 1 + fi + ;; + esac +done + +rm -rf "$OUTPUT" + +cmake -DBUCK2="$BUCK" \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE="$MODE" \ + -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ + -DEXECUTORCH_BUILD_MPS=ON \ + -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ + -Bcmake-out . +cmake --build cmake-out -j9 --target install --config "$MODE" +CMAKE_PREFIX_PATH="${PWD}/cmake-out/lib/cmake/ExecuTorch;${PWD}/cmake-out/third-party/gflags" +# build mps_executor_runner +rm -rf cmake-out/examples/apple/mps +cmake \ + -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \ + -DCMAKE_BUILD_TYPE="$MODE" \ + -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ + -Bcmake-out/examples/apple/mps \ + examples/apple/mps + +cmake --build cmake-out/examples/apple/mps -j9 --config "$MODE" + +echo "Build succeeded!" + +./cmake-out/examples/apple/mps/mps_executor_runner --model_path mps_logical_not.pte --bundled_program diff --git a/examples/apple/mps/scripts/mps_example.py b/examples/apple/mps/scripts/mps_example.py index a86a54c4d5c..0bfef7bf4ce 100644 --- a/examples/apple/mps/scripts/mps_example.py +++ b/examples/apple/mps/scripts/mps_example.py @@ -10,6 +10,7 @@ import logging import torch +from examples.apple.mps.scripts.bench_utils import bench_torch, compare_outputs from executorch import exir from executorch.backends.apple.mps.mps_preprocess import MPSBackend from executorch.backends.apple.mps.partition.mps_partitioner import MPSPartitioner @@ -36,7 +37,28 @@ FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s" logging.basicConfig(level=logging.INFO, format=FORMAT) -if __name__ == "__main__": + +def get_bundled_program(executorch_program, example_inputs, expected_output): + method_test_suites = [ + MethodTestSuite( + method_name="forward", + test_cases=[ + MethodTestCase( + inputs=example_inputs, expected_outputs=[expected_output] + ) + ], + ) + ] + logging.info(f"Expected output: {expected_output}") + + bundled_program = BundledProgram(executorch_program, method_test_suites) + bundled_program_buffer = serialize_from_bundled_program_to_flatbuffer( + bundled_program + ) + return bundled_program_buffer + + +def parse_args(): parser = argparse.ArgumentParser() parser.add_argument( "-m", @@ -54,11 +76,18 @@ parser.add_argument( "--use_partitioner", - default=False, + default=True, action=argparse.BooleanOptionalAction, help="Use MPS partitioner to run the model instead of using whole graph lowering.", ) + parser.add_argument( + "--bench_pytorch", + default=False, + action=argparse.BooleanOptionalAction, + help="Bench ExecuTorch MPS foward pass with PyTorch MPS forward pass.", + ) + parser.add_argument( "-b", "--bundled", @@ -68,6 +97,15 @@ help="Flag for bundling inputs and outputs in the final flatbuffer program", ) + parser.add_argument( + "-c", + "--check_correctness", + action="store_true", + required=False, + default=False, + help="Whether to compare the ExecuTorch MPS results with the PyTorch forward pass", + ) + parser.add_argument( "--generate_etrecord", action="store_true", @@ -76,25 +114,64 @@ help="Generate ETRecord metadata to link with runtime results (used for profiling)", ) + parser.add_argument( + "--checkpoint", + required=False, + default=None, + 
help="checkpoing for llama model", + ) + + parser.add_argument( + "--params", + required=False, + default=None, + help="params for llama model", + ) + args = parser.parse_args() + return args + + +def get_model_config(args): + model_config = {} + model_config["module_name"] = MODEL_NAME_TO_MODEL[args.model_name][0] + model_config["model_class_name"] = MODEL_NAME_TO_MODEL[args.model_name][1] + + if args.model_name == "llama2": + if args.checkpoint: + model_config["checkpoint"] = args.checkpoint + if args.params: + model_config["params"] = args.params + model_config["use_kv_cache"] = True + return model_config + + +if __name__ == "__main__": + args = parse_args() if args.model_name not in MODEL_NAME_TO_MODEL: raise RuntimeError(f"Available models are {list(MODEL_NAME_TO_MODEL.keys())}.") - model, example_inputs, _ = EagerModelFactory.create_model( - *MODEL_NAME_TO_MODEL[args.model_name] - ) + model_config = get_model_config(args) + model, example_inputs, _ = EagerModelFactory.create_model(**model_config) model = model.eval() + if args.check_correctness or args.bench_pytorch: + model_copy = copy.deepcopy(model) + inputs_copy = [] + for t in example_inputs: + inputs_copy.append(t.detach().clone()) + inputs_copy = tuple(inputs_copy) # pre-autograd export. eventually this will become torch.export - model = torch._export.capture_pre_autograd_graph(model, example_inputs) + with torch.no_grad(): + model = torch._export.capture_pre_autograd_graph(model, example_inputs) + edge: EdgeProgramManager = export_to_edge( + model, + example_inputs, + edge_compile_config=EdgeCompileConfig(_check_ir_validity=False), + ) - edge: EdgeProgramManager = export_to_edge( - model, - example_inputs, - edge_compile_config=EdgeCompileConfig(_check_ir_validity=False), - ) edge_program_manager_copy = copy.deepcopy(edge) compile_specs = [CompileSpec("use_fp16", bytes([args.use_fp16]))] @@ -120,31 +197,30 @@ model_name = f"{args.model_name}_mps" if args.bundled: - method_test_suites = [ - MethodTestSuite( - method_name="forward", - test_cases=[ - MethodTestCase( - inputs=example_inputs, expected_outputs=[model(*example_inputs)] - ) - ], - ) - ] - logging.info(f"Expected output: {model(*example_inputs)}") - - bundled_program = BundledProgram(executorch_program, method_test_suites) - bundled_program_buffer = serialize_from_bundled_program_to_flatbuffer( - bundled_program + expected_output = model(*example_inputs) + bundled_program_buffer = get_bundled_program( + executorch_program, example_inputs, expected_output ) model_name = f"{model_name}_bundled" extension = "fp16" if not args.use_fp16: extension = "fp32" - model_name = f"{model_name}_{extension}" + model_name = f"{model_name}_{extension}.pte" if args.generate_etrecord: etrecord_path = "etrecord.bin" logging.info("generating etrecord.bin") generate_etrecord(etrecord_path, edge_program_manager_copy, executorch_program) - save_pte_program(executorch_program, model_name) + if args.bundled: + with open(model_name, "wb") as file: + file.write(bundled_program_buffer) + logging.info(f"Saved bundled program to {model_name}") + else: + save_pte_program(executorch_program, model_name) + + if args.bench_pytorch: + bench_torch(executorch_program, model_copy, example_inputs, model_name) + + if args.check_correctness: + compare_outputs(executorch_program, model_copy, inputs_copy, model_name) diff --git a/examples/arm/executor_runner/CMakeLists.txt b/examples/arm/executor_runner/CMakeLists.txt index c738a9502bf..6836f8a79ca 100644 --- a/examples/arm/executor_runner/CMakeLists.txt +++ 
b/examples/arm/executor_runner/CMakeLists.txt @@ -43,6 +43,11 @@ add_library(executorch STATIC IMPORTED) set_property(TARGET executorch PROPERTY IMPORTED_LOCATION "${ET_BUILD_DIR_PATH}/libexecutorch.a") +add_library(executorch_no_prim_ops STATIC IMPORTED) +set_property(TARGET executorch_no_prim_ops PROPERTY IMPORTED_LOCATION + "${ET_BUILD_DIR_PATH}/libexecutorch_no_prim_ops.a") +target_link_libraries(executorch INTERFACE executorch_no_prim_ops) + add_library(executorch_delegate_ethos_u STATIC IMPORTED) set_property(TARGET executorch_delegate_ethos_u PROPERTY IMPORTED_LOCATION "${ET_BUILD_DIR_PATH}/backends/arm/libexecutorch_delegate_ethos_u.a") diff --git a/examples/demo-apps/android/ExecuTorchDemo/README.md b/examples/demo-apps/android/ExecuTorchDemo/README.md index 990dcfadc53..1d993da3d41 100644 --- a/examples/demo-apps/android/ExecuTorchDemo/README.md +++ b/examples/demo-apps/android/ExecuTorchDemo/README.md @@ -17,7 +17,7 @@ This guide explains how to setup ExecuTorch for Android using a demo app. The ap * Refer to [Setting up ExecuTorch](https://pytorch.org/executorch/stable/getting-started-setup) to set up the repo and dev environment. * Download and install [Android Studio and SDK](https://developer.android.com/studio). * Supported Host OS: CentOS, macOS Ventura (M1/x86_64). See below for Qualcomm HTP specific requirements. -* *Qualcomm HTP Only[^1]:* To build and run on Qualcomm's AI Engine Direct, please follow [Building and Running ExecuTorch with Qualcomm AI Engine Direct Backend](build-run-qualcomm-ai-engine-direct-backend.md) for hardware and software pre-requisites. +* *Qualcomm HTP Only[^1]:* To build and run on Qualcomm's AI Engine Direct, please follow [Building and Running ExecuTorch with Qualcomm AI Engine Direct Backend](build-run-qualcomm-ai-engine-direct-backend.md) for hardware and software pre-requisites. The version we use for this tutorial is 2.19. The chip we use for this tutorial is SM8450. ::: :::: @@ -39,7 +39,6 @@ We generate the model file for the ExecuTorch runtime in Android Demo App. For delegating DeepLab v3 to XNNPACK backend, please do the following to export the model: ```bash -export FLATC_EXECUTABLE=$(realpath third-party/flatbuffers/cmake-out/flatc) python3 -m examples.xnnpack.aot_compiler --model_name="dl3" --delegate mkdir -p examples/demo-apps/android/ExecuTorchDemo/app/src/main/assets/ cp dl3_xnnpack_fp32.pte examples/demo-apps/android/ExecuTorchDemo/app/src/main/assets/ @@ -54,7 +53,7 @@ For delegating to Qualcomm Hexagon NPU, please follow the tutorial [here](build- After generating the model, copy the model to `assets` directory. ```bash -python -m examples.qualcomm.scripts.deeplab_v3 -b build_android -m SM8550 -s +python -m examples.qualcomm.scripts.deeplab_v3 -b build_android -m SM8450 -s cp deeplab_v3/dlv3_qnn.pte examples/demo-apps/android/ExecuTorchDemo/app/src/main/assets/ ``` @@ -68,22 +67,20 @@ We build the required ExecuTorch runtime library to run the model. ```bash export ANDROID_NDK= -export BUCK2=/tmp/buck2 # Or your buck path +export ANDROID_ABI=arm64-v8a -rm -rf cmake-out && mkdir cmake-out && cd cmake-out +rm -rf cmake-android-out && mkdir cmake-android-out # Build the core executorch library -cmake .. -DCMAKE_INSTALL_PREFIX=cmake-out \ +cmake . 
-DCMAKE_INSTALL_PREFIX=cmake-android-out \ -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake" \ -DANDROID_ABI="${ANDROID_ABI}" \ - -DBUCK2="${BUCK2}" \ -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_BUILD_FLATC=OFF \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ - -DFLATC_EXECUTABLE="${FLATC}" \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -Bcmake-android-out -cmake --build . -j16 --target install +cmake --build cmake-android-out -j16 --target install ``` When we set `EXECUTORCH_BUILD_XNNPACK=ON`, we will build the target [`xnnpack_backend`](https://github.com/pytorch/executorch/blob/main/backends/xnnpack/CMakeLists.txt) which in turn is linked into libexecutorch_jni via [CMake](https://github.com/pytorch/executorch/blob/main/examples/demo-apps/android/jni/CMakeLists.txt). @@ -93,45 +90,53 @@ When we set `EXECUTORCH_BUILD_XNNPACK=ON`, we will build the target [`xnnpack_ba ```bash # Build the android extension -cmake ../extension/android -DBUCK2="${BUCK2}" \ - -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \ +cmake extension/android \ + -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}"/build/cmake/android.toolchain.cmake \ -DANDROID_ABI="${ANDROID_ABI}" \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ - -Bextension/android + -DCMAKE_INSTALL_PREFIX=cmake-android-out \ + -Bcmake-android-out/extension/android -cmake --build ./extension/android -j16 +cmake --build cmake-android-out/extension/android -j16 ``` `libexecutorch_jni.so` wraps up the required XNNPACK Backend runtime library from `xnnpack_backend`, and adds an additional JNI layer using fbjni. This is later exposed to Java app. #### Qualcomm Hexagon NPU -1. Configure the CMake target for the library with Qualcomm Hexagon NPU (HTP) backend (XNNPACK also included): +1. Build the CMake target for the library with Qualcomm Hexagon NPU (HTP) backend (XNNPACK also included): ```bash export ANDROID_NDK= -export QNN_SDK= - -rm -rf cmake-out && mkdir cmake-out && cd cmake-out -cmake .. \ - -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ - -DANDROID_ABI=arm64-v8a \ - -DBUCK2=/tmp/buck2 \ - -DEXECUTORCH_BUILD_ANDROID_JNI=ON \ +export ANDROID_ABI=arm64-v8a +export QNN_SDK_ROOT= + +rm -rf cmake-android-out && mkdir cmake-android-out && cd cmake-android-out +cmake . -DCMAKE_INSTALL_PREFIX=cmake-android-out \ + -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake" \ + -DANDROID_ABI="${ANDROID_ABI}" \ -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_BUILD_FLATC=OFF \ -DEXECUTORCH_BUILD_QNN=ON \ - -DQNN_SDK_ROOT=$QNN_SDK \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON + -DQNN_SDK_ROOT="${QNN_SDK_ROOT}" \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -Bcmake-android-out + +cmake --build cmake-android-out -j16 --target install ``` Similar to the XNNPACK library, with this setup, we compile `libexecutorch_jni.so` but it adds an additional static library `qnn_executorch_backend` which wraps up Qualcomm HTP runtime library and registers the Qualcomm HTP backend. This is later exposed to Java app. `qnn_executorch_backend` is built when we turn on CMake option `EXECUTORCH_BUILD_QNN`. It will include the [CMakeLists.txt](https://github.com/pytorch/executorch/blob/main/backends/qualcomm/CMakeLists.txt) from backends/qualcomm where we `add_library(qnn_executorch_backend STATIC)`. -2. Build the libraries: +2. Build the Android extension: ```bash -cmake --build . 
-j16 +cmake extension/android \ + -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}"/build/cmake/android.toolchain.cmake \ + -DANDROID_ABI="${ANDROID_ABI}" \ + -DCMAKE_INSTALL_PREFIX=cmake-android-out \ + -Bcmake-android-out/extension/android + +cmake --build cmake-android-out/extension/android -j16 ``` ## Deploying on Device via Demo App @@ -139,14 +144,9 @@ cmake --build . -j16 ### Steps for Deploying Model via XNNPACK ```bash -mkdir -p ../examples/demo-apps/android/ExecuTorchDemo/app/src/main/jniLibs/arm64-v8a -``` - -Copy the core libraries: - -```bash -cp ./examples/demo-apps/android/jni/libexecutorch_jni.so \ - ../examples/demo-apps/android/ExecuTorchDemo/app/src/main/jniLibs/arm64-v8a/libexecutorch.so +mkdir -p examples/demo-apps/android/ExecuTorchDemo/app/src/main/jniLibs/arm64-v8a +cp cmake-android-out/extension/android/libexecutorch_jni.so \ + examples/demo-apps/android/ExecuTorchDemo/app/src/main/jniLibs/arm64-v8a/libexecutorch.so ``` This allows the Android app to load ExecuTorch runtime with XNNPACK backend as a JNI library. Later, this shared library will be loaded by `NativePeer.java` in Java code. @@ -160,15 +160,17 @@ mkdir -p ../examples/demo-apps/android/ExecuTorchDemo/app/src/main/jniLibs/arm64 We need to push some additional Qualcomm HTP backend libraries to the app. Please refer to [Qualcomm docs](build-run-qualcomm-ai-engine-direct-backend.md) here. ```bash -cp ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtp.so ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV69Skel.so ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpStub.so ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnSystem.so \ - ../examples/demo-apps/android/ExecuTorchDemo/app/src/main/jniLibs/arm64-v8a +cp ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtp.so ${QNN_SDK_ROOT}/lib/hexagon-v69/unsigned/libQnnHtpV69Skel.so ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV69Stub.so ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnSystem.so \ + examples/demo-apps/android/ExecuTorchDemo/app/src/main/jniLibs/arm64-v8a ``` Copy the core libraries: ```bash -cp ./examples/demo-apps/android/jni/libexecutorch_jni.so \ - ../examples/demo-apps/android/ExecuTorchDemo/app/src/main/jniLibs/arm64-v8a/libexecutorch.so +cp cmake-android-out/extension/android/libexecutorch_jni.so \ + examples/demo-apps/android/ExecuTorchDemo/app/src/main/jniLibs/arm64-v8a/libexecutorch.so +cp cmake-android-out/lib/libqnn_executorch_backend.so \ + examples/demo-apps/android/ExecuTorchDemo/app/src/main/jniLibs/arm64-v8a/libqnn_executorch_backend.so ``` ## Running the App diff --git a/examples/demo-apps/android/ExecuTorchDemo/app/build.gradle.kts b/examples/demo-apps/android/ExecuTorchDemo/app/build.gradle.kts index 4407fbc3fe6..615fee860f8 100644 --- a/examples/demo-apps/android/ExecuTorchDemo/app/build.gradle.kts +++ b/examples/demo-apps/android/ExecuTorchDemo/app/build.gradle.kts @@ -68,3 +68,12 @@ dependencies { debugImplementation("androidx.compose.ui:ui-tooling") debugImplementation("androidx.compose.ui:ui-test-manifest") } + +tasks.register("setup") { + doFirst { + exec { + commandLine("sh", "examples/demo-apps/android/LlamaDemo/setup.sh") + workingDir("../../../../../") + } + } +} diff --git a/examples/demo-apps/android/ExecuTorchDemo/setup.sh b/examples/demo-apps/android/ExecuTorchDemo/setup.sh index 66be7da3157..8ff65bee59b 100644 --- a/examples/demo-apps/android/ExecuTorchDemo/setup.sh +++ b/examples/demo-apps/android/ExecuTorchDemo/setup.sh @@ -1,40 +1,40 @@ #!/usr/bin/env bash +# Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. 
# -# Copyright 2023-2024 Arm Limited and/or its affiliates. -# # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. set -eu -# Note: Set up ANDROID_NDK, ANDROID_ABI, BUCK2, and FLATC -cmake . -DCMAKE_INSTALL_PREFIX=cmake-out \ +CMAKE_OUT="${CMAKE_OUT:-cmake-out-android}" +# Note: Set up ANDROID_NDK and ANDROID_ABI +cmake . -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake" \ -DANDROID_ABI="${ANDROID_ABI}" \ - -DBUCK2="${BUCK2}" \ -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_BUILD_FLATC=OFF \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ - -DFLATC_EXECUTABLE="${FLATC}" \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ - -Bcmake-out + -DEXECUTORCH_BUILD_OPTIMIZED=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -B"${CMAKE_OUT}" if [ "$(uname)" == "Darwin" ]; then CMAKE_JOBS=$(( $(sysctl -n hw.ncpu) - 1 )) else CMAKE_JOBS=$(( $(nproc) - 1 )) fi -cmake --build cmake-out -j "${CMAKE_JOBS}" --target install +cmake --build "${CMAKE_OUT}" -j "${CMAKE_JOBS}" --target install --config Release -cmake extension/android -DBUCK2="${BUCK2}" \ +cmake extension/android \ -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \ -DANDROID_ABI="${ANDROID_ABI}" \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ - -Bcmake-out/extension/android + -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ + -DCMAKE_BUILD_TYPE=Release \ + -B"${CMAKE_OUT}"/extension/android -cmake --build cmake-out/extension/android -j "${CMAKE_JOBS}" +cmake --build "${CMAKE_OUT}"/extension/android -j "${CMAKE_JOBS}" --config Release JNI_LIBS_PATH="examples/demo-apps/android/ExecuTorchDemo/app/src/main/jniLibs" mkdir -p "${JNI_LIBS_PATH}/${ANDROID_ABI}" -cp cmake-out/extension/android/libexecutorch_jni.so "${JNI_LIBS_PATH}/${ANDROID_ABI}/libexecutorch.so" +cp "${CMAKE_OUT}"/extension/android/libexecutorch_jni.so "${JNI_LIBS_PATH}/${ANDROID_ABI}/" diff --git a/examples/demo-apps/android/LlamaDemo/README.md b/examples/demo-apps/android/LlamaDemo/README.md index fccc4288f53..0c70ec1620a 100644 --- a/examples/demo-apps/android/LlamaDemo/README.md +++ b/examples/demo-apps/android/LlamaDemo/README.md @@ -13,9 +13,7 @@ This app demonstrates the use of the LLaMA chat app demonstrating local inferenc * Alternatively, you can follow [this guide](https://github.com/pytorch/executorch/blob/856e085b9344c8b0bf220a97976140a5b76356aa/examples/demo-apps/android/LlamaDemo/SDK.md) to set up Java/SDK/NDK with CLI. * Supported Host OS: CentOS, macOS Sonoma on Apple Silicon. -```{note} -This demo app and tutorial has only been validated with arm64-v8a [ABI](https://developer.android.com/ndk/guides/abis), with NDK 25.0.8775105. -``` +Note: This demo app and tutorial has only been validated with arm64-v8a [ABI](https://developer.android.com/ndk/guides/abis), with NDK 25.0.8775105. ## Getting models Please refer to the [ExecuTorch Llama2 docs](https://github.com/pytorch/executorch/blob/main/examples/models/llama2/README.md) to export the model. @@ -27,22 +25,19 @@ adb push llama2.pte /data/local/tmp/llama adb push tokenizer.bin /data/local/tmp/llama ``` -```{note} -The demo app searches in `/data/local/tmp/llama` for .pte and .bin files as LLAMA model and tokenizer. -``` +Note: The demo app searches in `/data/local/tmp/llama` for .pte and .bin files as LLAMA model and tokenizer. ## Build JNI library 1. Open a terminal window and navigate to the root directory of the `executorch`. 2. 
Set the following environment variables: -```{note} - is the root for the NDK, which is usually under -~/Library/Android/sdk/ndk/XX.Y.ZZZZZ for macOS, and contains NOTICE and README.md. -We use /build/cmake/android.toolchain.cmake for CMake to cross-compile. -``` ```bash export ANDROID_NDK= export ANDROID_ABI=arm64-v8a ``` +Note: `` is the root for the NDK, which is usually under +`~/Library/Android/sdk/ndk/XX.Y.ZZZZZ` for macOS, and contains NOTICE and README.md. +We use `/build/cmake/android.toolchain.cmake` for CMake to cross-compile. + 3. Run the following command set up the required JNI library: ```bash pushd examples/demo-apps/android/LlamaDemo diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java index f2529eb6b86..f9fc7dd3137 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java @@ -79,9 +79,9 @@ private void setLocalModel(String modelPath, String tokenizerPath) { if (loadResult != 0) { AlertDialog.Builder builder = new AlertDialog.Builder(this); builder.setTitle("Load failed: " + loadResult); - AlertDialog alert = builder.create(); runOnUiThread( () -> { + AlertDialog alert = builder.create(); alert.show(); }); } diff --git a/examples/demo-apps/android/LlamaDemo/setup.sh b/examples/demo-apps/android/LlamaDemo/setup.sh index 8bdba698645..f515aa22cc7 100644 --- a/examples/demo-apps/android/LlamaDemo/setup.sh +++ b/examples/demo-apps/android/LlamaDemo/setup.sh @@ -16,6 +16,7 @@ cmake . -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ -DCMAKE_BUILD_TYPE=Release \ -B"${CMAKE_OUT}" @@ -30,6 +31,7 @@ cmake examples/models/llama2 \ -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ -DANDROID_ABI="$ANDROID_ABI" \ -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ -DCMAKE_BUILD_TYPE=Release \ -B"${CMAKE_OUT}"/examples/models/llama2 diff --git a/examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo.xcodeproj/project.pbxproj b/examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo.xcodeproj/project.pbxproj index ce20a78e8e4..6ebbca28daa 100644 --- a/examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo.xcodeproj/project.pbxproj +++ b/examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo.xcodeproj/project.pbxproj @@ -13,7 +13,7 @@ 032C01B72AC329B6002955E1 /* CustomViews.swift in Sources */ = {isa = PBXBuildFile; fileRef = 032C01B62AC329B6002955E1 /* CustomViews.swift */; }; 032C01B92AC32ADF002955E1 /* CameraController.swift in Sources */ = {isa = PBXBuildFile; fileRef = 032C01B82AC32ADF002955E1 /* CameraController.swift */; }; 032C01E82AC34B60002955E1 /* MobileNetClassifier.mm in Sources */ = {isa = PBXBuildFile; fileRef = 032C01902AC22B16002955E1 /* MobileNetClassifier.mm */; }; - 032C01EC2AC34CAC002955E1 /* libMobileNetClassifier.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 032C01CB2AC34632002955E1 /* libMobileNetClassifier.a */; }; + 032C01EC2AC34CAC002955E1 /* libMobileNetClassifier.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 032C01CB2AC34632002955E1 /* libMobileNetClassifier.a */; platformFilter = ios; }; 
032C02032AC47CFB002955E1 /* mv3_xnnpack_fp32.pte in Resources */ = {isa = PBXBuildFile; fileRef = 032C01FC2AC47CFB002955E1 /* mv3_xnnpack_fp32.pte */; }; 032C02082AC47CFB002955E1 /* imagenet_classes.txt in Resources */ = {isa = PBXBuildFile; fileRef = 032C02012AC47CFB002955E1 /* imagenet_classes.txt */; }; 036834D52ACB710D00BA100F /* mv3.pte in Resources */ = {isa = PBXBuildFile; fileRef = 036834D42ACB710D00BA100F /* mv3.pte */; }; @@ -480,6 +480,7 @@ /* Begin PBXTargetDependency section */ 032C01EB2AC34CA8002955E1 /* PBXTargetDependency */ = { isa = PBXTargetDependency; + platformFilter = ios; target = 032C01CA2AC34632002955E1 /* MobileNetClassifier */; targetProxy = 032C01EA2AC34CA8002955E1 /* PBXContainerItemProxy */; }; @@ -635,9 +636,12 @@ PRODUCT_BUNDLE_IDENTIFIER = org.pytorch.executorch.demo; PRODUCT_NAME = "$(PROJECT_NAME)"; PROVISIONING_PROFILE_SPECIFIER = ""; + SUPPORTED_PLATFORMS = "iphoneos iphonesimulator"; SUPPORTS_MACCATALYST = NO; SUPPORTS_MAC_DESIGNED_FOR_IPHONE_IPAD = NO; + SUPPORTS_XR_DESIGNED_FOR_IPHONE_IPAD = NO; SWIFT_EMIT_LOC_STRINGS = YES; + TARGETED_DEVICE_FAMILY = "1,2"; }; name = Debug; }; @@ -660,9 +664,12 @@ PRODUCT_BUNDLE_IDENTIFIER = org.pytorch.executorch.demo; PRODUCT_NAME = "$(PROJECT_NAME)"; PROVISIONING_PROFILE_SPECIFIER = ""; + SUPPORTED_PLATFORMS = "iphoneos iphonesimulator"; SUPPORTS_MACCATALYST = NO; SUPPORTS_MAC_DESIGNED_FOR_IPHONE_IPAD = NO; + SUPPORTS_XR_DESIGNED_FOR_IPHONE_IPAD = NO; SWIFT_EMIT_LOC_STRINGS = YES; + TARGETED_DEVICE_FAMILY = "1,2"; }; name = Release; }; @@ -784,7 +791,7 @@ isa = XCRemoteSwiftPackageReference; repositoryURL = "https://github.com/pytorch/executorch"; requirement = { - branch = main; + branch = 0.2.0; kind = branch; }; }; diff --git a/examples/demo-apps/apple_ios/README.md b/examples/demo-apps/apple_ios/ExecuTorchDemo/README.md similarity index 69% rename from examples/demo-apps/apple_ios/README.md rename to examples/demo-apps/apple_ios/ExecuTorchDemo/README.md index 8c429af74a9..2f9102e7c00 100644 --- a/examples/demo-apps/apple_ios/README.md +++ b/examples/demo-apps/apple_ios/ExecuTorchDemo/README.md @@ -40,36 +40,45 @@ pip --version ### 3. Getting Started Tutorial -Before proceeding, follow the [Setting Up ExecuTorch](https://pytorch.org/executorch/stable/getting-started-setup) -tutorial to configure the basic environment. Feel free to skip building anything -just yet. Make sure you have all the required dependencies installed, including -the following tools: +Follow the [Setting Up ExecuTorch](https://pytorch.org/executorch/stable/getting-started-setup) +tutorial to configure the basic environment: -- Buck2 (as `/tmp/buck2`) -- Cmake (`cmake` reachable at `$PATH`) -- FlatBuffers Compiler (`flatc` reachable at `$PATH` or as `$FLATC_EXECUTABLE` - enironment variable) +```bash +git clone -b release/0.2 https://github.com/pytorch/executorch.git +cd executorch +git submodule update --init + +python3 -m venv .venv && source .venv/bin/activate + +./install_requirements.sh --pybind coreml mps xnnpack +``` ### 4. Backend Dependencies -Also, follow the corresponding sections from [Core ML](build-run-coreml.md) and -[MPS](build-run-mps.md) tutorials to install additional dependencies for those -backends. Feel free to skip building anything just yet. 
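Before building the app, it can be worth a quick sanity check that the Python-side ExecuTorch install from the setup step actually works. The snippet below is a minimal sketch (not part of the demo app) that exports a trivial module to a `.pte` file using the same `torch.export` / `to_edge` flow the example scripts rely on; the module and output file name are made up for illustration.

```python
# Minimal sanity check of the ExecuTorch Python environment (a sketch, assuming
# `executorch` was installed by ./install_requirements.sh as described above).
import torch
from executorch.exir import to_edge
from torch.export import export


class AddOne(torch.nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x + 1


# Export to the portable ExecuTorch program format and write it out.
exported = export(AddOne().eval(), (torch.ones(4),))
program = to_edge(exported).to_executorch()

with open("add_one.pte", "wb") as f:
    f.write(program.buffer)
print(f"Wrote add_one.pte ({len(program.buffer)} bytes)")
```

If this runs without errors, the environment is ready for the model exports below.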
+Also, follow the corresponding sections from [Core ML](https://pytorch.org/executorch/stable/build-run-coreml) and +[MPS](https://pytorch.org/executorch/stable/build-run-mps) tutorials to install additional dependencies for those +backends: + +```bash +./backends/apple/coreml/scripts/install_requirements.sh + +./backends/apple/mps/install_requirements.sh +``` ## Models and Labels -Now let's move on to exporting and bundling the MobileNet v3 model. +Now, let's move on to exporting and bundling the MobileNet v3 model. ### 1. Export Model -Export the MobileNet v3 model with Core ML, MPS and XNNPACK delegates, and move +Export the MobileNet v3 model with Core ML, MPS and XNNPACK backends, and move the exported model to a specific location where the Demo App will pick them up: ```bash python3 -m examples.portable.scripts.export --model_name="mv3" -python3 -m examples.xnnpack.aot_compiler --delegate --model_name="mv3" python3 -m examples.apple.coreml.scripts.export --model_name="mv3" python3 -m examples.apple.mps.scripts.mps_example --model_name="mv3" +python3 -m examples.xnnpack.aot_compiler --delegate --model_name="mv3" mkdir -p examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo/Resources/Models/MobileNet/ mv mv3*.pte examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo/Resources/Models/MobileNet/ @@ -84,27 +93,6 @@ curl https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt \ -o examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo/Resources/Models/MobileNet/imagenet_classes.txt ``` -## Build Runtime and Backends - -Next, we will build the necessary -[frameworks](https://developer.apple.com/documentation/xcode/creating-a-multi-platform-binary-framework-bundle) -for ExecuTorch and move them over for app linking. - -### 1. Build Frameworks - -```bash -./build/build_apple_frameworks.sh --Release --coreml --mps --xnnpack -``` - -### 2. Move Frameworks for App Linking - -Make sure to have all the `.xcframework` bundles generated at the previous step -at a specific location where the Demo App will pick them up: - -```bash -mv cmake-out examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo/Frameworks -``` - ## Final Steps We're almost done! 
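(For reference, the XNNPACK export command above is roughly equivalent to the Python below, mirroring the `export_nanogpt.py` flow added elsewhere in this change. `torchvision`'s `mobilenet_v3_small` stands in for the example's own model factory, so treat the exact model, weights, and file name as assumptions rather than what `aot_compiler` literally does.)

```python
# A rough sketch of what `examples.xnnpack.aot_compiler --delegate --model_name="mv3"`
# does: trace MobileNet v3, delegate supported subgraphs to XNNPACK, and save a .pte.
import torch
from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
from executorch.backends.xnnpack.utils.configs import get_xnnpack_edge_compile_config
from executorch.exir import to_edge
from torch._export import capture_pre_autograd_graph
from torch.export import export
from torchvision.models import mobilenet_v3_small  # stand-in for the example's model factory

model = mobilenet_v3_small(weights=None).eval()
example_inputs = (torch.randn(1, 3, 224, 224),)

# Trace the model into an exported program.
with torch.no_grad():
    pre_autograd = capture_pre_autograd_graph(model, example_inputs)
    traced = export(pre_autograd, example_inputs)

# Lower to XNNPACK: partition supported subgraphs and delegate them to the backend.
edge = to_edge(traced, compile_config=get_xnnpack_edge_compile_config())
edge = edge.to_backend(XnnpackPartitioner())
et_program = edge.to_executorch()

with open("mv3_xnnpack_fp32.pte", "wb") as f:
    f.write(et_program.buffer)
```

The Core ML and MPS export scripts broadly follow the same pattern with their own partitioners and backends.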
Now, we just need to open the project in Xcode, run the diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj b/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj index 80ab3c34b0d..5810ee9b559 100644 --- a/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj +++ b/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj @@ -607,7 +607,7 @@ SUPPORTS_MAC_DESIGNED_FOR_IPHONE_IPAD = NO; SUPPORTS_XR_DESIGNED_FOR_IPHONE_IPAD = NO; SWIFT_EMIT_LOC_STRINGS = YES; - TARGETED_DEVICE_FAMILY = 1; + TARGETED_DEVICE_FAMILY = "1,2"; }; name = Debug; }; @@ -665,7 +665,7 @@ SUPPORTS_MAC_DESIGNED_FOR_IPHONE_IPAD = NO; SUPPORTS_XR_DESIGNED_FOR_IPHONE_IPAD = NO; SWIFT_EMIT_LOC_STRINGS = YES; - TARGETED_DEVICE_FAMILY = 1; + TARGETED_DEVICE_FAMILY = "1,2"; }; name = Release; }; @@ -708,7 +708,7 @@ SUPPORTS_MAC_DESIGNED_FOR_IPHONE_IPAD = NO; SUPPORTS_XR_DESIGNED_FOR_IPHONE_IPAD = NO; SWIFT_EMIT_LOC_STRINGS = YES; - TARGETED_DEVICE_FAMILY = 1; + TARGETED_DEVICE_FAMILY = "1,2"; VERSIONING_SYSTEM = "apple-generic"; VERSION_INFO_PREFIX = ""; }; @@ -753,7 +753,7 @@ SUPPORTS_MAC_DESIGNED_FOR_IPHONE_IPAD = NO; SUPPORTS_XR_DESIGNED_FOR_IPHONE_IPAD = NO; SWIFT_EMIT_LOC_STRINGS = YES; - TARGETED_DEVICE_FAMILY = 1; + TARGETED_DEVICE_FAMILY = "1,2"; VERSIONING_SYSTEM = "apple-generic"; VERSION_INFO_PREFIX = ""; }; @@ -796,7 +796,7 @@ isa = XCRemoteSwiftPackageReference; repositoryURL = "https://github.com/pytorch/executorch"; requirement = { - branch = main; + branch = 0.2.0; kind = branch; }; }; diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/ContentView.swift b/examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/ContentView.swift index 5d7ddbc388f..9afb0cafb37 100644 --- a/examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/ContentView.swift +++ b/examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/ContentView.swift @@ -150,6 +150,7 @@ struct ContentView: View { } } } + .navigationViewStyle(StackNavigationViewStyle()) } private func generate() { @@ -215,7 +216,7 @@ struct ContentView: View { tokens.append(token) if tokens.count > 2 { let text = tokens.joined() - let count = text.count + let count = tokens.count tokens = [] DispatchQueue.main.async { withAnimation { diff --git a/examples/demo-apps/apple_ios/LLaMA/README.md b/examples/demo-apps/apple_ios/LLaMA/README.md new file mode 100644 index 00000000000..ddd542a0066 --- /dev/null +++ b/examples/demo-apps/apple_ios/LLaMA/README.md @@ -0,0 +1,52 @@ +# Building ExecuTorch LLaMA iOS Demo App + +This app demonstrates the use of the LLaMA chat app demonstrating local inference use case with ExecuTorch. + +## Prerequisites +* [Xcode 15](https://developer.apple.com/xcode) +* [iOS 17 SDK](https://developer.apple.com/ios) +* Set up your ExecuTorch repo and environment if you haven’t done so by following the [Setting up ExecuTorch](https://pytorch.org/executorch/stable/getting-started-setup) to set up the repo and dev environment: + +```bash +git clone -b release/0.2 https://github.com/pytorch/executorch.git +cd executorch +git submodule update --init + +python3 -m venv .venv && source .venv/bin/activate + +./install_requirements.sh +``` + +## Exporting models +Please refer to the [ExecuTorch Llama2 docs](https://github.com/pytorch/executorch/blob/main/examples/models/llama2/README.md) to export the model. + +## Run the App + +1. Open the [project](https://github.com/pytorch/executorch/blob/main/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj) in Xcode. +2. Run the app (cmd+R). +3. 
In the app UI, pick a model and tokenizer to use, type a prompt and tap the arrow button + +```{note} +ExecuTorch runtime is distributed as a Swift package providing some .xcframework as prebuilt binary targets. +Xcode will download and cache the package on the first run, which will take some time. +``` + +## Copy the model to Simulator + +1. Drag&drop the model and tokenizer files onto the Simulator window and save them somewhere inside the iLLaMA folder. +2. Pick the files in the app dialog, type a prompt and click the arrow-up button. + +## Copy the model to Device + +1. Wire-connect the device and open the contents in Finder. +2. Navigate to the Files tab and drag&drop the model and tokenizer files onto the iLLaMA folder. +3. Wait until the files are copied. + +Click the image below to see it in action! + + + iOS app running a LLaMA model + + +## Reporting Issues +If you encounter any bugs or issues while following this tutorial, please file a bug/issue here on [GitHub](https://github.com/pytorch/executorch/issues/new).
diff --git a/examples/llm_manual/CMakeLists.txt b/examples/llm_manual/CMakeLists.txt new file mode 100644 index 00000000000..c605e947409 --- /dev/null +++ b/examples/llm_manual/CMakeLists.txt @@ -0,0 +1,33 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +cmake_minimum_required(VERSION 3.19) +project(nanogpt_runner) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED True) + +# Set options for executorch build. +option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "" ON) +option(EXECUTORCH_BUILD_EXTENSION_MODULE "" ON) +option(EXECUTORCH_BUILD_OPTIMIZED "" ON) +option(EXECUTORCH_BUILD_XNNPACK "" ON) # Build with Xnnpack backend + +# Include the executorch subdirectory. +add_subdirectory( + ${CMAKE_CURRENT_SOURCE_DIR}/third-party/executorch + ${CMAKE_BINARY_DIR}/executorch) + +# include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src) + +add_executable(nanogpt_runner main.cpp) +target_link_libraries( + nanogpt_runner + PRIVATE + executorch + extension_module_static # Provides the Module class + optimized_native_cpu_ops_lib # Provides baseline cross-platform kernels + xnnpack_backend) # Provides the XNNPACK CPU acceleration backend
diff --git a/examples/llm_manual/README.md b/examples/llm_manual/README.md new file mode 100644 index 00000000000..0ee6bb6a9f1 --- /dev/null +++ b/examples/llm_manual/README.md @@ -0,0 +1,3 @@ +# LLM Manual + +This directory is a storage place for the files that the [LLM Manual](https://pytorch.org/executorch/main/llm/getting-started.html) needs. Please refer to the documentation website for more information.
diff --git a/examples/llm_manual/basic_sampler.h b/examples/llm_manual/basic_sampler.h new file mode 100644 index 00000000000..a95b823de8d --- /dev/null +++ b/examples/llm_manual/basic_sampler.h @@ -0,0 +1,20 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include <algorithm> +#include <vector> +class BasicSampler { + public: + BasicSampler() {} + int64_t sample(std::vector<float> logits) { + // Find the token with the highest log probability.
+ int64_t max_index = + std::max_element(logits.begin(), logits.end()) - logits.begin(); + return max_index; + } +}; diff --git a/examples/llm_manual/basic_tokenizer.h b/examples/llm_manual/basic_tokenizer.h new file mode 100644 index 00000000000..eb51d15fc50 --- /dev/null +++ b/examples/llm_manual/basic_tokenizer.h @@ -0,0 +1,192 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include + +class BasicTokenizer { + public: + BasicTokenizer(const std::string& filePath) { + std::ifstream file(filePath); + + if (!file) { + std::cerr << "Unable to open file"; + exit(9); // return with error code + } + std::string str( + (std::istreambuf_iterator(file)), + std::istreambuf_iterator()); + + size_t i = 0u; + i = consume_whitespace(str, i); + i = expect(str, i, '{'); + + while (i < str.size() && str[i] != '}') { + i = consume_field(str, i); + } + + // Build decode map as inverse of encode. + for (auto& i : encode_) { + decode_[i.second] = i.first; + } + } + + std::vector encode(const std::string& prompt) { + std::vector words = parse_prompt(prompt); + std::vector result; + for (auto word : words) { + result.push_back(encode_[word]); + } + return result; + } + + std::string decode(const std::vector& indices) { + std::string result; + for (const auto& index : indices) { + result += decode_[index]; + } + return result; + } + + private: + std::unordered_map encode_; + std::unordered_map decode_; + + // Advance the input string index until a non-whitespace character is found + // or it reaches the end of string. + size_t consume_whitespace(const std::string& data, size_t i) { + while (i < data.size() && std::isspace(data[i])) { + i++; + } + + return i; + } + + // Consumes an JSON field of the form + // "str": id, + size_t consume_field(const std::string& data, size_t i) { + i = consume_whitespace(data, i); + + // Parse the key literal. + i = expect(data, i, '"'); + + auto in_escape = false; + std::string key = ""; + while (i < data.size()) { + if (in_escape) { + key += data[i]; + i++; + in_escape = false; + } else { // !in_escape + if (data[i] == '"') { // End of string literal + i++; + break; + } else if (data[i] == '\\') { // Escaped code point + in_escape = true; + } + key += data[i]; + i++; + } + } + + key = post_process_key(key); + + i = expect(data, i, ':'); + i = consume_whitespace(data, i); + + // Read unsigned integer value + auto value_start = i; + while (i < data.size() && std::isdigit(data[i])) { + i++; + } + auto value = static_cast( + std::stol(data.substr(value_start, i - value_start))); + + encode_[key] = value; + + i = consume_whitespace(data, i); + if (i < data.size() && data[i] == ',') { + i++; + } + + return i; + } + + // Assert that the next character in the input string is equal to c. Increment + // the input string index by one. + size_t expect(const std::string& data, size_t i, char c) { + if (i >= data.size() || data[i] != c) { + std::cerr << "Invalid tokenizer vocabulary file. Expected '" << c + << "' at index " << i << std::endl; + exit(1); + } + + return i + 1; + } + + std::string post_process_key(std::string key) { + // Replace the unicode characters with the corresponding byte encoding + // TODO: adopt byte encoder to handle unicode characters in json file. 
+ + std::unordered_map replacements = { + {"\\u0120", " "}, + {"\\u010a", "\n"}, + }; + + for (const auto& replacement : replacements) { + size_t pos = 0; + // While loop through all instances of the substring in the string + while ((pos = key.find(replacement.first, pos)) != std::string::npos) { + key.replace(pos, replacement.first.length(), replacement.second); + pos += replacement.second.length(); + } + } + + // remove duplicate backslashes + for (size_t idx = 0; idx < key.length(); idx++) { + if (key[idx] == '\\') { + key.erase(idx, 1); + if (key[idx] == '\\') { + // If there are two backslashes, keep the second one + idx += 1; + } + } + } + + return key; + } + std::vector parse_prompt(const std::string& prompt) { + std::vector result; + std::string word; + for (char c : prompt) { + if (c == ' ') { + if (!word.empty()) { + result.push_back(word); + word.clear(); + } + word += c; + } else if (ispunct(c)) { + if (!word.empty()) { + result.push_back(word); + word.clear(); + } + result.push_back(std::string(1, c)); + } else { + word += c; + } + } + if (!word.empty()) { + result.push_back(word); + } + return result; + } +}; diff --git a/examples/llm_manual/export_nanogpt.py b/examples/llm_manual/export_nanogpt.py new file mode 100644 index 00000000000..cf29a69c080 --- /dev/null +++ b/examples/llm_manual/export_nanogpt.py @@ -0,0 +1,45 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# export_nanogpt.py + +# Load partitioner for Xnnpack backend +import torch +from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner + +# Model to be delegated to specific backend should use specific edge compile config +from executorch.backends.xnnpack.utils.configs import get_xnnpack_edge_compile_config +from executorch.exir import to_edge + +from model import GPT +from torch._export import capture_pre_autograd_graph +from torch.export import export +from torch.nn.attention import sdpa_kernel, SDPBackend + +model = GPT.from_pretrained("gpt2") # use gpt2 weight as pretrained weight +example_inputs = ( + torch.randint(0, 100, (1, model.config.block_size), dtype=torch.long), +) +dynamic_shape = ({1: torch.export.Dim("token_dim", max=model.config.block_size)},) + +# Trace the model, converting it to a portable intermediate representation. +# The torch.no_grad() call tells PyTorch to exclude training-specific logic. +with sdpa_kernel([SDPBackend.MATH]), torch.no_grad(): + m = capture_pre_autograd_graph(model, example_inputs, dynamic_shapes=dynamic_shape) + traced_model = export(m, example_inputs, dynamic_shapes=dynamic_shape) + +# Convert the model into a runnable ExecuTorch program. +# To be further lowered to Xnnpack backend, `traced_model` needs xnnpack-specific edge compile config +edge_config = get_xnnpack_edge_compile_config() +edge_manager = to_edge(traced_model, compile_config=edge_config) + +# Delegate exported model to Xnnpack backend by invoking `to_backend` function with Xnnpack partitioner. +edge_manager = edge_manager.to_backend(XnnpackPartitioner()) +et_program = edge_manager.to_executorch() + +# Save the Xnnpack-delegated ExecuTorch program to a file. 
+with open("nanogpt.pte", "wb") as file: + file.write(et_program.buffer) diff --git a/examples/llm_manual/main.cpp b/examples/llm_manual/main.cpp new file mode 100644 index 00000000000..2b336059cff --- /dev/null +++ b/examples/llm_manual/main.cpp @@ -0,0 +1,122 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// main.cpp + +#include +#include +#include +#include + +#include "basic_sampler.h" +#include "basic_tokenizer.h" +#include "managed_tensor.h" + +#include +#include +#include +#include +#include + +using namespace torch::executor; + +using SizesType = exec_aten::SizesType; +using DimOrderType = exec_aten::DimOrderType; +using StridesType = exec_aten::StridesType; + +// main.cpp + +#define ENDOFTEXT 50256 + +std::string generate( + Module& llm_model, + std::string& prompt, + BasicTokenizer& tokenizer, + BasicSampler& sampler, + size_t max_input_length, + size_t max_output_length) { + // Convert the input text into a list of integers (tokens) that represents + // it, using the string-to-token mapping that the model was trained on. + // Each token is an integer that represents a word or part of a word. + std::vector input_tokens = tokenizer.encode(prompt); + std::vector output_tokens; + + for (auto i = 0u; i < max_output_length; i++) { + // Convert the input_tokens from a vector of int64_t to EValue. + // EValue is a unified data type in the ExecuTorch runtime. + ManagedTensor tensor_tokens( + input_tokens.data(), + {1, static_cast(input_tokens.size())}, + ScalarType::Long); + std::vector inputs = {tensor_tokens.get_tensor()}; + + // Run the model. It will return a tensor of logits (log-probabilities). + Result> logits_evalue = llm_model.forward(inputs); + + // Convert the output logits from EValue to std::vector, which is what + // the sampler expects. + Tensor logits_tensor = logits_evalue.get()[0].toTensor(); + std::vector logits( + logits_tensor.data_ptr(), + logits_tensor.data_ptr() + logits_tensor.numel()); + + // Sample the next token from the logits. + int64_t next_token = sampler.sample(logits); + + // Break if we reached the end of the text. + if (next_token == ENDOFTEXT) { + break; + } + + // Add the next token to the output. + output_tokens.push_back(next_token); + + std::cout << tokenizer.decode({next_token}); + std::cout.flush(); + + // Update next input. + input_tokens.push_back(next_token); + if (input_tokens.size() > max_input_length) { + input_tokens.erase(input_tokens.begin()); + } + } + + std::cout << std::endl; + + // Convert the output tokens into a human-readable string. + std::string output_string = tokenizer.decode(output_tokens); + return output_string; +} + +// main.cpp + +int main() { + // Set up the prompt. This provides the seed text for the model to elaborate. + std::cout << "Prompt: "; + std::string prompt; + std::getline(std::cin, prompt); + + // The tokenizer is used to convert between tokens (used by the model) and + // human-readable strings. + BasicTokenizer tokenizer("vocab.json"); + + // The sampler is used to sample the next token from the logits. + BasicSampler sampler = BasicSampler(); + + // Load the exported nanoGPT program, which was generated via the previous + // steps. 
+ Module model( + "nanogpt.pte", + torch::executor::Module::MlockConfig::UseMlockIgnoreErrors); + + const auto max_input_tokens = 1024; + const auto max_output_tokens = 30; + std::cout << prompt; + generate( + model, prompt, tokenizer, sampler, max_input_tokens, max_output_tokens); +} diff --git a/examples/llm_manual/managed_tensor.h b/examples/llm_manual/managed_tensor.h new file mode 100644 index 00000000000..d401ae4d18b --- /dev/null +++ b/examples/llm_manual/managed_tensor.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include + +#include + +#pragma once + +namespace torch { +namespace executor { + +/** + * A tensor wrapper takes ownership of all the memory of the necessary metadata + * for torch::executor::Tensor. Note that it doesn't own the data memory. + */ +class ManagedTensor { + public: + /// The type used for elements of `sizes()`. + using SizesType = exec_aten::SizesType; + /// The type used for elements of `dim_order()`. + using DimOrderType = exec_aten::DimOrderType; + /// The type used for elements of `strides()`. + using StridesType = exec_aten::StridesType; + ManagedTensor() = delete; + + explicit ManagedTensor( + void* data, + const std::vector& sizes, + ScalarType dtype) + : dtype_(dtype), sizes_(sizes), data_ptr_(data) { + ssize_t dim = sizes.size(); + dim_order_.resize(dim); + strides_.resize(dim); + for (size_t i = 0; i < dim; ++i) { + dim_order_[i] = i; + } + dim_order_to_stride_nocheck( + sizes.data(), dim_order_.data(), dim, strides_.data()); + tensor_impl_ = std::make_unique( + dtype_, + dim, + sizes_.data(), + data_ptr_, + dim_order_.data(), + strides_.data(), + TensorShapeDynamism::DYNAMIC_BOUND); + } + + /** + * Get the Tensor object managed by this class. + */ + Tensor get_tensor() { + return Tensor(tensor_impl_.get()); + } + + private: + void* data_ptr_ = nullptr; + std::unique_ptr tensor_impl_; + std::vector sizes_; + std::vector strides_; + std::vector dim_order_; + ScalarType dtype_; +}; +} // namespace executor +} // namespace torch diff --git a/examples/models/llama2/README.md b/examples/models/llama2/README.md index d392673d34a..31ef935eb69 100644 --- a/examples/models/llama2/README.md +++ b/examples/models/llama2/README.md @@ -30,16 +30,19 @@ We evaluated WikiText perplexity using [LM Eval](https://github.com/EleutherAI/l Note that groupsize less than 128 was not enabled, since such model were still too large. This is because our current efforts have focused on enabling FP32 and support for FP16 is under way. What this implies for model size is that 1) embedding table is in FP32 and 2) quantized weights scales are FP32. +## Enablement + +We have verified running Llama 2 7B [mobile applications](#step-6-build-mobile-apps) efficiently on select devices including the iPhone 15 Pro, iPhone 15 Pro Max, Samsung Galaxy S22 and S24, and One Plus 12. + ## Performance -Performance was measured on Samsung Galaxy S22, S23, S24 and One Plus 12. Measurement performance is in terms of tokens/second. +Performance was measured on the Samsung Galaxy S22, S24, and One Plus 12 devices. The performance measurement is expressed in terms of tokens per second using an [adb binary-based approach](#step-5-run-benchmark-on). 
|Device | Groupwise 4-bit (128) | Groupwise 4-bit (256) |--------| ---------------------- | --------------- -|Galaxy S22 | 8.15 tokens/second | 8.3 tokens/second | +|Galaxy S22 | 8.15 tokens/second | 8.3 tokens/second | |Galaxy S24 | 10.66 tokens/second | 11.26 tokens/second | |One plus 12 | 11.55 tokens/second | 11.6 tokens/second | -|iPhone 15 pro | x | x | # Instructions @@ -61,10 +64,17 @@ You can export and run the original Llama2 7B model. 1. Llama2 pretrained parameters can be downloaded from [Meta's official website](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) or from [Hugging Face](https://huggingface.co/meta-llama/Llama-2-7b). -2. Export model and generate `.pte` file: +2. Edit `params.json` file. Replace `"vocab_size": -1` with `"vocab_size": 32000`. This is a short-term workaround. + +3. Export model and generate `.pte` file: ``` python -m examples.models.llama2.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 ``` +4. Create tokenizer.bin. + + ``` + python -m examples.models.llama2.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin + ``` ### Option B: Download and export stories110M model @@ -89,6 +99,18 @@ If you want to deploy and run a smaller model for educational purposes. From `ex python -m examples.models.llama2.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin ``` +### Option C: Download and export Llama3 8B model + +You can export and run the original Llama3 8B model. + +1. Llama3 pretrained parameters can be downloaded from [Meta's official llama3 repository](https://github.com/meta-llama/llama3/). + +2. Export model and generate `.pte` file + ``` + python -m examples.models.llama2.export_llama --checkpoint -p -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_id":128001}' --embedding-quantize 4,32 --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte" + ``` + + Due to the larger vocabulary size of Llama3, we recommend quantizing the embeddings with `--embedding-quantize 4,32` to further reduce the model size. ## (Optional) Finetuning @@ -208,22 +230,35 @@ cmake --build cmake-out-android/examples/models/llama2 -j16 --config Release **2.2 Upload model, tokenizer and llama runner binary to phone** ``` -adb push /data/local/tmp/ -adb push /data/local/tmp/ -adb push cmake-out-android/examples/models/llama2/llama_main /data/local/tmp/ +adb shell mkdir -p /data/local/tmp/llama +adb push /data/local/tmp/llama/ +adb push /data/local/tmp/llama/ +adb push cmake-out-android/examples/models/llama2/llama_main /data/local/tmp/llama/ ``` **2.3 Run model** ``` -adb shell "cd /data/local/tmp && ./llama_main --model_path --tokenizer_path --prompt "Once upon a time" --seq_len 120 +adb shell "cd /data/local/tmp/llama && ./llama_main --model_path --tokenizer_path --prompt "Once upon a time" --seq_len 120 ``` -## Step 6: Build iOS and/or Android apps +## Step 6: Build Mobile apps -TODO +### iOS -### Android app +Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-demo-ios.html) to for full instructions on building the iOS LLAMA Demo App. + +### Android Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-demo-android.html) to for full instructions on building the Android LLAMA Demo App. +## Optional: Smaller models delegated to other backends +Currently we supported lowering the stories model to other backends, including, CoreML, MPS and QNN. 
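Before pushing any of the exported `.pte` files to a device with the adb steps above, a quick host-side smoke test can catch export problems early. The sketch below assumes the ExecuTorch pybindings were built on the host (for a delegated model, with the matching backend compiled in); treat the exact import path, file name, and inputs as assumptions rather than a documented flow.

```python
# Host-side smoke test for an exported llama .pte (a sketch; the file name is
# illustrative). With --use_kv_cache the exported model takes a single token plus
# the current position, mirroring the example inputs used at export time.
import torch
from executorch.extension.pybindings.portable_lib import _load_for_executorch

module = _load_for_executorch("stories110M.pte")  # hypothetical output of the export step

tokens = torch.tensor([[1]], dtype=torch.long)    # one input token
start_pos = torch.tensor([0], dtype=torch.long)   # current position in the sequence
logits = module.forward([tokens, start_pos])[0]   # forward returns a list of outputs
print("logits shape:", logits.shape)
```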
Please refer to the instruction +for each backend ([CoreML](https://pytorch.org/executorch/main/build-run-coreml.html), [MPS](https://pytorch.org/executorch/main/build-run-mps.html), [QNN](https://pytorch.org/executorch/main/build-run-qualcomm.html)) before trying to lower them. After the backend library is installed, the script to export a lowered model is + +- Lower to CoreML: `python -m examples.models.llama2.export_llama -kv --coreml -c stories110M.pt -p params.json` +- MPS: `python -m examples.models.llama2.export_llama -kv --mps -c stories110M.pt -p params.json` +- QNN: `python -m examples.models.llama2.export_llama -kv --qnn -c stories110M.pt -p params.json` + +The iOS LLAMA app supports the CoreML and MPS model and the Android LLAMA app supports the QNN model. On Android, it also allow to cross compiler the llama runner binary, push to the device and run. + # What is coming next? ## Quantization - Enabling FP16 model to leverage smaller groupsize for 4-bit quantization. @@ -238,7 +273,6 @@ Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-de - Enabling LLama2 7b and other architectures via Vulkan - Enabling performant execution of widely used quantization schemes. -TODO # Notes This example tries to reuse the Python code, with minimal modifications to make it compatible with current ExecuTorch: diff --git a/examples/models/llama2/TARGETS b/examples/models/llama2/TARGETS index c93ea6149ff..b132962963a 100644 --- a/examples/models/llama2/TARGETS +++ b/examples/models/llama2/TARGETS @@ -18,7 +18,6 @@ runtime.python_library( ], deps = [ "//caffe2:torch", - "//executorch/examples/models/llama2/custom_ops:llama_custom_ops_aot_lib", ], ) @@ -85,6 +84,7 @@ runtime.python_library( "//executorch/backends/vulkan/partitioner:vulkan_partitioner", "//executorch/examples/models:model_base", "//executorch/examples/models:models", + "//executorch/examples/models/llama2/custom_ops:custom_ops_aot_py", "//executorch/examples/portable:utils", "//executorch/exir:lib", "//executorch/sdk/etrecord:etrecord", diff --git a/examples/models/llama2/builder.py b/examples/models/llama2/builder.py index 3473391b641..cb1a82e9618 100644 --- a/examples/models/llama2/builder.py +++ b/examples/models/llama2/builder.py @@ -202,11 +202,7 @@ def source_transform( def _get_dynamic_shape(self) -> Any: dim = torch.export.Dim("token_dim", max=self.model.params.max_seq_len - 1) if self.use_kv_cache: - if self.use_sdpa_with_kv_cache: - return None - else: - # return {1: dim}, {0: dim}} TODO update xnnpack to be able to handle dynamic shape kv cache - return None + return None else: return ({1: dim},) diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py index de1e711a2c9..26f684af09c 100644 --- a/examples/models/llama2/export_llama_lib.py +++ b/examples/models/llama2/export_llama_lib.py @@ -9,6 +9,7 @@ import argparse import copy import logging +import math import os import shlex @@ -23,7 +24,11 @@ XnnpackDynamicallyQuantizedPartitioner, ) -from executorch.examples.models.llama2.llama_transformer import Transformer +from executorch.examples.models.llama2.llama_transformer import ( + KVCache, + SDPA, + Transformer, +) from executorch.exir.backend.backend_details import CompileSpec from executorch.sdk.etrecord import generate_etrecord @@ -88,6 +93,131 @@ def materialze_broadcast_of_rope_freq_cis( return module +class SDPACustom(torch.nn.Module): + def __init__( + self, + kv_cache: KVCache, + dim: int, + ): + super().__init__() + self.kv_cache = kv_cache + 
self.dim = dim + + def forward( + self, + input_pos: torch.Tensor, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + bsz, + seqlen, + mask, + ): + output = torch.ops.llama.sdpa_with_kv_cache( + q, + k, + v, + self.kv_cache.k_cache, + self.kv_cache.v_cache, + input_pos[-1].item(), + seqlen, + ) + return output.view(bsz, seqlen, self.dim) + + +def _replace_sdpa_with_custom_op(module: torch.nn.Module): + for name, child in module.named_children(): + if isinstance(child, SDPA): + setattr( + module, + name, + SDPACustom(child.kv_cache, child.dim), + ) + else: + _replace_sdpa_with_custom_op(child) + + +def replace_sdpa_with_custom_op(module: torch.nn.Module) -> torch.nn.Module: + from executorch.examples.models.llama2.custom_ops import sdpa_with_kv_cache # noqa + + _replace_sdpa_with_custom_op(module) + return module + + +class SDPASimple(torch.nn.Module): + + def __init__( + self, + kv_cache: KVCache, + dim: int, + head_dim: int, + n_rep: int, + ): + super().__init__() + self.kv_cache = kv_cache + self.dim = dim + self.head_dim = head_dim + self.n_rep = n_rep + + def forward( + self, + input_pos: torch.Tensor, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + bsz, + seqlen, + mask, + ): + q = q.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim) + k = k.transpose(1, 2) + v = v.transpose(1, 2) + + k, v = self.kv_cache.update(input_pos, k, v) + attn_mask = mask[None, None, input_pos] + + k = k.repeat_interleave(self.n_rep, dim=1) + v = v.repeat_interleave(self.n_rep, dim=1) + scale_factor = 1 / math.sqrt(q.size(-1)) + attn_weight = q @ k.transpose(-2, -1) * scale_factor + attn_weight += attn_mask + attn_weight = torch.softmax(attn_weight, dim=-1) + y = attn_weight @ v + + return y.transpose(1, 2).contiguous().view(bsz, seqlen, self.dim) + + +def replace_sdpa_with_simple_sdpa(module: torch.nn.Module): + for name, child in module.named_children(): + if isinstance(child, SDPA): + setattr( + module, + name, + SDPASimple(child.kv_cache, child.dim, child.head_dim, child.n_rep), + ) + else: + replace_sdpa_with_simple_sdpa(child) + return module + + +def replace_causal_mask(module: torch.nn.Module): + for buffer_fqn_name, buffer in module.named_buffers(): + buffer_name = buffer_fqn_name.split(".")[-1] + if buffer_name == "mask": + max_seq_len = buffer.shape[-1] + mask = torch.full( + (max_seq_len, max_seq_len), + float("-inf"), + device="cpu", + ) + + mask = torch.triu(mask, diagonal=1) + module.register_buffer(buffer_name, mask) + for _, child in module.named_children(): + replace_causal_mask(child) + return module + + def quantize( model: torch.nn.Module, qmode: str, @@ -225,6 +355,13 @@ def build_args_parser() -> argparse.ArgumentParser: parser.add_argument( "--pt2e_quantize", default=None, + choices=[ + "xnnpack_dynamic", + "xnnpack_dynamic_qc4", + "qnn_8a8w", + "qnn_16a16w", + "qnn_16a4w", + ], help="Use PT2E quantization. Comma separated options. e.g. 
xnnpack_dynamic (for per channel 8 bit weight), xnnpack_dynamic_qc4 (for per channel 4 bit weight), embedding.", ) parser.add_argument( @@ -482,6 +619,12 @@ def _prepare_for_llama_export(modelname: str, args) -> LlamaEdgeManager: if args.expand_rope_table: transforms.append(materialze_broadcast_of_rope_freq_cis) + if args.use_sdpa_with_kv_cache: + transforms.append(replace_sdpa_with_custom_op) + + if args.qnn and args.use_kv_cache: + transforms.append(replace_sdpa_with_simple_sdpa) + transforms.append(replace_causal_mask) return ( load_llama_model( checkpoint=checkpoint_path, @@ -503,26 +646,61 @@ def _export_llama(modelname, args) -> str: # noqa: C901 # export_to_edge pt2e_quant_params = _get_pt2e_quantization_params(args) quantizers = get_pt2e_quantizers(pt2e_quant_params, args) - if args.qnn: - assert ( - args.quantization_mode is None - ), "Currently qnn backend only supports QnnQuantizer via pt2e flow" + quant_dtype = None + if args.qnn and args.pt2e_quantize: try: # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.qualcomm.quantizer.quantizer` - from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer + from executorch.backends.qualcomm.quantizer.quantizer import ( + get_16a4w_qnn_ptq_config, + get_default_16bit_qnn_ptq_config, + QnnQuantizer, + QuantDtype, + ) # reset quantizers and pt2e_quant_params from xnnpack backend pt2e_quant_params = None quantizers = [] except ImportError: raise ImportError( - "Please install the Qualcomm backend follwing https://pytorch.org/executorch/main/build-run-qualcomm.html" + "Please install the Qualcomm backend follwing " + "https://pytorch.org/executorch/main/build-run-qualcomm-ai-engine-direct-backend.html" ) + backend, quant_config = args.pt2e_quantize.split("_") + assert ( + backend == "qnn" + ), f"The quantization config is for backend {backend} instead of qnn." # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`. qnn_quantizer = QnnQuantizer() # more custom quantization are supported including 16a4w etc. default to 8bit quantized custom_annotations = () + if quant_config == "8a8w": + # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`. + quant_dtype = QuantDtype.use_8a8w + pass + elif quant_config == "16a16w": + # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`. + quant_dtype = QuantDtype.use_16a16w + qnn_quantizer.add_16bit_quant_ops(qnn_quantizer.SUPPORTED_OPS) + # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`. + qnn_quantizer.set_bit16_op_quant_config(get_default_16bit_qnn_ptq_config()) + elif quant_config == "16a4w": + # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`. + quant_dtype = QuantDtype.use_16a4w + qnn_quantizer.add_16bit_quant_ops(qnn_quantizer.SUPPORTED_OPS) + # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`. + qnn_quantizer.set_bit16_op_quant_config(get_16a4w_qnn_ptq_config()) + qnn_quantizer.set_per_channel_weight_dtype( + weight_dtype_for_16bit_act="int4" + ) + else: + raise AssertionError( + f"No support for quant type {quant_config}. Support 8a8w, 16a16w and 16a4w." 
+ ) + + assert ( + args.quantization_mode is None + ), "Currently qnn backend only supports QnnQuantizer via pt2e flow" qnn_quantizer.add_custom_quant_annotations(custom_annotations) quantizers.append(qnn_quantizer) @@ -598,16 +776,14 @@ def _export_llama(modelname, args) -> str: # noqa: C901 # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `apple`. compile_specs = CoreMLBackend.generate_compile_specs( compute_precision=ct.precision(ct.precision.FLOAT16.value), - compute_unit=ct.ComputeUnit[ct.ComputeUnit.ALL.name.upper()], + # using `ComputeUnit.ALL` can increase the model load time, default to `ComputeUnit.CPU_AND_GPU` + compute_unit=ct.ComputeUnit[ct.ComputeUnit.CPU_AND_GPU.name.upper()], # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `apple` model_type=CoreMLBackend.MODEL_TYPE.MODEL, ) partitioners.append( # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `apple` CoreMLPartitioner( - skip_ops_for_coreml_delegation=[ - "aten.index_put.default", - ], compile_specs=compile_specs, ) ) @@ -639,8 +815,20 @@ def _export_llama(modelname, args) -> str: # noqa: C901 "Please install the Qualcomm backend follwing https://pytorch.org/executorch/main/build-run-qualcomm.html" ) - # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm` - backend_options = generate_htp_compiler_spec(use_fp16=False) + use_fp16 = True + skip_node_op_set = {} + if args.pt2e_quantize: + use_fp16 = False + # TODO: fix the lowering error without skipping nodes + # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`. + if quant_dtype == QuantDtype.use_8a8w: + raise NotImplementedError("8a8w for llama is still under development") + # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`. + elif quant_dtype == QuantDtype.use_16a16w: + raise NotImplementedError("16a16w for llama is still under development") + # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`. + elif quant_dtype == QuantDtype.use_16a4w: + raise NotImplementedError("16a4w for llama is still under development") partitioners.append( # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm` QnnPartitioner( @@ -648,16 +836,17 @@ def _export_llama(modelname, args) -> str: # noqa: C901 generate_qnn_executorch_compiler_spec( # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`. soc_model=QcomChipset.SM8650, # default to SM8650 - backend_options=backend_options, + # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`. 
+ backend_options=generate_htp_compiler_spec(use_fp16=use_fp16), debug=False, saver=False, ), skip_node_id_set={}, - skip_node_op_set={}, + skip_node_op_set=skip_node_op_set, ) ) # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm` - _transform(builder_exported_to_edge.export_program()) + _transform(builder_exported_to_edge.edge_manager.exported_program()) if args.generate_etrecord: if not builder_exported_to_edge.edge_manager: diff --git a/examples/models/llama2/llama_transformer.py b/examples/models/llama2/llama_transformer.py index 2a259af59cb..189280bb8a5 100644 --- a/examples/models/llama2/llama_transformer.py +++ b/examples/models/llama2/llama_transformer.py @@ -193,6 +193,44 @@ def update( return k_out, v_out +class SDPA(nn.Module): + def __init__( + self, + kv_cache: KVCache, + dim: int, + head_dim: int, + n_rep: int, + ): + super().__init__() + self.kv_cache = kv_cache + self.dim = dim + self.head_dim = head_dim + self.n_rep = n_rep + + def forward( + self, + input_pos: torch.Tensor, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + bsz, + seqlen, + mask: torch.Tensor, + ) -> torch.Tensor: + q = q.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim) + k = k.transpose(1, 2) + v = v.transpose(1, 2) + + k, v = self.kv_cache.update(input_pos, k, v) + attn_mask = mask[None, None, input_pos] + + k = k.repeat_interleave(self.n_rep, dim=1) + v = v.repeat_interleave(self.n_rep, dim=1) + y = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask, dropout_p=0.0) + + return y.transpose(1, 2).contiguous().view(bsz, seqlen, self.dim) + + class Attention(nn.Module): def __init__(self, args: ModelArgs, layer_id: int): super().__init__() @@ -213,7 +251,6 @@ def __init__(self, args: ModelArgs, layer_id: int): self.wv = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False) self.wo = nn.Linear(args.n_heads * self.head_dim, args.dim, bias=False) - self.use_sdpa_with_kv_cache_op = args.use_sdpa_with_kv_cache_op self.layer_id = layer_id causal_mask = torch.tril( @@ -234,6 +271,12 @@ def __init__(self, args: ModelArgs, layer_id: int): self.head_dim, not args.use_sdpa_with_kv_cache_op, # if we are using the custom op dont transpose the cache. 
Expect untransposed q k v ) + self.SDPA = SDPA( + kv_cache=self.kv_cache, + dim=self.dim, + head_dim=self.head_dim, + n_rep=self.n_rep, + ) def forward( self, @@ -256,41 +299,8 @@ def forward( if self.use_kv_cache: assert input_pos is not None - - if not self.use_sdpa_with_kv_cache_op: - - q = q.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim) - k = k.transpose(1, 2) - v = v.transpose(1, 2) - - k, v = self.kv_cache.update(input_pos, k, v) - mask = self.mask[None, None, input_pos] - - k = k.repeat_interleave(self.n_rep, dim=1) - v = v.repeat_interleave(self.n_rep, dim=1) - y = F.scaled_dot_product_attention( - q, k, v, attn_mask=mask, dropout_p=0.0 - ) - - y = y.transpose(1, 2).contiguous().view(bsz, seqlen, self.dim) - - y = self.wo(y) - return y - else: - from .custom_ops.sdpa_with_kv_cache import sdpa_with_kv_cache # noqa - - output = torch.ops.llama.sdpa_with_kv_cache( - q, - k, - v, - self.kv_cache.k_cache, - self.kv_cache.v_cache, - input_pos[-1].item(), - seqlen, - ) - output = output.view(bsz, seqlen, -1) - output = self.wo(output) - return output + output = self.SDPA(input_pos, q, k, v, bsz, seqlen, self.mask) + return self.wo(output) q = q.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim) k = k.transpose(1, 2) diff --git a/examples/models/llama2/model.py b/examples/models/llama2/model.py index 68882433679..461c0844435 100644 --- a/examples/models/llama2/model.py +++ b/examples/models/llama2/model.py @@ -173,11 +173,7 @@ def get_eager_model(self): def get_example_inputs(self): if self.use_kv_cache: - if self.use_sdpa_with_kv_cache_op: - return self.get_example_inputs_kvcache_sdpa() - else: - # return self.get_example_inputs_kvcache() TODO xnnpack does not handle forwarding symints, update partitioner to not partition symints - return self.get_example_inputs_kvcache_sdpa() + return self.get_example_inputs_kvcache_sdpa() else: return ( torch.tensor( @@ -195,13 +191,3 @@ def get_example_inputs_kvcache_sdpa(self): [0], dtype=torch.long ), # start_pos, what token of output are we on.) ) - - def get_example_inputs_kvcache(self): - return ( - torch.tensor( - [[1, 2, 3]], dtype=torch.long - ), # tokens, with kv cache our input token length is always just 1 token. - torch.tensor( - [0, 1, 2], dtype=torch.long - ), # start_pos, what token of output are we on. - ) diff --git a/examples/models/llama2/tests/TARGETS b/examples/models/llama2/tests/TARGETS new file mode 100644 index 00000000000..3d2aef6209f --- /dev/null +++ b/examples/models/llama2/tests/TARGETS @@ -0,0 +1,15 @@ +load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") + +oncall("executorch") + +python_unittest( + name = "test_simple_sdpa", + srcs = [ + "test_simple_sdpa.py", + ], + deps = [ + "//caffe2:torch", + "//executorch/examples/models/llama2:export_library", + "//executorch/examples/models/llama2:llama_transformer", + ], +) diff --git a/examples/models/llama2/tests/test_simple_sdpa.py b/examples/models/llama2/tests/test_simple_sdpa.py new file mode 100644 index 00000000000..e5360f0e0fa --- /dev/null +++ b/examples/models/llama2/tests/test_simple_sdpa.py @@ -0,0 +1,54 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import copy +import unittest + +import torch +from executorch.examples.models.llama2.export_llama_lib import SDPASimple +from executorch.examples.models.llama2.llama_transformer import KVCache, SDPA + + +class SDPATest(unittest.TestCase): + def test_simple_sdpa(self): + # Verify the correctness between the simple SDPA and the original SDPA module defined in llama_transformer.py + max_batch_size = 1 + max_seq_length = 128 + n_heads = 8 + head_dim = 8 + dim = 64 + n_rep = 1 + bsz = 1 + seqlen = 1 + n_local_heads = n_heads + kv_cache = KVCache( + max_batch_size=max_batch_size, + max_seq_length=max_seq_length, + n_heads=n_heads, + head_dim=head_dim, + transpose_cache=True, + ) + sdpa = SDPA( + kv_cache=copy.deepcopy(kv_cache), dim=dim, head_dim=head_dim, n_rep=n_rep + ) + input_pos = torch.tensor([0]) + query = torch.randn(1, 1, n_local_heads, head_dim) + key = torch.randn(1, 1, n_local_heads, head_dim) + value = torch.randn(1, 1, n_local_heads, head_dim) + mask = torch.randn(max_seq_length, max_seq_length) + sdpa_output = sdpa( + input_pos, query, key, value, bsz=bsz, seqlen=seqlen, mask=mask + ) + + simple_sdpa = SDPASimple( + kv_cache=copy.deepcopy(kv_cache), dim=dim, head_dim=head_dim, n_rep=n_rep + ) + simple_sdpa_output = simple_sdpa( + input_pos, query, key, value, bsz=bsz, seqlen=seqlen, mask=mask + ) + + # Compare the outputs from the two SDPA implementations + self.assertTrue(torch.allclose(sdpa_output, simple_sdpa_output)) diff --git a/examples/models/llava_encoder/README.md b/examples/models/llava_encoder/README.md index a074fa61332..76224e41454 100644 --- a/examples/models/llava_encoder/README.md +++ b/examples/models/llava_encoder/README.md @@ -5,10 +5,7 @@ In this example, we initiate the process of running multi modality through Execu ## Instructions Note that this folder does not host the pretrained LLava model. -- To have Llava available, follow the [Install instructions](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#install) in the LLava github. Follow the licence in the specific repo when using L -- Since the pytorch model version may not be updated, `cd executorch`, run `./install_requirements.sh`. -- If there is numpy compatibility issue, run `pip install bitsandbytes -I`. -- Alternatively, run `examples/models/llava_encoder/install_requirements.sh`, to replace the steps above. +- Run `examples/models/llava_encoder/install_requirements.sh`. - Run `python3 -m examples.portable.scripts.export --model_name="llava_encoder"`. The llava_encoder.pte file will be generated. - Run `./cmake-out/executor_runner --model_path ./llava_encoder.pte` to verify the exported model with ExecuTorch runtime with portable kernels. Note that the portable kernels are not performance optimized. Please refer to other examples like those in llama2 folder for optimization. diff --git a/examples/models/llava_encoder/install_requirements.sh b/examples/models/llava_encoder/install_requirements.sh index 5a4ff71285b..b8357e61e54 100644 --- a/examples/models/llava_encoder/install_requirements.sh +++ b/examples/models/llava_encoder/install_requirements.sh @@ -5,6 +5,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +set -x + # install llava from the submodule pip install --force-reinstall -e examples/third-party/LLaVA @@ -19,4 +21,4 @@ pip install bitsandbytes -I # For example, torch version required from llava is older than ExecuTorch.
# To make both work, recover ExecuTorch's original dependencies by rerunning # the install_requirements.sh. -./install_requirements.sh +bash -x ./install_requirements.sh diff --git a/examples/qualcomm/executor_runner/qnn_executor_runner.cpp b/examples/qualcomm/executor_runner/qnn_executor_runner.cpp index b2691da2ec7..8998ee634e0 100644 --- a/examples/qualcomm/executor_runner/qnn_executor_runner.cpp +++ b/examples/qualcomm/executor_runner/qnn_executor_runner.cpp @@ -202,8 +202,10 @@ int main(int argc, char** argv) { // be used by a single thread at at time, but it can be reused. // torch::executor::ETDumpGen etdump_gen = torch::executor::ETDumpGen(); + // TODO: So far we have issues with etdump_gen during load_method. Enable it + // after the issues are fixed. Result<Method> method = - program->load_method(method_name, &memory_manager, &etdump_gen); + program->load_method(method_name, &memory_manager, nullptr); ET_CHECK_MSG( method.ok(), "Loading of method %s failed with status 0x%" PRIx32, diff --git a/exir/backend/test/test_partitioner.py b/exir/backend/test/test_partitioner.py index 74974d16231..d492c291f34 100644 --- a/exir/backend/test/test_partitioner.py +++ b/exir/backend/test/test_partitioner.py @@ -26,7 +26,7 @@ from executorch.exir.backend.test.demos.rpc.executor_backend_preprocess import ( ExecutorBackend, ) -from executorch.exir.backend.utils import get_delegates +from executorch.exir.backend.utils import get_delegates, tag_constant_data from executorch.exir.dialects._ops import ops as exir_ops @@ -523,3 +523,85 @@ def partition( "constant data node (b_const) is tagged with (tag0) but has user (aten_sub_tensor) which has tag (None)", str(error.exception), ) + + def test_not_delegate_mutable_buffers(self) -> None: + """ + A test case to check the mutated buffer is not delegated. We'll need to add a test case + to consider when the delegate can consume the mutable buffer. + """ + + class MutableStateModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.register_buffer("my_state", torch.zeros(1)) + + def forward(self, x): + y = x + self.my_state + self.my_state.add_(1) + return y + + edge = exir.to_edge( + torch.export.export( + MutableStateModule(), + (torch.zeros(1),), + ) + ) + self.assertGreater( + len(edge.exported_program().graph_signature.buffers_to_mutate), + 0, + "The test case should have at least one mutable buffer", + ) + + class PartitionerTagData(Partitioner): + def __init__(self): + super().__init__() + self.delegation_spec = DelegationSpec( + ExecutorBackend.__name__, + [CompileSpec(key, value) for key, value in self.spec.items()], + ) + + def partition( + self, edge_exported_program: ExportedProgram + ) -> PartitionResult: + partition_tags = {} + for node in edge_exported_program.graph.nodes: + if node.op == "call_function" and node.target in [ + exir_ops.edge.aten.add.Tensor + ]: + delegation_tag = "tag0" + node.meta["delegation_tag"] = delegation_tag + partition_tags[delegation_tag] = self.delegation_spec + tag_constant_data(edge_exported_program) + return PartitionResult( + tagged_exported_program=edge_exported_program, + partition_tags=partition_tags, + ) + + # Check the edge program's initial buffers_to_mutate + mutate_op = "aten_add_tensor_1" + self.assertEqual( + edge.exported_program().graph_signature.buffers_to_mutate[mutate_op], + "my_state", + ) + edge = edge.to_backend(PartitionerTagData()) + # After to_backend, add is delegated and is no longer in buffers_to_mutate.
+ self.assertNotIn( + mutate_op, + edge.exported_program().graph_signature.buffers_to_mutate, + ) + + mutate_op = "getitem_1" + # Ensure the mutated buffer is not delegated, and the new mutate node is getitem (from call_delegate) + self.assertEqual( + edge.exported_program().graph_signature.buffers_to_mutate[mutate_op], + "my_state", + ) + # Check the copy_ node is inserted + edge = edge.to_executorch() + copy_node = [ + node + for node in edge.exported_program().graph.nodes + if node.op == "call_function" + and node.target == torch.ops.aten.copy_.default + ] + self.assertEqual(len(copy_node), 1) diff --git a/exir/backend/utils.py b/exir/backend/utils.py index f4c1c28f8bd..b299ba4be8a 100644 --- a/exir/backend/utils.py +++ b/exir/backend/utils.py @@ -508,6 +508,20 @@ def tag_constant_data(edge_program: ExportedProgram) -> None: subgraph. Throw error when const/param/buffers is used across different partitions. That is the underlying data will be owned by multiple delegates. """ + mutated_buffer = set() + for node in edge_program.graph.nodes: + if node.op == "placeholder" and ( + is_param(edge_program, node) + or is_buffer(edge_program, node) + or is_lifted_tensor_constant(edge_program, node) + ): + for node_user in node.users: + if node_user.name in edge_program.graph_signature.buffers_to_mutate: + logging.info( + "The buffer node is a mutated buffer node, which is not constant." + ) + mutated_buffer.add(node) + for node in edge_program.graph.nodes: # go through const/param/buffer nodes, if all users of const/param/buffer nodes are partitioned then partition if node.op == "placeholder" and ( @@ -515,20 +529,21 @@ def tag_constant_data(edge_program: ExportedProgram) -> None: or is_buffer(edge_program, node) or is_lifted_tensor_constant(edge_program, node) ): - user_tags = set() - for user in node.users: - user_tag = user.meta.get("delegation_tag", None) - if user_tag is not None: - user_tags.add(user_tag) - if len(user_tags) > 1: - logging.info( - f"The data node is used across multiple partitions, including {user_tags}. " - "If the data is too large and it's not preferred to copy, please tag the " - "constant node like node.['no_copy'] = True and they won't be copied." - ) - # tag the data node with the same tag as the last user - if len(user_tags) > 0: - node.meta["delegation_tag"] = user_tags.pop() + if node not in mutated_buffer: + user_tags = set() + for user in node.users: + user_tag = user.meta.get("delegation_tag", None) + if user_tag is not None: + user_tags.add(user_tag) + if len(user_tags) > 1: + logging.info( + f"The data node is used across multiple partitions, including {user_tags}. " + "If the data is too large and it's not preferred to copy, please tag the " + "constant node like node.['no_copy'] = True and they won't be copied." 
+ ) + # tag the data node with the same tag as the last user + if len(user_tags) > 0: + node.meta["delegation_tag"] = user_tags.pop() # TODO - style: use templated types diff --git a/exir/serde/export_serialize.py b/exir/serde/export_serialize.py index 799a1dbe78f..fef2b2411fa 100644 --- a/exir/serde/export_serialize.py +++ b/exir/serde/export_serialize.py @@ -1190,13 +1190,17 @@ def deserialize_tensor_meta( ), ) - def deserialize_graph_output(self, output) -> torch.fx.Node: + def deserialize_graph_output(self, output) -> Optional[Union[torch.fx.Node, int]]: if output.type == "as_tensor": return self.serialized_name_to_node[output.as_tensor.name] elif output.type == "as_sym_int": return self.serialized_name_to_node[output.as_sym_int.as_name] elif output.type == "as_sym_bool": return self.serialized_name_to_node[output.as_sym_bool.as_name] + elif output.type == "as_int": + return output.as_int + elif output.type == "as_none": + return None else: raise SerializeError(f"Unable to deserialize output node {output}") @@ -1249,7 +1253,8 @@ def deserialize_graph(self, serialized_graph: Graph) -> torch.fx.Graph: output_node.meta["val"] = output_node.args[0].meta["val"] else: output_node.meta["val"] = tuple( - arg.meta["val"] for arg in output_node.args[0] + arg.meta["val"] if isinstance(arg, torch.fx.Node) else arg + for arg in output_node.args[0] ) return self.graph diff --git a/exir/serde/serialize.py b/exir/serde/serialize.py index 5eb28b830ce..dea86155f21 100644 --- a/exir/serde/serialize.py +++ b/exir/serde/serialize.py @@ -35,7 +35,6 @@ LoweredBackendModule as SerdeLoweredBackendModule, ) from torch._export.serde.schema import SchemaVersion -from torch._export.serde.serialize import SerializeError from torch._export.serde.union import _Union from torch._export.verifier import load_verifier from torch.fx.experimental import symbolic_shapes @@ -479,23 +478,22 @@ def deserialize_metadata(self, metadata: Dict[str, str]) -> Dict[str, Any]: return res - def deserialize_graph_output(self, output: schema.Argument) -> torch.fx.Node: - if isinstance(output.value, schema.TensorArgument): - if output.value.name in self.state_dict: # TODO(T157676982) - val = self.state_dict[output.value.name] - setattr(self.module, output.value.name, val) - node = self.graph.create_node( - "get_attr", - output.value.name, - name=output.value.name, - ) - node.meta = {"val": ""} - return node - return self.serialized_name_to_node[output.value.name] - elif isinstance(output.value, (schema.SymIntArgument, schema.SymBoolArgument)): - return self.serialized_name_to_node[output.value.as_name] - else: - raise SerializeError(f"Unable to deserialize output node {output}") + def deserialize_graph_output( + self, output: schema.Argument + ) -> Optional[Union[torch.fx.Node, int]]: + if ( + output.type == "as_tensor" and output.value.name in self.state_dict + ): # TODO(T157676982) + val = self.state_dict[output.value.name] + setattr(self.module, output.value.name, val) + node = self.graph.create_node( + "get_attr", + output.value.name, + name=output.value.name, + ) + node.meta = {"val": ""} + return node + return super().deserialize_graph_output(output) # pyre-ignore def deserialize_alloc_inputs(self, serialized_inputs: List[schema.NamedArgument]): diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt index c69cea0323e..27bae6a9730 100644 --- a/extension/android/CMakeLists.txt +++ b/extension/android/CMakeLists.txt @@ -58,13 +58,14 @@ if(EXECUTORCH_BUILD_LLAMA_JNI) add_library(llama_runner STATIC IMPORTED) 
set_property(TARGET llama_runner PROPERTY IMPORTED_LOCATION ${LLAMA_RUNNER_PATH}) - set(CUSTOM_OPS_LIB_PATH ${CMAKE_CURRENT_BINARY_DIR}/../../examples/models/llama2/custom_ops/libcustom_ops_lib.a) - add_library(custom_ops_lib STATIC IMPORTED) - set_property(TARGET custom_ops_lib PROPERTY IMPORTED_LOCATION ${CUSTOM_OPS_LIB_PATH}) - set(CUSTOM_OPS_PATH ${CMAKE_CURRENT_BINARY_DIR}/../../examples/models/llama2/custom_ops/libcustom_ops.a) add_library(custom_ops STATIC IMPORTED) set_property(TARGET custom_ops PROPERTY IMPORTED_LOCATION ${CUSTOM_OPS_PATH}) + target_link_options_shared_lib(custom_ops) + + set(CUSTOM_OPS_LIB_PATH ${CMAKE_CURRENT_BINARY_DIR}/../../examples/models/llama2/custom_ops/libcustom_ops_lib.a) + add_library(custom_ops_lib STATIC IMPORTED) + set_property(TARGET custom_ops_lib PROPERTY IMPORTED_LOCATION ${CUSTOM_OPS_LIB_PATH}) target_link_options_shared_lib(custom_ops_lib) if(TARGET pthreadpool) diff --git a/extension/aot_util/README.md b/extension/aot_util/README.md deleted file mode 100644 index dbb3866bec3..00000000000 --- a/extension/aot_util/README.md +++ /dev/null @@ -1,9 +0,0 @@ -# AOT Util - -Ahead-of-time (AOT) utility library. Contains native code used by the AOT lowering and delegation logic. Note -that this library should build independently of the runtime code, and as such, should not have dependencies -on runtime targets. - -This library is intended to be built and distributed as part of the Python pip package, such that it can be -loaded by AOT Python code. - diff --git a/extension/pybindings/TARGETS b/extension/pybindings/TARGETS index 0b4e9ef3049..9dee0e208b1 100644 --- a/extension/pybindings/TARGETS +++ b/extension/pybindings/TARGETS @@ -30,9 +30,9 @@ runtime.genrule( srcs = [":pybinding_types"], outs = { "aten_lib.pyi": ["aten_lib.pyi"], - "portable_lib.pyi": ["portable_lib.pyi"], + "_portable_lib.pyi": ["_portable_lib.pyi"], }, - cmd = "cp $(location :pybinding_types)/* $OUT/portable_lib.pyi && cp $(location :pybinding_types)/* $OUT/aten_lib.pyi", + cmd = "cp $(location :pybinding_types)/* $OUT/_portable_lib.pyi && cp $(location :pybinding_types)/* $OUT/aten_lib.pyi", visibility = ["//executorch/extension/pybindings/..."], ) @@ -46,8 +46,9 @@ executorch_pybindings( executorch_pybindings( compiler_flags = ["-std=c++17"], cppdeps = PORTABLE_MODULE_DEPS + MODELS_ATEN_OPS_LEAN_MODE_GENERATED_LIB, - python_module_name = "portable_lib", - types = ["//executorch/extension/pybindings:pybindings_types_gen[portable_lib.pyi]"], + # Give this an underscore prefix because it has a pure python wrapper. + python_module_name = "_portable_lib", + types = ["//executorch/extension/pybindings:pybindings_types_gen[_portable_lib.pyi]"], visibility = ["PUBLIC"], ) @@ -58,3 +59,10 @@ executorch_pybindings( types = ["//executorch/extension/pybindings:pybindings_types_gen[aten_lib.pyi]"], visibility = ["PUBLIC"], ) + +runtime.python_library( + name = "portable_lib", + srcs = ["portable_lib.py"], + visibility = ["@EXECUTORCH_CLIENTS"], + deps = [":_portable_lib"], +) diff --git a/extension/pybindings/portable_lib.py b/extension/pybindings/portable_lib.py new file mode 100644 index 00000000000..b9ed089f918 --- /dev/null +++ b/extension/pybindings/portable_lib.py @@ -0,0 +1,34 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +# pyre-strict + +# When installed as a pip wheel, we must import `torch` before trying to import +# the pybindings shared library extension. This will load libtorch.so and +# related libs, ensuring that the pybindings lib can resolve those runtime +# dependencies. +import torch as _torch + +# Let users import everything from the C++ _portable_lib extension as if this +# python file defined them. Although we could import these dynamically, it +# wouldn't preserve the static type annotations. +from executorch.extension.pybindings._portable_lib import ( # noqa: F401 + # Disable "imported but unused" (F401) checks. + _create_profile_block, # noqa: F401 + _dump_profile_results, # noqa: F401 + _get_operator_names, # noqa: F401 + _load_bundled_program_from_buffer, # noqa: F401 + _load_for_executorch, # noqa: F401 + _load_for_executorch_from_buffer, # noqa: F401 + _load_for_executorch_from_bundled_program, # noqa: F401 + _reset_profile_results, # noqa: F401 + BundledModule, # noqa: F401 + ExecuTorchModule, # noqa: F401 +) + +# Clean up so that `dir(portable_lib)` is the same as `dir(_portable_lib)` +# (apart from some __dunder__ names). +del _torch diff --git a/extension/pybindings/pybindings.pyi b/extension/pybindings/pybindings.pyi index 0392b5b7cef..8f2b43cac9e 100644 --- a/extension/pybindings/pybindings.pyi +++ b/extension/pybindings/pybindings.pyi @@ -7,9 +7,12 @@ # pyre-strict from typing import Any, Dict, List, Sequence, Tuple -class ExecutorchModule: +class ExecuTorchModule: + # pyre-ignore[2, 3]: "Any" in parameter and return type annotations. def __call__(self, inputs: Any) -> List[Any]: ... + # pyre-ignore[2, 3]: "Any" in parameter and return type annotations. def run_method(self, method_name: str, inputs: Sequence[Any]) -> List[Any]: ... + # pyre-ignore[2, 3]: "Any" in parameter and return type annotations. def forward(self, inputs: Sequence[Any]) -> List[Any]: ... # Bundled program methods. def load_bundled_input( @@ -30,16 +33,17 @@ class BundledModule: ... def _load_for_executorch( path: str, enable_etdump: bool = False -) -> ExecutorchModule: ... +) -> ExecuTorchModule: ... def _load_for_executorch_from_buffer( buffer: bytes, enable_etdump: bool = False -) -> ExecutorchModule: ... +) -> ExecuTorchModule: ... def _load_for_executorch_from_bundled_program( module: BundledModule, enable_etdump: bool = False -) -> ExecutorchModule: ... +) -> ExecuTorchModule: ... def _load_bundled_program_from_buffer( buffer: bytes, non_const_pool_size: int = ... ) -> BundledModule: ... +def _get_operator_names() -> List[str]: ... def _create_profile_block(name: str) -> None: ... def _dump_profile_results() -> bytes: ... def _reset_profile_results() -> None: ... diff --git a/install_requirements.sh b/install_requirements.sh index c96aefc5628..7918ea7e1b8 100755 --- a/install_requirements.sh +++ b/install_requirements.sh @@ -9,7 +9,7 @@ # Dependencies are defined in .pyproject.toml if [[ -z $PYTHON_EXECUTABLE ]]; then - if [[ -z $CONDA_DEFAULT_ENV ]] || [[ $CONDA_DEFAULT_ENV == "base" ]]; + if [[ -z $CONDA_DEFAULT_ENV ]] || [[ $CONDA_DEFAULT_ENV == "base" ]] || [[ ! -x "$(command -v python)" ]]; then PYTHON_EXECUTABLE=python3 else @@ -17,6 +17,14 @@ then fi fi +if [[ "$PYTHON_EXECUTABLE" == "python" ]]; +then + PIP_EXECUTABLE=pip +else + PIP_EXECUTABLE=pip3 +fi + + # Parse options. EXECUTORCH_BUILD_PYBIND=OFF CMAKE_ARGS="" @@ -49,24 +57,20 @@ done # Since ExecuTorch often uses main-branch features of pytorch, only the nightly # pip versions will have the required features. 
The NIGHTLY_VERSION value should # agree with the third-party/pytorch pinned submodule commit. -# -# NOTE: If a newly-fetched version of the executorch repo changes the value of -# NIGHTLY_VERSION, you should re-run this script to install the necessary -# package versions. -NIGHTLY_VERSION=dev20240324 # The pip repository that hosts nightly torch packages. -TORCH_NIGHTLY_URL="https://download.pytorch.org/whl/nightly/cpu" +TORCH_URL="https://download.pytorch.org/whl/test/cpu" # pip packages needed by exir. EXIR_REQUIREMENTS=( - torch=="2.4.0.${NIGHTLY_VERSION}" - torchvision=="0.19.0.${NIGHTLY_VERSION}" # For testing. + torch=="2.3.0" + torchvision=="0.18.0" ) # pip packages needed for development. DEVEL_REQUIREMENTS=( cmake # For building binary targets. + pyyaml # Imported by the kernel codegen tools. setuptools # For building the pip package. tomli # Imported by extract_sources.py when using python < 3.11. wheel # For building the pip package archive. @@ -77,7 +81,7 @@ DEVEL_REQUIREMENTS=( # TODO(dbort): Make each example publish its own requirements.txt EXAMPLES_REQUIREMENTS=( timm==0.6.13 - torchaudio=="2.2.0.${NIGHTLY_VERSION}" + torchaudio=="2.3.0" torchsr==1.0.4 transformers==4.38.2 ) @@ -92,13 +96,16 @@ REQUIREMENTS_TO_INSTALL=( # Install the requirements. `--extra-index-url` tells pip to look for package # versions on the provided URL if they aren't available on the default URL. -pip install --extra-index-url "${TORCH_NIGHTLY_URL}" \ +$PIP_EXECUTABLE install --extra-index-url "${TORCH_URL}" \ "${REQUIREMENTS_TO_INSTALL[@]}" # # Install executorch pip package. This also makes `flatc` available on the path. +# The --extra-index-url may be necessary if pyproject.toml has a dependency on a +# pre-release or nightly version of a torch package. # EXECUTORCH_BUILD_PYBIND="${EXECUTORCH_BUILD_PYBIND}" \ CMAKE_ARGS="${CMAKE_ARGS}" \ - pip install . --no-build-isolation -v + $PIP_EXECUTABLE install . 
--no-build-isolation -v \ + --extra-index-url "${TORCH_URL}" diff --git a/kernels/portable/cpu/op_add.cpp b/kernels/portable/cpu/op_add.cpp index a532cfc7ba6..5bbba7b39f2 100644 --- a/kernels/portable/cpu/op_add.cpp +++ b/kernels/portable/cpu/op_add.cpp @@ -41,10 +41,12 @@ Tensor& add_out( ET_KERNEL_CHECK( ctx, check_alpha_type(alpha_type, common_type), InvalidArgument, out); - ET_SWITCH_REALHB_TYPES(a_type, ctx, "add.out", CTYPE_A, [&]() { - ET_SWITCH_REALHB_TYPES(b_type, ctx, "add.out", CTYPE_B, [&]() { - ET_SWITCH_REALB_TYPES(common_type, ctx, "add.out", CTYPE_IN, [&]() { - ET_SWITCH_REALHB_TYPES(out_type, ctx, "add.out", CTYPE_OUT, [&]() { + constexpr auto name = "add.out"; + + ET_SWITCH_REALHB_TYPES(a_type, ctx, name, CTYPE_A, [&]() { + ET_SWITCH_REALHB_TYPES(b_type, ctx, name, CTYPE_B, [&]() { + ET_SWITCH_REALB_TYPES(common_type, ctx, name, CTYPE_IN, [&]() { + ET_SWITCH_REALHB_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() { CTYPE_IN alpha_val; utils::extract_scalar(alpha, &alpha_val); @@ -99,29 +101,29 @@ Tensor& add_scalar_out( common_type = ScalarType::Float; } - ET_SWITCH_REALHB_TYPES(a_type, ctx, "add.Scalar_out", CTYPE_A, [&]() { - ET_SWITCH_SCALAR_OBJ_TYPES(b_type, ctx, "add.Scalar_out", CTYPE_B, [&]() { - ET_SWITCH_REALB_TYPES( - common_type, ctx, "add.Scalar_out", CTYPE_IN, [&]() { - ET_SWITCH_REALHB_TYPES( - out_type, ctx, "add.Scalar_out", CTYPE_OUT, [&]() { - CTYPE_B b_val; - utils::extract_scalar(b, &b_val); - CTYPE_IN b_casted = static_cast(b_val); - CTYPE_IN alpha_val; - utils::extract_scalar(alpha, &alpha_val); - - apply_unary_map_fn( - [b_casted, alpha_val](const CTYPE_A val_a) { - CTYPE_IN a_casted = static_cast(val_a); - CTYPE_IN value = a_casted + alpha_val * b_casted; - return static_cast(value); - }, - a.const_data_ptr(), - out.mutable_data_ptr(), - out.numel()); - }); - }); + constexpr auto name = "add.Scalar_out"; + + ET_SWITCH_REALHB_TYPES(a_type, ctx, name, CTYPE_A, [&]() { + ET_SWITCH_SCALAR_OBJ_TYPES(b_type, ctx, name, CTYPE_B, [&]() { + ET_SWITCH_REALB_TYPES(common_type, ctx, name, CTYPE_IN, [&]() { + ET_SWITCH_REALHB_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() { + CTYPE_B b_val; + utils::extract_scalar(b, &b_val); + CTYPE_IN b_casted = static_cast(b_val); + CTYPE_IN alpha_val; + utils::extract_scalar(alpha, &alpha_val); + + apply_unary_map_fn( + [b_casted, alpha_val](const CTYPE_A val_a) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN value = a_casted + alpha_val * b_casted; + return static_cast(value); + }, + a.const_data_ptr(), + out.mutable_data_ptr(), + out.numel()); + }); + }); }); }); diff --git a/kernels/portable/cpu/op_isinf.cpp b/kernels/portable/cpu/op_isinf.cpp index da8599d5fac..0ac1fa11955 100644 --- a/kernels/portable/cpu/op_isinf.cpp +++ b/kernels/portable/cpu/op_isinf.cpp @@ -14,8 +14,18 @@ namespace torch { namespace executor { namespace native { +namespace { +// Passing std::isinf directly to unary_ufunc_realhb_to_bool can cause "error: +// cannot resolve overloaded function ‘isinf’ based on conversion to type +// ‘torch::executor::FunctionRef’" in some compilation +// environments. 
+bool isinf_wrapper(double num) { + return std::isinf(num); +} +} // namespace + Tensor& isinf_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) { - return internal::unary_ufunc_realhb_to_bool(std::isinf, ctx, in, out); + return internal::unary_ufunc_realhb_to_bool(isinf_wrapper, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_isnan.cpp b/kernels/portable/cpu/op_isnan.cpp index 2a82b127d3e..d9ef038b73a 100644 --- a/kernels/portable/cpu/op_isnan.cpp +++ b/kernels/portable/cpu/op_isnan.cpp @@ -14,8 +14,18 @@ namespace torch { namespace executor { namespace native { +namespace { +// Passing std::isnan directly to unary_ufunc_realhb_to_bool can cause "error: +// cannot resolve overloaded function ‘isnan’ based on conversion to type +// ‘torch::executor::FunctionRef’" in some compilation +// environments. +bool isnan_wrapper(double num) { + return std::isnan(num); +} +} // namespace + Tensor& isnan_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) { - return internal::unary_ufunc_realhb_to_bool(std::isnan, ctx, in, out); + return internal::unary_ufunc_realhb_to_bool(isnan_wrapper, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_sub.cpp b/kernels/portable/cpu/op_sub.cpp index e8a0fc919d7..2df71a6d6b0 100644 --- a/kernels/portable/cpu/op_sub.cpp +++ b/kernels/portable/cpu/op_sub.cpp @@ -29,7 +29,7 @@ Tensor& sub_out( InvalidArgument, out); - ET_KERNEL_CHECK(ctx, tensor_is_realhb_type(out), InvalidArgument, out); + ET_KERNEL_CHECK(ctx, tensor_is_realh_type(out), InvalidArgument, out); ScalarType a_type = a.scalar_type(); ScalarType b_type = b.scalar_type(); @@ -37,15 +37,16 @@ Tensor& sub_out( ScalarType common_type = promoteTypes(a_type, b_type, /*half_to_float*/ true); ScalarType out_type = out.scalar_type(); + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); ET_KERNEL_CHECK( ctx, check_alpha_type(alpha_type, common_type), InvalidArgument, out); - ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); - ET_KERNEL_CHECK(ctx, tensor_is_realh_type(out), InvalidArgument, out); - ET_SWITCH_REALH_TYPES(a_type, ctx, "sub.out", CTYPE_A, [&]() { - ET_SWITCH_REALH_TYPES(b_type, ctx, "sub.out", CTYPE_B, [&]() { - ET_SWITCH_REAL_TYPES(common_type, ctx, "sub.out", CTYPE_IN, [&]() { - ET_SWITCH_REALH_TYPES(out_type, ctx, "sub.out", CTYPE_OUT, [&]() { + constexpr auto name = "sub.out"; + + ET_SWITCH_REALH_TYPES(a_type, ctx, name, CTYPE_A, [&]() { + ET_SWITCH_REALH_TYPES(b_type, ctx, name, CTYPE_B, [&]() { + ET_SWITCH_REAL_TYPES(common_type, ctx, name, CTYPE_IN, [&]() { + ET_SWITCH_REALH_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() { CTYPE_IN alpha_val; utils::extract_scalar(alpha, &alpha_val); @@ -84,11 +85,11 @@ Tensor& sub_scalar_out( out, "Failed to resize output tensor."); - ET_KERNEL_CHECK(ctx, tensor_is_realhb_type(out), InvalidArgument, out); + ET_KERNEL_CHECK(ctx, tensor_is_realh_type(out), InvalidArgument, out); ScalarType a_type = a.scalar_type(); ScalarType b_type = utils::get_scalar_dtype(b); - ScalarType alpha_type = utils::get_scalar_dtype(b); + ScalarType alpha_type = utils::get_scalar_dtype(alpha); ScalarType common_type = utils::promote_type_with_scalar(a_type, b, /*half_to_float*/ false); ScalarType out_type = out.scalar_type(); @@ -100,31 +101,30 @@ Tensor& sub_scalar_out( common_type = ScalarType::Float; } - ET_SWITCH_REALH_TYPES(a_type, ctx, "sub.Scalar_out", CTYPE_A, [&]() { - ET_SWITCH_SCALAR_OBJ_REAL_TYPES( - b_type, ctx, "sub.Scalar_out", CTYPE_B, [&]() { - 
ET_SWITCH_REAL_TYPES( - common_type, ctx, "sub.Scalar_out", CTYPE_IN, [&]() { - ET_SWITCH_REALH_TYPES( - out_type, ctx, "sub.Scalar_out", CTYPE_OUT, [&]() { - CTYPE_B b_val; - utils::extract_scalar(b, &b_val); - CTYPE_IN b_casted = static_cast(b_val); - CTYPE_IN alpha_val; - utils::extract_scalar(alpha, &alpha_val); - - apply_unary_map_fn( - [b_casted, alpha_val](const CTYPE_A val_a) { - CTYPE_IN a_casted = static_cast(val_a); - CTYPE_IN value = a_casted - alpha_val * b_casted; - return static_cast(value); - }, - a.const_data_ptr(), - out.mutable_data_ptr(), - out.numel()); - }); - }); + constexpr auto name = "sub.Scalar_out"; + + ET_SWITCH_REALH_TYPES(a_type, ctx, name, CTYPE_A, [&]() { + ET_SWITCH_SCALAR_OBJ_REAL_TYPES(b_type, ctx, name, CTYPE_B, [&]() { + ET_SWITCH_REAL_TYPES(common_type, ctx, name, CTYPE_IN, [&]() { + ET_SWITCH_REALH_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() { + CTYPE_B b_val; + utils::extract_scalar(b, &b_val); + CTYPE_IN b_casted = static_cast(b_val); + CTYPE_IN alpha_val; + utils::extract_scalar(alpha, &alpha_val); + + apply_unary_map_fn( + [b_casted, alpha_val](const CTYPE_A val_a) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN value = a_casted - alpha_val * b_casted; + return static_cast(value); + }, + a.const_data_ptr(), + out.mutable_data_ptr(), + out.numel()); }); + }); + }); }); return out; diff --git a/pyproject.toml b/pyproject.toml index ddd7bb0914c..10b65275165 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,11 +1,51 @@ [build-system] -requires = ["setuptools", "wheel"] +requires = [ + "cmake", # For building binary targets in the wheel. + "pyyaml", # Imported by the kernel codegen tools. + "setuptools", # For building the pip package contents. + "tomli", # Imported by extract_sources.py when using python < 3.11. + "wheel", # For building the pip package archive. + "zstd", # Imported by resolve_buck.py. +] build-backend = "setuptools.build_meta" [project] name = "executorch" -version = "0.1.0" -# Python dependencies required for development +dynamic = [ + # setup.py will set the version. + 'version', +] +description = "On-device AI across mobile, embedded and edge for PyTorch" +readme = "README-wheel.md" +authors = [ + {name="PyTorch Team", email="packages@pytorch.org"}, +] +license = {file = "LICENSE"} +keywords = ["pytorch", "machine learning"] +# PyPI package information. +classifiers = [ + # How mature is this project? Common values are + # 3 - Alpha + # 4 - Beta + # 5 - Production/Stable + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "Intended Audience :: Education", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: BSD License", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Mathematics", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Software Development", + "Topic :: Software Development :: Libraries", + "Topic :: Software Development :: Libraries :: Python Modules", + "Programming Language :: C++", + "Programming Language :: Python :: 3", + # Update this as we support more versions of python. + "Programming Language :: Python :: 3.10", +] + +# Python dependencies required for use. dependencies=[ "expecttest", "flatbuffers", @@ -20,8 +60,16 @@ dependencies=[ "ruamel.yaml", "sympy", "tabulate", + "torch==2.3", ] +[project.urls] +# The keys are arbitrary but will be visible on PyPI. 
+Homepage = "https://pytorch.org/executorch/" +Repository = "https://github.com/pytorch/executorch" +Issues = "https://github.com/pytorch/executorch/issues" +Changelog = "https://github.com/pytorch/executorch/releases" + # Tell setuptools to generate commandline wrappers for tools that we install # under data/bin in the pip package. This will put these commands on the user's # path. diff --git a/runtime/executor/targets.bzl b/runtime/executor/targets.bzl index b3db52720c7..46f997a80ad 100644 --- a/runtime/executor/targets.bzl +++ b/runtime/executor/targets.bzl @@ -44,9 +44,20 @@ def define_common_targets(): for aten_mode in (True, False): aten_suffix = "_aten" if aten_mode else "" - runtime.cxx_library( name = "program" + aten_suffix, + exported_deps = [ + ":program_no_prim_ops" + aten_suffix, + "//executorch/kernels/prim_ops:prim_ops_registry" + aten_suffix, + ], + visibility = [ + "//executorch/runtime/executor/...", + "@EXECUTORCH_CLIENTS", + ], + ) + + runtime.cxx_library( + name = "program_no_prim_ops" + aten_suffix, srcs = [ "method.cpp", "method_meta.cpp", @@ -54,34 +65,28 @@ def define_common_targets(): "tensor_parser_exec_aten.cpp", "tensor_parser{}.cpp".format(aten_suffix if aten_mode else "_portable"), ], - headers = [ - "tensor_parser.h", - ], exported_headers = [ "method.h", "method_meta.h", "program.h", + "tensor_parser.h", ], - deps = [ - "//executorch/kernels/prim_ops:prim_ops_registry" + aten_suffix, + preprocessor_flags = _program_preprocessor_flags(), + exported_deps = [ + ":memory_manager", "//executorch/runtime/backend:interface", - "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix, "//executorch/runtime/core:core", + "//executorch/runtime/core:evalue" + aten_suffix, + "//executorch/runtime/core:event_tracer" + aten_suffix, + "//executorch/runtime/core/exec_aten:lib" + aten_suffix, + "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix, "//executorch/runtime/kernel:kernel_runtime_context" + aten_suffix, "//executorch/runtime/kernel:operator_registry", "//executorch/runtime/platform:platform", "//executorch/schema:extended_header", - "//executorch/schema:program", - ":memory_manager", ], - preprocessor_flags = _program_preprocessor_flags(), - exported_deps = [ - "//executorch/runtime/core/exec_aten:lib" + aten_suffix, - "//executorch/runtime/core:core", - "//executorch/runtime/core:evalue" + aten_suffix, - "//executorch/runtime/platform:platform", - "//executorch/runtime/core:event_tracer" + aten_suffix, - ":memory_manager", + deps = [ + "//executorch/schema:program", ], visibility = [ "//executorch/runtime/executor/...", diff --git a/sdk/bundled_program/config.py b/sdk/bundled_program/config.py index d1ca4c10e36..3bfbe7bc69c 100644 --- a/sdk/bundled_program/config.py +++ b/sdk/bundled_program/config.py @@ -62,7 +62,7 @@ def __init__( input: All inputs required by eager_model with specific inference method for one-time execution. It is worth mentioning that, although both bundled program and ET runtime apis support setting input - other than torch.tensor type, only the input in torch.tensor type will be actually updated in + other than `torch.tensor` type, only the input in `torch.tensor` type will be actually updated in the method, and the rest of the inputs will just do a sanity check if they match the default value in method. expected_output: Expected output of given input for verification. It can be None if user only wants to use the test case for profiling. 
diff --git a/sdk/debug_format/et_schema.py b/sdk/debug_format/et_schema.py index af95bc7f03a..9a6af4edba9 100644 --- a/sdk/debug_format/et_schema.py +++ b/sdk/debug_format/et_schema.py @@ -260,7 +260,12 @@ def gen_operator_graph( assert len(args) == 1 # Args of op=='output' is a wrapped list of return nodes ([ret_1, ret_2, ...], ) in_nodes = [ - nodes[FXOperatorGraph._get_node_name(ret)] for ret in args[0] + ( + nodes[FXOperatorGraph._get_node_name(ret)] + if ret is not None + else [] + ) + for ret in args[0] ] node = ValueNode( name, diff --git a/sdk/inspector/_inspector.py b/sdk/inspector/_inspector.py index 91492643c89..45fe272cbb2 100644 --- a/sdk/inspector/_inspector.py +++ b/sdk/inspector/_inspector.py @@ -312,6 +312,9 @@ class Event: _instruction_id: Optional[int] = None _delegate_metadata_parser: Optional[Callable[[List[str]], Dict[str, Any]]] = None + _delegate_time_scale_converter: Optional[ + Callable[[Union[int, str], Union[int, float]], Union[int, float]] + ] = None @cached_property def delegate_debug_metadatas(self) -> Union[List[str], Dict[str, Any]]: @@ -391,6 +394,9 @@ def _gen_from_inference_events( delegate_metadata_parser: Optional[ Callable[[List[str]], Dict[str, Any]] ] = None, + delegate_time_scale_converter: Optional[ + Callable[[Union[int, str], Union[int, float]], Union[int, float]] + ] = None, ) -> "Event": """ Given an EventSignature and a list of Events with that signature, @@ -411,6 +417,7 @@ def _gen_from_inference_events( name="", _instruction_id=signature.instruction_id, _delegate_metadata_parser=delegate_metadata_parser, + _delegate_time_scale_converter=delegate_time_scale_converter, ) # Populate fields from profile events @@ -476,14 +483,35 @@ def _populate_profiling_related_fields( f"Expected exactly one profile event per InstructionEvent when generating Inspector Event, but got {len(profile_events)}" ) + profile_event = profile_events[0] + # Scale factor should only be applied to non-delegated ops - scale_factor_updated = 1 if ret_event.is_delegated_op else scale_factor + if ( + ret_event.is_delegated_op + and ret_event._delegate_time_scale_converter is not None + ): + scaled_time = ret_event._delegate_time_scale_converter( + ret_event.name, + profile_event.end_time, + # pyre-ignore + ) - ret_event._delegate_time_scale_converter( + ret_event.name, profile_event.start_time + ) + # If it's not a delegated op then we can just use the raw time values + # and then scale them according to the scale factor that was passed in. + elif not ret_event.is_delegated_op: + scaled_time = ( + float(profile_event.end_time - profile_event.start_time) + / scale_factor + ) + # If there was no scale factor passed in just take a difference of the + # end and start times. 
+ else: + scaled_time = float( + profile_event.end_time - profile_event.start_time + ) - profile_event = profile_events[0] - data.append( - float(profile_event.end_time - profile_event.start_time) - / scale_factor_updated - ) + data.append(scaled_time) delegate_debug_metadatas.append( profile_event.delegate_debug_metadata if profile_event.delegate_debug_metadata @@ -646,6 +674,9 @@ def _gen_from_etdump( delegate_metadata_parser: Optional[ Callable[[List[str]], Dict[str, Any]] ] = None, + delegate_time_scale_converter: Optional[ + Callable[[Union[int, str], Union[int, float]], Union[int, float]] + ] = None, ) -> List["EventBlock"]: """ Given an etdump, generate a list of EventBlocks corresponding to the @@ -743,6 +774,7 @@ class GroupedRunInstances: scale_factor, output_buffer, delegate_metadata_parser, + delegate_time_scale_converter, ) for signature, instruction_events in run_group.items() ] @@ -875,6 +907,9 @@ def __init__( delegate_metadata_parser: Optional[ Callable[[List[str]], Dict[str, Any]] ] = None, + delegate_time_scale_converter: Optional[ + Callable[[Union[int, str], Union[int, float]], Union[int, float]] + ] = None, enable_module_hierarchy: bool = False, ) -> None: r""" @@ -930,6 +965,7 @@ def __init__( self._target_time_scale, output_buffer, delegate_metadata_parser=delegate_metadata_parser, + delegate_time_scale_converter=delegate_time_scale_converter, ) # Connect ETRecord to EventBlocks diff --git a/sdk/inspector/_inspector_utils.py b/sdk/inspector/_inspector_utils.py index a71d34753b3..ecef1d13e73 100644 --- a/sdk/inspector/_inspector_utils.py +++ b/sdk/inspector/_inspector_utils.py @@ -103,6 +103,9 @@ def get_scalar_type_size(scalar_type: ScalarType) -> Tuple[torch.dtype, int]: return torch.zeros(tensor.sizes, dtype=torch_dtype) tensor_bytes_size = math.prod(tensor.sizes) * dtype_size + if tensor_bytes_size == 0: + # Empty tensor. Return empty tensor. 
+ return torch.zeros(tensor.sizes, dtype=torch_dtype) if tensor.offset is None: raise ValueError("Tensor offset cannot be None") diff --git a/sdk/inspector/tests/TARGETS b/sdk/inspector/tests/TARGETS index 0e6d06e776c..374d2ea7538 100644 --- a/sdk/inspector/tests/TARGETS +++ b/sdk/inspector/tests/TARGETS @@ -9,6 +9,7 @@ python_unittest( "//executorch/exir:lib", "//executorch/sdk:lib", "//executorch/sdk/debug_format:et_schema", + "//executorch/sdk/etdump:schema_flatcc", "//executorch/sdk/etrecord/tests:etrecord_test_library", "//executorch/sdk/inspector:inspector", "//executorch/sdk/inspector:lib", diff --git a/sdk/inspector/tests/inspector_test.py b/sdk/inspector/tests/inspector_test.py index 472f56f767d..e1625bec755 100644 --- a/sdk/inspector/tests/inspector_test.py +++ b/sdk/inspector/tests/inspector_test.py @@ -17,9 +17,15 @@ from executorch.exir import ExportedProgram from executorch.sdk import generate_etrecord, parse_etrecord from executorch.sdk.debug_format.et_schema import OperatorNode +from executorch.sdk.etdump.schema_flatcc import ProfileEvent from executorch.sdk.etrecord.tests.etrecord_test import TestETRecord from executorch.sdk.inspector import _inspector, Event, EventBlock, Inspector, PerfData +from executorch.sdk.inspector._inspector import ( + InstructionEvent, + InstructionEventSignature, + ProfileEventSignature, +) OP_TYPE = "aten::add" @@ -183,6 +189,49 @@ def test_inspector_associate_with_op_graph_nodes_multiple_debug_handles(self): expected_ops = ["op_0", "op_1"] self.assertEqual(event_with_multiple_debug_handles.op_types, expected_ops) + def test_inspector_delegate_time_scale_converter(self): + def time_scale_converter(event_name, time): + return time / 10 + + event = Event( + name="", + _delegate_metadata_parser=None, + _delegate_time_scale_converter=None, + ) + event_signature = ProfileEventSignature( + name="", + instruction_id=0, + delegate_id_str="test_event", + ) + instruction_events = [ + InstructionEvent( + signature=InstructionEventSignature(0, 0), + profile_events=[ + ProfileEvent( + name="test_event", + chain_index=0, + instruction_id=0, + delegate_debug_id_int=None, + delegate_debug_id_str="test_event_delegated", + start_time=100, + end_time=200, + delegate_debug_metadata=None, + ) + ], + ) + ] + Event._populate_profiling_related_fields( + event, event_signature, instruction_events, 1 + ) + # Value of the perf data before scaling is done. + self.assertEqual(event.perf_data.raw[0], 100) + event._delegate_time_scale_converter = time_scale_converter + Event._populate_profiling_related_fields( + event, event_signature, instruction_events, 1 + ) + # Value of the perf data after scaling is done. 200/10 - 100/10. + self.assertEqual(event.perf_data.raw[0], 10) + def test_inspector_get_exported_program(self): # Create a context manager to patch functions called by Inspector.__init__ with patch.object( diff --git a/setup.py b/setup.py index bef57764b9d..92fd1e3f778 100644 --- a/setup.py +++ b/setup.py @@ -56,6 +56,7 @@ from distutils import log from distutils.sysconfig import get_python_lib from pathlib import Path +from typing import Optional from setuptools import Extension, setup from setuptools.command.build import build @@ -83,10 +84,76 @@ def _is_env_enabled(env_var: str, default: bool = False) -> bool: def pybindings(cls) -> bool: return cls._is_env_enabled("EXECUTORCH_BUILD_PYBIND", default=False) + +class Version: + """Static properties that describe the version of the pip package.""" + + # Cached values returned by the properties. 
+ __root_dir_attr: Optional[str] = None + __string_attr: Optional[str] = None + __git_hash_attr: Optional[str] = None + + @classmethod + @property + def _root_dir(cls) -> str: + """The path to the root of the git repo.""" + if cls.__root_dir_attr is None: + # This setup.py file lives in the root of the repo. + cls.__root_dir_attr = str(Path(__file__).parent.resolve()) + return str(cls.__root_dir_attr) + + @classmethod + @property + def git_hash(cls) -> Optional[str]: + """The current git hash, if known.""" + if cls.__git_hash_attr is None: + import subprocess + + try: + cls.__git_hash_attr = ( + subprocess.check_output( + ["git", "rev-parse", "HEAD"], cwd=cls._root_dir + ) + .decode("ascii") + .strip() + ) + except subprocess.CalledProcessError: + cls.__git_hash_attr = "" # Non-None but empty. + # A non-None but empty value indicates that we don't know it. + return cls.__git_hash_attr if cls.__git_hash_attr else None + @classmethod @property - def xnnpack(cls) -> bool: - return cls._is_env_enabled("EXECUTORCH_BUILD_XNNPACK", default=False) + def string(cls) -> str: + """The version string.""" + if cls.__string_attr is None: + # If set, BUILD_VERSION should override any local version + # information. CI will use this to manage, e.g., release vs. nightly + # versions. + version = os.getenv("BUILD_VERSION", "").strip() + if not version: + # Otherwise, read the version from a local file and add the git + # commit if available. + version = ( + open(os.path.join(cls._root_dir, "version.txt")).read().strip() + ) + if cls.git_hash: + version += "+" + cls.git_hash[:7] + cls.__string_attr = version + return cls.__string_attr + + @classmethod + def write_to_python_file(cls, path: str) -> None: + """Creates a file similar to PyTorch core's `torch/version.py`.""" + lines = [ + "from typing import Optional", + '__all__ = ["__version__", "git_version"]', + f'__version__ = "{cls.string}"', + # A string or None. + f"git_version: Optional[str] = {repr(cls.git_hash)}", + ] + with open(path, "w") as fp: + fp.write("\n".join(lines) + "\n") class _BaseExtension(Extension): @@ -274,6 +341,9 @@ def run(self): # package subdirectory. dst_root = os.path.join(self.build_lib, self.get_package_dir("executorch")) + # Create the version file. + Version.write_to_python_file(os.path.join(dst_root, "version.py")) + # Manually copy files into the output package directory. These are # typically python "resource" files that will live alongside the python # code that uses them. @@ -359,6 +429,7 @@ def run(self): # useful error information to users. "-DEXECUTORCH_ENABLE_LOGGING=ON", "-DEXECUTORCH_LOG_LEVEL=Info", + "-DCMAKE_OSX_DEPLOYMENT_TARGET=10.15", ] build_args = [f"-j{self.parallel}"] @@ -372,13 +443,9 @@ def run(self): "-DEXECUTORCH_BUILD_PYBIND=ON", ] build_args += ["--target", "portable_lib"] - if ShouldBuild.xnnpack: - cmake_args += [ - "-DEXECUTORCH_BUILD_XNNPACK=ON", - ] - # No target needed; the cmake arg will link xnnpack - # into the portable_lib target. - # TODO(dbort): Add MPS/CoreML backends when building on macos. + # To link backends into the portable_lib target, callers should + # add entries like `-DEXECUTORCH_BUILD_XNNPACK=ON` to the CMAKE_ARGS + # environment variable. # Allow adding extra cmake args through the environment. Used by some # tests and demos to expand the set of targets included in the pip @@ -398,7 +465,17 @@ def run(self): if not self.dry_run: # Dry run should log the command but not actually run it. 
(Path(cmake_cache_dir) / "CMakeCache.txt").unlink(missing_ok=True) - self.spawn(["cmake", "-S", repo_root, "-B", cmake_cache_dir, *cmake_args]) + try: + # This script is sometimes run as root in docker containers. buck2 + # doesn't allow running as root unless $HOME is owned by root or + # does not exist. So temporarily undefine it while configuring + # cmake, which runs buck2 to get some source lists. + old_home = os.environ.pop("HOME", None) + # Generate the build system files. + self.spawn(["cmake", "-S", repo_root, "-B", cmake_cache_dir, *cmake_args]) + finally: + if old_home is not None: + os.environ["HOME"] = old_home # Build the system. self.spawn(["cmake", "--build", cmake_cache_dir, *build_args]) @@ -434,7 +511,7 @@ def get_ext_modules() -> list[Extension]: # portable kernels, and a selection of backends. This lets users # load and execute .pte files from python. BuiltExtension( - "portable_lib.*", "executorch.extension.pybindings.portable_lib" + "_portable_lib.*", "executorch.extension.pybindings._portable_lib" ) ) @@ -446,6 +523,7 @@ def get_ext_modules() -> list[Extension]: setup( + version=Version.string, # TODO(dbort): Could use py_modules to restrict the set of modules we # package, and package_data to restrict the set up non-python files we # include. See also setuptools/discovery.py for custom finders. diff --git a/third-party/pytorch b/third-party/pytorch index 0a038cf0cff..23961cef856 160000 --- a/third-party/pytorch +++ b/third-party/pytorch @@ -1 +1 @@ -Subproject commit 0a038cf0cff2d071b7359ac0491fd2ba7798a438 +Subproject commit 23961cef8565b2d01db5280ab518939b74bd5ff5
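For reference, a minimal usage sketch (not part of the patch) of the new pure-Python `portable_lib` wrapper added in extension/pybindings/portable_lib.py. Only `_load_for_executorch`, the `ExecuTorchModule.forward` signature, and the torch-first import behavior come from the patch; the model path and example input below are placeholders.

# Hypothetical usage of the portable_lib wrapper; the .pte path is a placeholder.
import torch  # used here only to build an example input; the wrapper imports torch itself

from executorch.extension.pybindings.portable_lib import _load_for_executorch

# Load a previously exported ExecuTorch program; returns an ExecuTorchModule.
module = _load_for_executorch("/tmp/model.pte")

# forward() takes a sequence of inputs and returns a list of outputs.
outputs = module.forward((torch.ones(2, 2),))
print(outputs[0])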