From 7c5e6e8fdc5b4d981e20b9019f30297d4c3711ff Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Mon, 23 Sep 2024 14:09:54 -0700
Subject: [PATCH 01/15] init

---
 .github/workflows/pull.yml                   | 83 ++++++++++++++++++++
 .gitignore                                   |  1 +
 docs/quantization.md                         | 69 ++++++++++++++++
 install/.pins/torchao-pin.txt                |  1 +
 runner/aoti.cmake                            |  4 +
 runner/et.cmake                              |  7 ++
 torchchat/utils/quantize.py                  | 47 ++++++++++-
 torchchat/utils/scripts/build_native.sh      | 24 +++++-
 torchchat/utils/scripts/build_torchao_ops.sh | 16 ++++
 torchchat/utils/scripts/install_utils.sh     | 49 ++++++++++++
 10 files changed, 295 insertions(+), 6 deletions(-)
 create mode 100644 install/.pins/torchao-pin.txt
 create mode 100644 torchchat/utils/scripts/build_torchao_ops.sh

diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 3e92ed9c0..8e69e4bac 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -1023,3 +1023,86 @@ jobs:
           git submodule update --init
           ./runner/build_android.sh
           echo "Tests complete."
+
+  test-torchao-experimental:
+    strategy:
+      matrix:
+        runner: [macos-14-xlarge]
+    runs-on: ${{matrix.runner}}
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+      - name: Setup Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.10.11
+      - name: Setup Xcode
+        if: runner.os == 'macOS'
+        uses: maxim-lobanov/setup-xcode@v1
+        with:
+          xcode-version: '15.3'
+      - name: Print machine info
+        run: |
+          uname -a
+          if [ $(uname -s) == Darwin ]; then
+            sysctl machdep.cpu.brand_string
+            sysctl machdep.cpu.core_count
+          fi
+      - name: Install torchchat
+        run: |
+          echo "Intalling pip3 packages"
+          ./install/install_requirements.sh
+          pip3 list
+          python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
+      - name: Install torchao-ops
+        id: install-torchao-ops
+        run: |
+          bash torchchat/utils/scripts/build_torchao_ops.sh
+      - name: Set git shas
+        id: setup-hash
+        run: |
+          export TORCHCHAT_ROOT=${PWD}
+          echo "et-git-hash=$(cat ${TORCHCHAT_ROOT}/install/.pins/et-pin.txt)" >> "$GITHUB_ENV"
+      - name: Load or install ET
+        id: install-et
+        uses: actions/cache@v3
+        env:
+          cache-key: et-build-${{runner.os}}-${{runner.arch}}-${{env.et-git-hash}}
+        with:
+          path: ./et-build
+          key: ${{env.cache-key}}
+          restore-keys: |
+            ${{env.cache-key}}
+      - if: ${{ steps.install-et.outputs.cache-hit != 'true' }}
+        continue-on-error: true
+        run: |
+          echo "Installing ExecuTorch"
+          bash torchchat/utils/scripts/install_et.sh
+      - name: Install runner
+        run: |
+          echo "Installing runner"
+          bash torchchat/utils/scripts/build_native.sh et link_torchao_ops
+      - name: Install runner AOTI
+        id: install-runner-aoti
+        run: |
+          bash torchchat/utils/scripts/build_native.sh aoti link_torchao_ops
+      - name: Run inference
+        run: |
+          python torchchat.py download stories110M
+          wget -O ./tokenizer.model https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
+          export PRMT="Once upon a time in a land far away"
+          echo "Generate eager"
+          python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}"  --device cpu --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}'
+          echo "Generate compile"
+          python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}"  --device cpu --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}' --compile
+          echo "Export and run ET (C++ runner)"
+          python torchchat.py export stories110M --output-pte-path ./model.pte --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}'
+          ./cmake-out/et_run ./model.pte -z ./tokenizer.model -t 0 -i "${PRMT}"
+          echo "Export and run AOTI (C++ runner)"
+          python torchchat.py export stories110M --output-dso-path ./model.so --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}'
+          ./cmake-out/aoti_run ./model.so -z ./tokenizer.model -t 0 -i "${PRMT}"
+          echo "Generate AOTI"
+          python torchchat.py generate stories110M --dso-path ./model.so --prompt "${PRMT}"
+          echo "Tests complete."
diff --git a/.gitignore b/.gitignore
index 044bad856..74d0a28fa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,6 +14,7 @@ __pycache__/
 # Build directories
 build/android/*
 et-build/*
+torchao-build/*
 runner-et/cmake-out/*
 runner-aoti/cmake-out/*
 cmake-out/
diff --git a/docs/quantization.md b/docs/quantization.md
index 1f619e58e..c0899adee 100644
--- a/docs/quantization.md
+++ b/docs/quantization.md
@@ -118,6 +118,75 @@ python3 torchchat.py export llama3 --quantize '{"embedding": {"bitwidth": 4, "gr
 python3 torchchat.py generate llama3 --pte-path llama3.pte  --prompt "Hello my name is"
 ```
 
+## Experimental TorchAO lowbit kernels
+
+### Use
+The quantization scheme a8wxdq dynamically quantizes activations to 8 bits, and quantizes the weights in a groupwise manner with a specified bitwidth and groupsize.
+It takes arguments bitwidth (2, 3, 4, 5, 6, 7), groupsize, and has_weight_zeros (true, false).
+The argument has_weight_zeros indicates whether the weights are quantized with scales only (has_weight_zeros: false) or with both scales and zeros (has_weight_zeros: true).
+Roughly speaking, {bitwidth: 4, groupsize: 256, has_weight_zeros: false} is similar to GGML's Q40 quantization scheme.
+
+You should expect high performance on ARM CPU if bitwidth is 2, 3, 4, or 5 and groupsize is divisible by 16.  With other platforms and argument choices, a slow fallback kernel will be used.  You will see warnings about this during quantization.
+
+### Setup
+To use a8wxdq, you must set up the torchao experimental kernels.  These will only work on devices with ARM CPUs, for example on Mac computers with Apple Silicon.
+
+From the torchchat root directory, run
+```
+sh torchchat/utils/scripts/build_torchao_ops.sh
+```
+
+This should take about 10 seconds to complete.  Once finished, you can use a8wxdq in torchchat.
+
+Note: if you want to use the new kernels in the AOTI and C++ runners, you must pass the flag link_torchao when running the scripts the build the runners.
+
+```
+sh torchchat/utils/scripts/build_native.sh aoti link_torchao_ops
+```
+
+```
+sh torchchat/utils/scripts/build_native.sh et link_torchao_ops
+```
+
+Note before running `sh torchchat/utils/scripts/build_native.sh et link_torchao_ops`, you must first install executorch with `sh torchchat/utils/scripts/install_et.sh` if you have not done so already.
+
+### Examples
+
+Below we show how to use the new kernels.  Except for ExecuTorch, you can specify the number of threads used by setting OMP_NUM_THREADS (as is the case with PyTorch in general).  Doing so is optional and a default number of threads will be chosen automatically if you do not specify.
+
+#### Eager mode
+```
+OMP_NUM_THREADS=6 python3 torchchat.py generate llama3.1 --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}' --prompt "Once upon a time," --num-samples 5
+```
+
+#### torch.compile
+```
+OMP_NUM_THREADS=6 python3 torchchat.py generate llama3.1 --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}' --compile --prompt "Once upon a time,"  --num-samples 5
+```
+
+#### AOTI
+```
+OMP_NUM_THREADS=6 python torchchat.py export llama3.1 --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}' --output-dso llama3_1.so
+OMP_NUM_THREADS=6 python3 torchchat.py generate llama3.1 --dso-path llama3_1.so --prompt "Once upon a time,"  --num-samples 5
+```
+
+If you built the AOTI runner with link_torchao_ops as discussed in the setup section, you can also use the C++ runner:
+
+```
+OMP_NUM_THREADS=6 ./cmake-out/aoti_run llama3_1.so -z $HOME/.torchchat/model-cache/meta-llama/Meta-Llama-3.1-8B-Instruct/tokenizer.model -l 3 -i "Once upon a time,"
+```
+
+#### ExecuTorch
+```
+python torchchat.py export llama3.1 --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}' --output-pte llama3_1.pte
+```
+
+Note: only the ExecuTorch C++ runner in torchchat when built using the instructions in the setup can run the exported *.pte file.  It will not work with the `python torchchat.py generate` command.
+
+```
+./cmake-out/et_run llama3_1.pte -z $HOME/.torchchat/model-cache/meta-llama/Meta-Llama-3.1-8B-Instruct/tokenizer.model -l 3 -i "Once upon a time,"
+```
+
 ## Quantization Profiles
 
 Four [sample profiles](https://github.com/pytorch/torchchat/tree/main/torchchat/quant_config/) are included with the torchchat distribution: `cuda.json`, `desktop.json`, `mobile.json`, `pi5.json`
diff --git a/install/.pins/torchao-pin.txt b/install/.pins/torchao-pin.txt
new file mode 100644
index 000000000..9b101777d
--- /dev/null
+++ b/install/.pins/torchao-pin.txt
@@ -0,0 +1 @@
+3fa38aaf1276e36845a82fb399e5054718a441c4
diff --git a/runner/aoti.cmake b/runner/aoti.cmake
index 156e9bcce..ae907b391 100644
--- a/runner/aoti.cmake
+++ b/runner/aoti.cmake
@@ -28,3 +28,7 @@ if(Torch_FOUND)
     target_link_libraries(aoti_run "${TORCH_LIBRARIES}" m)
     set_property(TARGET aoti_run PROPERTY CXX_STANDARD 17)
 endif()
+
+if (LINK_TORCHAO_OPS)
+    target_link_libraries(aoti_run "${TORCHCHAT_ROOT}/torchao-build/cmake-out/lib/libtorchao_ops_aten${CMAKE_SHARED_LIBRARY_SUFFIX}")
+endif()
diff --git a/runner/et.cmake b/runner/et.cmake
index 99e67a025..916ce9ea8 100644
--- a/runner/et.cmake
+++ b/runner/et.cmake
@@ -116,6 +116,13 @@ if(executorch_FOUND)
     target_link_libraries(et_run PRIVATE log)
   endif()
 
+  if(LINK_TORCHAO_OPS)
+    target_link_libraries(et_run PRIVATE "$<LINK_LIBRARY:WHOLE_ARCHIVE,${TORCHCHAT_ROOT}/torchao-build/cmake-out/lib/libtorchao_ops_executorch.a>")
+    target_link_libraries(et_run PRIVATE
+      "${TORCHCHAT_ROOT}/torchao-build/cmake-out/lib/libtorchao_kernels_aarch64.a"
+    )
+  endif()
+
 else()
   MESSAGE(WARNING "ExecuTorch package not found")
 endif()
diff --git a/torchchat/utils/quantize.py b/torchchat/utils/quantize.py
index a0d9248a9..aa0b11e1e 100644
--- a/torchchat/utils/quantize.py
+++ b/torchchat/utils/quantize.py
@@ -96,10 +96,19 @@ def quantize_model(
                 precision = get_precision()
 
             try:
-                # Easier to ask forgiveness than permission
-                quant_handler = ao_quantizer_class_dict[quantizer](
-                    groupsize=q_kwargs["groupsize"], device=device, precision=precision
-                )
+                if quantizer == "linear:a8wxdq":
+                    quant_handler = ao_quantizer_class_dict[quantizer](
+                        device=device,
+                        precision=precision,
+                        bitwidth=q_kwargs.get("bitwidth", 4),
+                        groupsize=q_kwargs.get("groupsize", 128),
+                        has_weight_zeros=q_kwargs.get("has_weight_zeros", False),
+                    )
+                else:
+                    # Easier to ask forgiveness than permission
+                    quant_handler = ao_quantizer_class_dict[quantizer](
+                        groupsize=q_kwargs["groupsize"], device=device, precision=precision
+                    )
             except TypeError as e:
                 if "unexpected keyword argument 'device'" in str(e):
                     quant_handler = ao_quantizer_class_dict[quantizer](
@@ -861,3 +870,33 @@ def quantized_model(self) -> nn.Module:
     "linear:int4": Int4WeightOnlyQuantizer,
     "linear:a8w4dq": Int8DynActInt4WeightQuantizer,
 }
+
+try:
+    import importlib.util
+    import sys
+    import os
+    torchao_build_path = f"{os.getcwd()}/torchao-build"
+
+    # Try loading quantizer
+    torchao_experimental_quant_api_spec = importlib.util.spec_from_file_location(
+        "torchao_experimental_quant_api",
+        f"{torchao_build_path}/src/ao/torchao/experimental/quant_api.py",
+    )
+    torchao_experimental_quant_api = importlib.util.module_from_spec(torchao_experimental_quant_api_spec)
+    sys.modules["torchao_experimental_quant_api"] = torchao_experimental_quant_api
+    torchao_experimental_quant_api_spec.loader.exec_module(torchao_experimental_quant_api)
+    from torchao_experimental_quant_api import Int8DynActIntxWeightQuantizer
+    ao_quantizer_class_dict["linear:a8wxdq"] = Int8DynActIntxWeightQuantizer
+
+    # Try loading custom op
+    try:
+        import glob
+        libs = glob.glob(f"{torchao_build_path}/cmake-out/lib/libtorchao_ops_aten.*")
+        libs = list(filter(lambda l: (l.endswith("so") or l.endswith("dylib")), libs))
+        torch.ops.load_library(libs[0])
+    except Exception as e:
+        print("Failed to torchao ops library with error: ", e)
+        print("Slow fallback kernels will be used.")
+
+except Exception as e:
+    print(f"Failed to load torchao experimental a8wxdq quantizer with error: {e}")
diff --git a/torchchat/utils/scripts/build_native.sh b/torchchat/utils/scripts/build_native.sh
index 924b86a65..3c2c1c846 100755
--- a/torchchat/utils/scripts/build_native.sh
+++ b/torchchat/utils/scripts/build_native.sh
@@ -26,6 +26,7 @@ if [ $# -eq 0 ]; then
     exit 1
 fi
 
+LINK_TORCHAO_OPS=OFF
 while (( "$#" )); do
   case "$1" in
     -h|--help)
@@ -42,6 +43,11 @@ while (( "$#" )); do
       TARGET="et"
       shift
       ;;
+    link_torchao_ops)
+      echo "Linking with torchao ops..."
+      LINK_TORCHAO_OPS=ON
+      shift
+      ;;
     *)
       echo "Invalid option: $1"
       show_help
@@ -66,14 +72,28 @@ if [[ "$TARGET" == "et" ]]; then
     echo "Make sure you run install_executorch_libs"
     exit 1
   fi
+
+  if [[ "$LINK_TORCHAO_OPS" == "ON" ]]; then
+    if [ ! -d "${TORCHCHAT_ROOT}/torchao-build" ]; then
+      echo "Directory ${TORCHCHAT_ROOT}/torchao-build does not exist."
+      echo "Make sure you run clone_torchao"
+      exit 1
+    fi
+
+    source "$(dirname "${BASH_SOURCE[0]}")/install_utils.sh"
+    find_cmake_prefix_path
+    EXECUTORCH_INCLUDE_DIRS="${TORCHCHAT_ROOT}/${ET_BUILD_DIR}/install/include;${TORCHCHAT_ROOT}/${ET_BUILD_DIR}/src"
+    EXECUTORCH_LIBRARIES="${TORCHCHAT_ROOT}/${ET_BUILD_DIR}/install/lib/libexecutorch_no_prim_ops.a;${TORCHCHAT_ROOT}/${ET_BUILD_DIR}/install/lib/libextension_threadpool.a;${TORCHCHAT_ROOT}/${ET_BUILD_DIR}/install/lib/libcpuinfo.a;${TORCHCHAT_ROOT}/${ET_BUILD_DIR}/install/lib/libpthreadpool.a"
+    install_torchao_executorch_ops
+  fi
 fi
 popd
 
 # CMake commands
 if [[ "$TARGET" == "et" ]]; then
-    cmake -S . -B ./cmake-out -DCMAKE_PREFIX_PATH=`python3 -c 'import torch;print(torch.utils.cmake_prefix_path)'` -DET_USE_ADAPTIVE_THREADS=ON -DCMAKE_CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=1" -G Ninja
+    cmake -S . -B ./cmake-out -DCMAKE_PREFIX_PATH=`python3 -c 'import torch;print(torch.utils.cmake_prefix_path)'` -DLINK_TORCHAO_OPS="${LINK_TORCHAO_OPS}" -DET_USE_ADAPTIVE_THREADS=ON -DCMAKE_CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=1" -G Ninja
 else
-    cmake -S . -B ./cmake-out -DCMAKE_PREFIX_PATH=`python3 -c 'import torch;print(torch.utils.cmake_prefix_path)'` -DCMAKE_CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=0" -G Ninja
+    cmake -S . -B ./cmake-out -DCMAKE_PREFIX_PATH=`python3 -c 'import torch;print(torch.utils.cmake_prefix_path)'` -DLINK_TORCHAO_OPS="${LINK_TORCHAO_OPS}" -DCMAKE_CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=0" -G Ninja
 fi
 cmake --build ./cmake-out --target "${TARGET}"_run
 
diff --git a/torchchat/utils/scripts/build_torchao_ops.sh b/torchchat/utils/scripts/build_torchao_ops.sh
new file mode 100644
index 000000000..a8fd8bea2
--- /dev/null
+++ b/torchchat/utils/scripts/build_torchao_ops.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+
+source "$(dirname "${BASH_SOURCE[0]}")/install_utils.sh"
+
+pushd ${TORCHCHAT_ROOT}
+find_cmake_prefix_path
+clone_torchao
+install_torchao_aten_ops
+popd
diff --git a/torchchat/utils/scripts/install_utils.sh b/torchchat/utils/scripts/install_utils.sh
index 0ff4608c6..2add4ba7d 100644
--- a/torchchat/utils/scripts/install_utils.sh
+++ b/torchchat/utils/scripts/install_utils.sh
@@ -161,3 +161,52 @@ install_executorch_libs() {
   install_executorch_cpp_libs
   install_executorch_python_libs $1
 }
+
+clone_torchao() {
+  echo "Cloning torchao to ${TORCHCHAT_ROOT}/torchao-build/src"
+  rm -rf ${TORCHCHAT_ROOT}/torchao-build/src
+  mkdir -p ${TORCHCHAT_ROOT}/torchao-build/src
+  pushd ${TORCHCHAT_ROOT}/torchao-build/src
+  echo $pwd
+
+  cp -R ${HOME}/fbsource/fbcode/pytorch/ao .
+  # git clone https://github.com/pytorch/ao.git
+  # cd ao
+  # git checkout $(cat ${TORCHCHAT_ROOT}/intstall/.pins/torchao-experimental-pin.txt)
+
+  popd
+}
+
+install_torchao_aten_ops() {
+  echo "Building torchao custom ops for ATen"
+  pushd ${TORCHCHAT_ROOT}/torchao-build/src/ao/torchao/experimental
+
+  CMAKE_OUT_DIR=${TORCHCHAT_ROOT}/torchao-build/cmake-out
+  cmake -DCMAKE_PREFIX_PATH=${MY_CMAKE_PREFIX_PATH} \
+    -DCMAKE_INSTALL_PREFIX=${CMAKE_OUT_DIR} \
+    -DCMAKE_BUILD_TYPE="Release" \
+    -DTORCHAO_OP_TARGET="aten" \
+    -S . \
+    -B ${CMAKE_OUT_DIR} -G Ninja
+  cmake --build  ${CMAKE_OUT_DIR} --target install --config Release
+
+  popd
+}
+
+install_torchao_executorch_ops() {
+  echo "Building torchao custom ops for ExecuTorch"
+  pushd ${TORCHCHAT_ROOT}/torchao-build/src/ao/torchao/experimental
+
+  CMAKE_OUT_DIR="${TORCHCHAT_ROOT}/torchao-build/cmake-out"
+  cmake -DCMAKE_PREFIX_PATH=${MY_CMAKE_PREFIX_PATH} \
+    -DCMAKE_INSTALL_PREFIX=${CMAKE_OUT_DIR} \
+    -DCMAKE_BUILD_TYPE="Release" \
+    -DTORCHAO_OP_TARGET="executorch" \
+    -DEXECUTORCH_INCLUDE_DIRS="${EXECUTORCH_INCLUDE_DIRS}" \
+    -DEXECUTORCH_LIBRARIES="${EXECUTORCH_LIBRARIES}" \
+    -S . \
+    -B ${CMAKE_OUT_DIR} -G Ninja
+  cmake --build  ${CMAKE_OUT_DIR} --target install --config Release
+
+  popd
+}

From 97606b40a7c5651e6362502be47b510bf0c3702f Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Mon, 23 Sep 2024 16:00:44 -0700
Subject: [PATCH 02/15] update install utils

---
 torchchat/utils/scripts/install_utils.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torchchat/utils/scripts/install_utils.sh b/torchchat/utils/scripts/install_utils.sh
index 2add4ba7d..4a3d50bbe 100644
--- a/torchchat/utils/scripts/install_utils.sh
+++ b/torchchat/utils/scripts/install_utils.sh
@@ -164,7 +164,7 @@ install_executorch_libs() {
 
 clone_torchao() {
   echo "Cloning torchao to ${TORCHCHAT_ROOT}/torchao-build/src"
-  rm -rf ${TORCHCHAT_ROOT}/torchao-build/src
+  rm -rf ${TORCHCHAT_ROOT}/torchao-build
   mkdir -p ${TORCHCHAT_ROOT}/torchao-build/src
   pushd ${TORCHCHAT_ROOT}/torchao-build/src
   echo $pwd
@@ -172,7 +172,7 @@ clone_torchao() {
   cp -R ${HOME}/fbsource/fbcode/pytorch/ao .
   # git clone https://github.com/pytorch/ao.git
   # cd ao
-  # git checkout $(cat ${TORCHCHAT_ROOT}/intstall/.pins/torchao-experimental-pin.txt)
+  # git checkout $(cat ${TORCHCHAT_ROOT}/intstall/.pins/torchao-pin.txt)
 
   popd
 }

From d1fc4fc6c437c7d56f0bcdfdd724763e4bab01e7 Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Mon, 23 Sep 2024 17:09:35 -0700
Subject: [PATCH 03/15] update

---
 torchchat/utils/quantize.py              | 2 +-
 torchchat/utils/scripts/install_utils.sh | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/torchchat/utils/quantize.py b/torchchat/utils/quantize.py
index aa0b11e1e..77b03fcba 100644
--- a/torchchat/utils/quantize.py
+++ b/torchchat/utils/quantize.py
@@ -891,7 +891,7 @@ def quantized_model(self) -> nn.Module:
     # Try loading custom op
     try:
         import glob
-        libs = glob.glob(f"{torchao_build_path}/cmake-out/lib/libtorchao_ops_aten.*")
+        libs = glob.glob(f"{torchao_build_path}/cmake-out/lib/liblinear_a8wxdq_ATEN.*")
         libs = list(filter(lambda l: (l.endswith("so") or l.endswith("dylib")), libs))
         torch.ops.load_library(libs[0])
     except Exception as e:
diff --git a/torchchat/utils/scripts/install_utils.sh b/torchchat/utils/scripts/install_utils.sh
index 4a3d50bbe..0a816f20d 100644
--- a/torchchat/utils/scripts/install_utils.sh
+++ b/torchchat/utils/scripts/install_utils.sh
@@ -185,7 +185,7 @@ install_torchao_aten_ops() {
   cmake -DCMAKE_PREFIX_PATH=${MY_CMAKE_PREFIX_PATH} \
     -DCMAKE_INSTALL_PREFIX=${CMAKE_OUT_DIR} \
     -DCMAKE_BUILD_TYPE="Release" \
-    -DTORCHAO_OP_TARGET="aten" \
+    -DTORCHAO_OP_TARGET="ATEN" \
     -S . \
     -B ${CMAKE_OUT_DIR} -G Ninja
   cmake --build  ${CMAKE_OUT_DIR} --target install --config Release
@@ -201,7 +201,7 @@ install_torchao_executorch_ops() {
   cmake -DCMAKE_PREFIX_PATH=${MY_CMAKE_PREFIX_PATH} \
     -DCMAKE_INSTALL_PREFIX=${CMAKE_OUT_DIR} \
     -DCMAKE_BUILD_TYPE="Release" \
-    -DTORCHAO_OP_TARGET="executorch" \
+    -DTORCHAO_OP_TARGET="EXECUTORCH" \
     -DEXECUTORCH_INCLUDE_DIRS="${EXECUTORCH_INCLUDE_DIRS}" \
     -DEXECUTORCH_LIBRARIES="${EXECUTORCH_LIBRARIES}" \
     -S . \

From 3237f2752d003922804185185abf5ef45c245662 Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Mon, 23 Sep 2024 17:21:30 -0700
Subject: [PATCH 04/15] update libs

---
 runner/aoti.cmake | 2 +-
 runner/et.cmake   | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/runner/aoti.cmake b/runner/aoti.cmake
index ae907b391..082a6f5ce 100644
--- a/runner/aoti.cmake
+++ b/runner/aoti.cmake
@@ -30,5 +30,5 @@ if(Torch_FOUND)
 endif()
 
 if (LINK_TORCHAO_OPS)
-    target_link_libraries(aoti_run "${TORCHCHAT_ROOT}/torchao-build/cmake-out/lib/libtorchao_ops_aten${CMAKE_SHARED_LIBRARY_SUFFIX}")
+    target_link_libraries(aoti_run "${TORCHCHAT_ROOT}/torchao-build/cmake-out/lib/liblinear_a8wxdq_ATEN${CMAKE_SHARED_LIBRARY_SUFFIX}")
 endif()
diff --git a/runner/et.cmake b/runner/et.cmake
index 916ce9ea8..c788ead56 100644
--- a/runner/et.cmake
+++ b/runner/et.cmake
@@ -117,9 +117,10 @@ if(executorch_FOUND)
   endif()
 
   if(LINK_TORCHAO_OPS)
-    target_link_libraries(et_run PRIVATE "$<LINK_LIBRARY:WHOLE_ARCHIVE,${TORCHCHAT_ROOT}/torchao-build/cmake-out/lib/libtorchao_ops_executorch.a>")
+    target_link_libraries(et_run PRIVATE "$<LINK_LIBRARY:WHOLE_ARCHIVE,${TORCHCHAT_ROOT}/torchao-build/cmake-out/lib/liblinear_a8wxdq_EXECUTORCH.a>")
     target_link_libraries(et_run PRIVATE
       "${TORCHCHAT_ROOT}/torchao-build/cmake-out/lib/libtorchao_kernels_aarch64.a"
+       "${TORCHCHAT_ROOT}/torchao-build/cmake-out/lib/libtorchao_ops_linear_EXECUTORCH.a"
     )
   endif()
 

From ffc09c695b6b3d5be5fafad655b1727c8deb1ecb Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Thu, 26 Sep 2024 13:19:28 -0700
Subject: [PATCH 05/15] update torchao pin

---
 install/.pins/torchao-pin.txt            | 2 +-
 torchchat/utils/scripts/install_utils.sh | 7 +++----
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/install/.pins/torchao-pin.txt b/install/.pins/torchao-pin.txt
index 9b101777d..b28bd09cd 100644
--- a/install/.pins/torchao-pin.txt
+++ b/install/.pins/torchao-pin.txt
@@ -1 +1 @@
-3fa38aaf1276e36845a82fb399e5054718a441c4
+63cb7a9857654784f726fec75c0dc36167094d8a
diff --git a/torchchat/utils/scripts/install_utils.sh b/torchchat/utils/scripts/install_utils.sh
index 0a816f20d..d908451f0 100644
--- a/torchchat/utils/scripts/install_utils.sh
+++ b/torchchat/utils/scripts/install_utils.sh
@@ -169,10 +169,9 @@ clone_torchao() {
   pushd ${TORCHCHAT_ROOT}/torchao-build/src
   echo $pwd
 
-  cp -R ${HOME}/fbsource/fbcode/pytorch/ao .
-  # git clone https://github.com/pytorch/ao.git
-  # cd ao
-  # git checkout $(cat ${TORCHCHAT_ROOT}/intstall/.pins/torchao-pin.txt)
+  git clone https://github.com/pytorch/ao.git
+  cd ao
+  git checkout $(cat ${TORCHCHAT_ROOT}/intstall/.pins/torchao-pin.txt)
 
   popd
 }

From 23b285d4e1998cdbf962334a1f7dace8975bb15d Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Thu, 26 Sep 2024 13:29:49 -0700
Subject: [PATCH 06/15] fix ci test

---
 .github/workflows/pull.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 8e69e4bac..68fa2b126 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -1094,14 +1094,14 @@ jobs:
           wget -O ./tokenizer.model https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
           export PRMT="Once upon a time in a land far away"
           echo "Generate eager"
-          python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}"  --device cpu --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}'
+          python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}"  --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}'
           echo "Generate compile"
-          python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}"  --device cpu --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}' --compile
+          python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}"  --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}' --compile
           echo "Export and run ET (C++ runner)"
-          python torchchat.py export stories110M --output-pte-path ./model.pte --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}'
+          python torchchat.py export stories110M --output-pte-path ./model.pte --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}'
           ./cmake-out/et_run ./model.pte -z ./tokenizer.model -t 0 -i "${PRMT}"
           echo "Export and run AOTI (C++ runner)"
-          python torchchat.py export stories110M --output-dso-path ./model.so --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}'
+          python torchchat.py export stories110M --output-dso-path ./model.so --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}'
           ./cmake-out/aoti_run ./model.so -z ./tokenizer.model -t 0 -i "${PRMT}"
           echo "Generate AOTI"
           python torchchat.py generate stories110M --dso-path ./model.so --prompt "${PRMT}"

From 088cc06670102689ea077b7f88c91cdb61f97044 Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Thu, 26 Sep 2024 13:36:32 -0700
Subject: [PATCH 07/15] add python et install to ci

---
 .github/workflows/pull.yml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 68fa2b126..aa9813f38 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -1080,6 +1080,14 @@ jobs:
         run: |
           echo "Installing ExecuTorch"
           bash torchchat/utils/scripts/install_et.sh
+      - name: Install ExecuTorch python
+        run: |
+          echo "Install ExecuTorch python"
+          pushd et-build/src/executorch
+          chmod +x ./install_requirements.sh
+          chmod +x ./install_requirements.py
+          ./install_requirements.sh
+          popd
       - name: Install runner
         run: |
           echo "Installing runner"

From 7114449f4294e19618c3d694be426875fd302e7f Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Fri, 27 Sep 2024 10:46:44 -0700
Subject: [PATCH 08/15] fix ci errors

---
 .github/workflows/pull.yml               | 16 ++++++++--------
 torchchat/utils/scripts/install_et.sh    |  6 ------
 torchchat/utils/scripts/install_utils.sh |  7 +++++++
 3 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index aa9813f38..ea91e84d9 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -918,10 +918,10 @@ jobs:
       - name: Install ExecuTorch python
         run: |
           echo "Install ExecuTorch python"
-          pushd et-build/src/executorch
-          chmod +x ./install_requirements.sh
-          chmod +x ./install_requirements.py
-          ./install_requirements.sh
+          export TORCHCHAT_ROOT=$PWD
+          ENABLE_ET_PYBIND="${1:-true}"
+          source "torchchat/utils/scripts/install_utils.sh"
+          install_executorch_python_libs $ENABLE_ET_PYBIND
           popd
       - name: Install runner
         run: |
@@ -1083,10 +1083,10 @@ jobs:
       - name: Install ExecuTorch python
         run: |
           echo "Install ExecuTorch python"
-          pushd et-build/src/executorch
-          chmod +x ./install_requirements.sh
-          chmod +x ./install_requirements.py
-          ./install_requirements.sh
+          export TORCHCHAT_ROOT=$PWD
+          ENABLE_ET_PYBIND="${1:-true}"
+          source "torchchat/utils/scripts/install_utils.sh"
+          install_executorch_python_libs $ENABLE_ET_PYBIND
           popd
       - name: Install runner
         run: |
diff --git a/torchchat/utils/scripts/install_et.sh b/torchchat/utils/scripts/install_et.sh
index 04db3b287..8062a8316 100755
--- a/torchchat/utils/scripts/install_et.sh
+++ b/torchchat/utils/scripts/install_et.sh
@@ -19,10 +19,4 @@ pushd ${TORCHCHAT_ROOT}
 find_cmake_prefix_path
 clone_executorch
 install_executorch_libs $ENABLE_ET_PYBIND
-install_executorch_python_libs $ENABLE_ET_PYBIND
-# TODO: figure out the root cause of 'AttributeError: module 'evaluate'
-# has no attribute 'utils'' error from evaluate CI jobs and remove
-# `import lm_eval` from torchchat.py since it requires a specific version
-# of numpy.
-pip install numpy=='1.26.4'
 popd
diff --git a/torchchat/utils/scripts/install_utils.sh b/torchchat/utils/scripts/install_utils.sh
index d908451f0..ec9677373 100644
--- a/torchchat/utils/scripts/install_utils.sh
+++ b/torchchat/utils/scripts/install_utils.sh
@@ -93,6 +93,13 @@ install_executorch_python_libs() {
       echo "Installing pybind"
       bash ./install_requirements.sh --pybind xnnpack
   fi
+
+  # TODO: figure out the root cause of 'AttributeError: module 'evaluate'
+  # has no attribute 'utils'' error from evaluate CI jobs and remove
+  # `import lm_eval` from torchchat.py since it requires a specific version
+  # of numpy.
+  pip install numpy=='1.26.4'
+
   pip3 list
   popd
 }

From ca95554c883a3d83c163605feaa30ed9115bbfc6 Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Fri, 27 Sep 2024 10:59:13 -0700
Subject: [PATCH 09/15] fixes

---
 .github/workflows/pull.yml | 133 +++++++++++++++++++------------------
 1 file changed, 69 insertions(+), 64 deletions(-)

diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index ea91e84d9..f0d411400 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -737,63 +737,68 @@ jobs:
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
     with:
       runner: macos-m1-stable  # needs MPS, was macos-m1-stable
-      script: |
-        export PYTHON_VERSION="3.10"
-        set -x
-        # NS/MC: Remove previous installation of torch and torchao first
-        # as this script does not install anything into conda env but rather as system dep
-        pip3 uninstall -y torch || true
-        set -eou pipefail
-
-        pip3 uninstall -y torchao || true
-        set -eou pipefail
-
-        echo "::group::Print machine info"
-        uname -a
-        sysctl machdep.cpu.brand_string
-        sysctl machdep.cpu.core_count
-        echo "::endgroup::"
-
-        echo "::group::Install requirements"
-        # Install requirements
-        ./install/install_requirements.sh
-        ls -la
-        pwd
-        pip3 list
-        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
-        echo "::endgroup::"
-
-        echo "::group::Download checkpoints"
-        (
-          mkdir -p checkpoints/stories15M
-          pushd checkpoints/stories15M
-          curl -fsSL -O https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
-          curl -fsSL -O https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
-          popd
-        )
-        echo "::endgroup::"
-
-        echo "::group::Run inference"
-        export MODEL_PATH=checkpoints/stories15M/stories15M.pt
-        export MODEL_NAME=stories15M
-        export MODEL_DIR=/tmp
-        for DTYPE in float16 float32; do
-          # if [ $(uname -s) == Darwin ]; then
-          #   export DTYPE=float16
-          # fi
-
-          python3 torchchat.py generate --dtype ${DTYPE} --device mps --checkpoint-path ${MODEL_PATH} --temperature 0
-
-          python3 torchchat.py generate --dtype ${DTYPE} --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0
-
-          python3 torchchat.py generate --dtype ${DTYPE} --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0
-
-          python3 torchchat.py generate --dtype ${DTYPE} --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0
-
-          python3 torchchat.py generate --dtype ${DTYPE} --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0
-
-          PYTORCH_ENABLE_MPS_FALLBACK=1 python3 torchchat.py generate --dtype ${DTYPE} --device mps --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0
-        done
+      steps:
+        - name: Setup Python
+          uses: actions/setup-python@v2
+          with:
+            python-version: 3.10.11
+        - name: Run test
+          run: |
+            export PYTHON_VERSION="3.10"
+            set -x
+            # NS/MC: Remove previous installation of torch and torchao first
+            # as this script does not install anything into conda env but rather as system dep
+            pip3 uninstall -y torch || true
+            set -eou pipefail
+
+            pip3 uninstall -y torchao || true
+            set -eou pipefail
+
+            echo "::group::Print machine info"
+            uname -a
+            sysctl machdep.cpu.brand_string
+            sysctl machdep.cpu.core_count
+            echo "::endgroup::"
+
+            echo "::group::Install requirements"
+            # Install requirements
+            ./install/install_requirements.sh
+            ls -la
+            pwd
+            pip3 list
+            python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
+            echo "::endgroup::"
+
+            echo "::group::Download checkpoints"
+            (
+              mkdir -p checkpoints/stories15M
+              pushd checkpoints/stories15M
+              curl -fsSL -O https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
+              curl -fsSL -O https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
+              popd
+            )
+            echo "::endgroup::"
+
+            echo "::group::Run inference"
+            export MODEL_PATH=checkpoints/stories15M/stories15M.pt
+            export MODEL_NAME=stories15M
+            export MODEL_DIR=/tmp
+            for DTYPE in float16 float32; do
+              # if [ $(uname -s) == Darwin ]; then
+              #   export DTYPE=float16
+              # fi
+
+              python3 torchchat.py generate --dtype ${DTYPE} --device mps --checkpoint-path ${MODEL_PATH} --temperature 0
+
+              python3 torchchat.py generate --dtype ${DTYPE} --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0
+
+              python3 torchchat.py generate --dtype ${DTYPE} --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0
+
+              python3 torchchat.py generate --dtype ${DTYPE} --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0
+
+              python3 torchchat.py generate --dtype ${DTYPE} --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0
+
+              PYTORCH_ENABLE_MPS_FALLBACK=1 python3 torchchat.py generate --dtype ${DTYPE} --device mps --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0
   compile-gguf:
     strategy:
       matrix:
@@ -919,6 +924,7 @@ jobs:
         run: |
           echo "Install ExecuTorch python"
           export TORCHCHAT_ROOT=$PWD
+          export ET_BUILD_DIR="et-build"
           ENABLE_ET_PYBIND="${1:-true}"
           source "torchchat/utils/scripts/install_utils.sh"
           install_executorch_python_libs $ENABLE_ET_PYBIND
@@ -1067,14 +1073,12 @@ jobs:
           echo "et-git-hash=$(cat ${TORCHCHAT_ROOT}/install/.pins/et-pin.txt)" >> "$GITHUB_ENV"
       - name: Load or install ET
         id: install-et
-        uses: actions/cache@v3
-        env:
-          cache-key: et-build-${{runner.os}}-${{runner.arch}}-${{env.et-git-hash}}
+        uses: actions/cache@v4
         with:
-          path: ./et-build
-          key: ${{env.cache-key}}
-          restore-keys: |
-            ${{env.cache-key}}
+          path: |
+            ./et-build
+            ./torchchat/utils/scripts
+          key: et-build-${{runner.os}}-${{runner.arch}}-${{env.et-git-hash}}-${{ hashFiles('**/install_et.sh') }}
       - if: ${{ steps.install-et.outputs.cache-hit != 'true' }}
         continue-on-error: true
         run: |
@@ -1084,6 +1088,7 @@ jobs:
         run: |
           echo "Install ExecuTorch python"
           export TORCHCHAT_ROOT=$PWD
+          export ET_BUILD_DIR="et-build"
           ENABLE_ET_PYBIND="${1:-true}"
           source "torchchat/utils/scripts/install_utils.sh"
           install_executorch_python_libs $ENABLE_ET_PYBIND

From 28566749f69a6c44ce179986877d00dce5e4b9dd Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Fri, 27 Sep 2024 11:05:04 -0700
Subject: [PATCH 10/15] fixes

---
 .github/workflows/pull.yml | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index f0d411400..21992c03e 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -734,9 +734,10 @@ jobs:
 
           echo "Tests complete."
   test-mps-dtype:
-    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
-    with:
-      runner: macos-m1-stable  # needs MPS, was macos-m1-stable
+    strategy:
+      matrix:
+        runner: [macos-m1-stable ]
+    runs-on: ${{matrix.runner}}
       steps:
         - name: Setup Python
           uses: actions/setup-python@v2

From c5ce896b4acdb4a2b5094c94b38f2fa53b4c3652 Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Fri, 27 Sep 2024 11:08:40 -0700
Subject: [PATCH 11/15] fixes

---
 .github/workflows/pull.yml | 124 ++++++++++++++++++-------------------
 1 file changed, 62 insertions(+), 62 deletions(-)

diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 21992c03e..f04ab986d 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -738,68 +738,68 @@ jobs:
       matrix:
         runner: [macos-m1-stable ]
     runs-on: ${{matrix.runner}}
-      steps:
-        - name: Setup Python
-          uses: actions/setup-python@v2
-          with:
-            python-version: 3.10.11
-        - name: Run test
-          run: |
-            export PYTHON_VERSION="3.10"
-            set -x
-            # NS/MC: Remove previous installation of torch and torchao first
-            # as this script does not install anything into conda env but rather as system dep
-            pip3 uninstall -y torch || true
-            set -eou pipefail
-
-            pip3 uninstall -y torchao || true
-            set -eou pipefail
-
-            echo "::group::Print machine info"
-            uname -a
-            sysctl machdep.cpu.brand_string
-            sysctl machdep.cpu.core_count
-            echo "::endgroup::"
-
-            echo "::group::Install requirements"
-            # Install requirements
-            ./install/install_requirements.sh
-            ls -la
-            pwd
-            pip3 list
-            python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
-            echo "::endgroup::"
-
-            echo "::group::Download checkpoints"
-            (
-              mkdir -p checkpoints/stories15M
-              pushd checkpoints/stories15M
-              curl -fsSL -O https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
-              curl -fsSL -O https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
-              popd
-            )
-            echo "::endgroup::"
-
-            echo "::group::Run inference"
-            export MODEL_PATH=checkpoints/stories15M/stories15M.pt
-            export MODEL_NAME=stories15M
-            export MODEL_DIR=/tmp
-            for DTYPE in float16 float32; do
-              # if [ $(uname -s) == Darwin ]; then
-              #   export DTYPE=float16
-              # fi
-
-              python3 torchchat.py generate --dtype ${DTYPE} --device mps --checkpoint-path ${MODEL_PATH} --temperature 0
-
-              python3 torchchat.py generate --dtype ${DTYPE} --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0
-
-              python3 torchchat.py generate --dtype ${DTYPE} --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0
-
-              python3 torchchat.py generate --dtype ${DTYPE} --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0
-
-              python3 torchchat.py generate --dtype ${DTYPE} --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0
-
-              PYTORCH_ENABLE_MPS_FALLBACK=1 python3 torchchat.py generate --dtype ${DTYPE} --device mps --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0
+    steps:
+      - name: Setup Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.10.11
+      - name: Run test
+        run: |
+          export PYTHON_VERSION="3.10"
+          set -x
+          # NS/MC: Remove previous installation of torch and torchao first
+          # as this script does not install anything into conda env but rather as system dep
+          pip3 uninstall -y torch || true
+          set -eou pipefail
+
+          pip3 uninstall -y torchao || true
+          set -eou pipefail
+
+          echo "::group::Print machine info"
+          uname -a
+          sysctl machdep.cpu.brand_string
+          sysctl machdep.cpu.core_count
+          echo "::endgroup::"
+
+          echo "::group::Install requirements"
+          # Install requirements
+          ./install/install_requirements.sh
+          ls -la
+          pwd
+          pip3 list
+          python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
+          echo "::endgroup::"
+
+          echo "::group::Download checkpoints"
+          (
+            mkdir -p checkpoints/stories15M
+            pushd checkpoints/stories15M
+            curl -fsSL -O https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
+            curl -fsSL -O https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
+            popd
+          )
+          echo "::endgroup::"
+
+          echo "::group::Run inference"
+          export MODEL_PATH=checkpoints/stories15M/stories15M.pt
+          export MODEL_NAME=stories15M
+          export MODEL_DIR=/tmp
+          for DTYPE in float16 float32; do
+            # if [ $(uname -s) == Darwin ]; then
+            #   export DTYPE=float16
+            # fi
+
+            python3 torchchat.py generate --dtype ${DTYPE} --device mps --checkpoint-path ${MODEL_PATH} --temperature 0
+
+            python3 torchchat.py generate --dtype ${DTYPE} --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0
+
+            python3 torchchat.py generate --dtype ${DTYPE} --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0
+
+            python3 torchchat.py generate --dtype ${DTYPE} --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0
+
+            python3 torchchat.py generate --dtype ${DTYPE} --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0
+
+            PYTORCH_ENABLE_MPS_FALLBACK=1 python3 torchchat.py generate --dtype ${DTYPE} --device mps --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0
   compile-gguf:
     strategy:
       matrix:

From 871ce4ae2b1c86384a4d8811b95ba211b7e5f181 Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Fri, 27 Sep 2024 11:15:30 -0700
Subject: [PATCH 12/15] fixes

---
 .github/workflows/pull.yml | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index f04ab986d..9ff14d7a6 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -739,10 +739,19 @@ jobs:
         runner: [macos-m1-stable ]
     runs-on: ${{matrix.runner}}
     steps:
+      - name: Checkout repo
+        uses: actions/checkout@v2
       - name: Setup Python
         uses: actions/setup-python@v2
         with:
           python-version: 3.10.11
+      - name: Print machine info
+        run: |
+          uname -a
+          if [ $(uname -s) == Darwin ]; then
+            sysctl machdep.cpu.brand_string
+            sysctl machdep.cpu.core_count
+          fi
       - name: Run test
         run: |
           export PYTHON_VERSION="3.10"

From 5f4f1d19ec75f47a2d5576341674a0e4bf6981a6 Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Fri, 27 Sep 2024 11:18:23 -0700
Subject: [PATCH 13/15] fixes

---
 .github/workflows/pull.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 9ff14d7a6..07119e20f 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -809,6 +809,7 @@ jobs:
             python3 torchchat.py generate --dtype ${DTYPE} --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0
 
             PYTORCH_ENABLE_MPS_FALLBACK=1 python3 torchchat.py generate --dtype ${DTYPE} --device mps --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0
+          end
   compile-gguf:
     strategy:
       matrix:

From 667d78ab619e84681a148998881ff8e2d6495786 Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Fri, 27 Sep 2024 11:23:39 -0700
Subject: [PATCH 14/15] fixes

---
 .github/workflows/pull.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 07119e20f..10dd40e44 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -809,7 +809,7 @@ jobs:
             python3 torchchat.py generate --dtype ${DTYPE} --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0
 
             PYTORCH_ENABLE_MPS_FALLBACK=1 python3 torchchat.py generate --dtype ${DTYPE} --device mps --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0
-          end
+          done
   compile-gguf:
     strategy:
       matrix:

From 5ab3f26b93deb3c8d0a7e865afc524fec87b3a65 Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Fri, 27 Sep 2024 11:39:45 -0700
Subject: [PATCH 15/15] fixes

---
 .github/workflows/pull.yml | 124 +++++++++++++++++++++----------------
 1 file changed, 69 insertions(+), 55 deletions(-)

diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 10dd40e44..9d3ad63e3 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -621,71 +621,87 @@ jobs:
           python torchchat.py remove stories15m
 
   test-mps:
-    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
-    with:
-      runner: macos-m1-stable  # neeps MPS, was macos-m1-stable
-      script: |
-        export PYTHON_VERSION="3.10"
-        set -x
-        # NS/MC: Remove previous installation of torch and torchao first
-        # as this script does not install anything into conda env but rather as system dep
-        pip3 uninstall -y torch || true
-        set -eou pipefail
+    strategy:
+      matrix:
+        runner: [macos-m1-stable ]
+    runs-on: ${{matrix.runner}}
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v2
+      - name: Setup Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.10.11
+      - name: Print machine info
+        run: |
+          uname -a
+          if [ $(uname -s) == Darwin ]; then
+            sysctl machdep.cpu.brand_string
+            sysctl machdep.cpu.core_count
+          fi
+      - name: Run test
+        run: |
+          export PYTHON_VERSION="3.10"
+          set -x
+          # NS/MC: Remove previous installation of torch and torchao first
+          # as this script does not install anything into conda env but rather as system dep
+          pip3 uninstall -y torch || true
+          set -eou pipefail
 
-        pip3 uninstall -y torchao || true
-        set -eou pipefail
+          pip3 uninstall -y torchao || true
+          set -eou pipefail
 
-        echo "::group::Print machine info"
-        uname -a
-        sysctl machdep.cpu.brand_string
-        sysctl machdep.cpu.core_count
-        echo "::endgroup::"
+          echo "::group::Print machine info"
+          uname -a
+          sysctl machdep.cpu.brand_string
+          sysctl machdep.cpu.core_count
+          echo "::endgroup::"
 
-        echo "::group::Install requirements"
-        # Install requirements
-        ./install/install_requirements.sh
-        ls -la
-        pwd
-        pip3 list
-        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
-        echo "::endgroup::"
+          echo "::group::Install requirements"
+          # Install requirements
+          ./install/install_requirements.sh
+          ls -la
+          pwd
+          pip3 list
+          python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
+          echo "::endgroup::"
 
-        echo "::group::Download checkpoints"
-        (
-          mkdir -p checkpoints/stories15M
-          pushd checkpoints/stories15M
-          curl -fsSL -O https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
-          curl -fsSL -O https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
-          popd
-        )
-        echo "::endgroup::"
+          echo "::group::Download checkpoints"
+          (
+            mkdir -p checkpoints/stories15M
+            pushd checkpoints/stories15M
+            curl -fsSL -O https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
+            curl -fsSL -O https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
+            popd
+          )
+          echo "::endgroup::"
 
-        echo "::group::Run inference"
-        export MODEL_PATH=checkpoints/stories15M/stories15M.pt
-        export MODEL_NAME=stories15M
-        export MODEL_DIR=/tmp
+          echo "::group::Run inference"
+          export MODEL_PATH=checkpoints/stories15M/stories15M.pt
+          export MODEL_NAME=stories15M
+          export MODEL_DIR=/tmp
 
-        python3 torchchat.py generate --device mps --checkpoint-path ${MODEL_PATH} --temperature 0
+          python3 torchchat.py generate --device mps --checkpoint-path ${MODEL_PATH} --temperature 0
 
-        echo "************************************************************"
-        echo "*** embedding"
-        echo "************************************************************"
+          echo "************************************************************"
+          echo "*** embedding"
+          echo "************************************************************"
 
-        python3 torchchat.py generate --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0
-        python3 torchchat.py generate --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0
+          python3 torchchat.py generate --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0
+          python3 torchchat.py generate --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0
 
-        echo "************************************************************"
-        echo "*** linear int8"
-        echo "************************************************************"
+          echo "************************************************************"
+          echo "*** linear int8"
+          echo "************************************************************"
 
-        python3 torchchat.py generate --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0
-        python3 torchchat.py generate --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0
+          python3 torchchat.py generate --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0
+          python3 torchchat.py generate --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0
 
-        echo "************************************************************"
-        echo "*** linear int4"
-        echo "************************************************************"
+          echo "************************************************************"
+          echo "*** linear int4"
+          echo "************************************************************"
 
-        PYTORCH_ENABLE_MPS_FALLBACK=1 python3 torchchat.py generate --device mps --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0
+          PYTORCH_ENABLE_MPS_FALLBACK=1 python3 torchchat.py generate --device mps --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0
   test-gguf-util:
     strategy:
       matrix:
@@ -939,7 +955,6 @@ jobs:
           ENABLE_ET_PYBIND="${1:-true}"
           source "torchchat/utils/scripts/install_utils.sh"
           install_executorch_python_libs $ENABLE_ET_PYBIND
-          popd
       - name: Install runner
         run: |
           echo "Installing runner"
@@ -1103,7 +1118,6 @@ jobs:
           ENABLE_ET_PYBIND="${1:-true}"
           source "torchchat/utils/scripts/install_utils.sh"
           install_executorch_python_libs $ENABLE_ET_PYBIND
-          popd
       - name: Install runner
         run: |
           echo "Installing runner"