diff --git a/.ci/pytorch/test.sh b/.ci/pytorch/test.sh
index b8db850a3943..01075259e9fe 100755
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@@ -1250,6 +1250,97 @@ test_custom_script_ops() {
   assert_git_not_dirty
 }
 
+test_libtorch_agnostic_targetting() {
+  echo "Testing that a libtorch_agnostic extension built with TORCH_TARGET_VERSION 2.9 runs correctly on a PyTorch 2.9 runtime"
+
+  REPO_DIR=$(pwd)
+  WHEEL_DIR="${REPO_DIR}/test/cpp_extensions/.wheels"
+
+  # Build wheel with current PyTorch (this has TORCH_TARGET_VERSION 2_9_0)
+  echo "Building 2.9 extension wheel with current PyTorch..."
+  pushd test/cpp_extensions/libtorch_agnostic_2_9_extension
+  time python setup.py bdist_wheel
+
+  # Save the wheel
+  mkdir -p "$WHEEL_DIR"
+  cp dist/*.whl "$WHEEL_DIR/"
+  WHEEL_FILE=$(find "$WHEEL_DIR" -maxdepth 1 -name "*.whl" -type f | head -1)
+  echo "Built wheel: $(basename "$WHEEL_FILE")"
+  popd
+
+  # Create venv and install PyTorch 2.9
+  python -m venv venv_pytorch_2_9
+  # shellcheck disable=SC1091
+  . venv_pytorch_2_9/bin/activate
+
+  # Clear PYTHONPATH to avoid using the development PyTorch
+  echo "Clearing PYTHONPATH to use only venv packages..."
+  unset PYTHONPATH
+
+  # Upgrade pip to latest version
+  echo "Upgrading pip to latest version..."
+  pip install --upgrade pip
+  pip --version
+
+  echo "Installing PyTorch 2.9..."
+
+  # Install from release channel only
+  PYTORCH_VERSION="2.9.0"
+
+  # Extract CUDA version from BUILD_ENVIRONMENT (e.g., "cuda12.1" -> "cu121")
+  if [[ "$BUILD_ENVIRONMENT" =~ cuda([0-9]+)\.([0-9]+) ]]; then
+    CUDA_MAJOR="${BASH_REMATCH[1]}"
+    CUDA_MINOR="${BASH_REMATCH[2]}"
+    CUDA_VERSION="cu${CUDA_MAJOR}${CUDA_MINOR}"
+    echo " Detected CUDA ${CUDA_MAJOR}.${CUDA_MINOR} from BUILD_ENVIRONMENT, using ${CUDA_VERSION}"
+  else
+    # Default to CPU build
+    CUDA_VERSION="cpu"
+    echo " No CUDA detected in BUILD_ENVIRONMENT, using CPU build"
+  fi
+
+  if pip install torch=="${PYTORCH_VERSION}" --index-url https://download.pytorch.org/whl/${CUDA_VERSION}/; then
+    echo "Installed PyTorch ${PYTORCH_VERSION} from release channel (${CUDA_VERSION})"
+  else
+    echo " FAILED to install PyTorch 2.9.0 from release channel"
+    echo " URL: https://download.pytorch.org/whl/${CUDA_VERSION}/"
+    deactivate
+    rm -rf venv_pytorch_2_9
+    return 1
+  fi
+
+  INSTALLED_VERSION=$(python -c "import torch; print(torch.__version__)" 2>/dev/null || echo "unknown")
+  echo " Installed version: $INSTALLED_VERSION"
+
+  # Install test dependencies
+  echo "Installing test dependencies..."
+  pip install expecttest numpy unittest-xml-reporting
+
+  # Install the pre-built wheel
+  echo ""
+  echo "Installing pre-built 2.9 extension wheel (built with PyTorch 2.10)..."
+  pip install "$WHEEL_FILE"
+  echo "Installed $(basename "$WHEEL_FILE") into PyTorch 2.9 environment"
+
+  # Run tests with PyTorch 2.9 runtime (2.10 tests will be skipped automatically)
+  echo ""
+  echo "Running tests with PyTorch 2.9 runtime (using wheel built on PyTorch 2.10)..."
+  if time python test/cpp_extensions/test_libtorch_agnostic.py -v; then
+    echo ""
+    echo " Wheel built with current torch and TORCH_TARGET_VERSION 2_9_0 works with PyTorch 2.9 runtime!"
+  else
+    echo "libtorch_agnostic targeting test FAILED"
+    deactivate
+    rm -rf venv_pytorch_2_9 "$WHEEL_DIR"
+    return 1
+  fi
+
+  deactivate
+  rm -rf venv_pytorch_2_9 "$WHEEL_DIR"
+
+  assert_git_not_dirty
+}
+
 test_jit_hooks() {
   echo "Testing jit hooks in cpp"
   HOOK_BUILD="${CUSTOM_TEST_ARTIFACT_BUILD_DIR}/jit-hook-build"
@@ -1722,6 +1813,8 @@ elif [[ "${BUILD_ENVIRONMENT}" == *aarch64* && "${TEST_CONFIG}" == 'default' ]];
 elif [[ "${TEST_CONFIG}" == *backward* ]]; then
   test_forward_backward_compatibility
   # Do NOT add tests after bc check tests, see its comment.
+elif [[ "${TEST_CONFIG}" == *libtorch_agnostic_targetting* ]]; then
+  test_libtorch_agnostic_targetting
 elif [[ "${TEST_CONFIG}" == *xla* ]]; then
   install_torchvision
   build_xla
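For context on what this new test job exercises: the 2.9 extension wheel is built against the current (2.10-era) headers but with TORCH_TARGET_VERSION pinned to 2.9, so the resulting binary may only rely on stable-ABI surface that already existed in PyTorch 2.9. The sketch below illustrates the idea only; the TORCH_VERSION_2_10_0 constant and the comparison are hypothetical placeholders, not the actual PyTorch mechanism.

    /* Hypothetical illustration: gate newer stable-ABI usage on the targeted version.
     * TORCH_TARGET_VERSION is set by the extension build (see the 2.9 extension's setup.py);
     * TORCH_VERSION_2_10_0 is an assumed placeholder, not a real PyTorch macro. */
    #if defined(TORCH_TARGET_VERSION) && defined(TORCH_VERSION_2_10_0) && \
        (TORCH_TARGET_VERSION >= TORCH_VERSION_2_10_0)
      // May call stable APIs introduced in 2.10.
    #else
      // Must stick to APIs that were already stable in 2.9, which is why the wheel
      // built here still loads and runs on a 2.9 runtime.
    #endif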
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index e5fd10c70db6..51e211a5ad2a 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -70,6 +70,7 @@ jobs:
           { config: "distributed", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
           { config: "distributed", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
           { config: "numpy_2_x", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
+          { config: "libtorch_agnostic_targetting", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
         ]}
     secrets: inherit
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index 6ba810c3a958..667c37727045 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -83,6 +83,7 @@ jobs:
           { config: "distributed", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
           { config: "distributed", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
           { config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
+          { config: "libtorch_agnostic_targetting", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
         ]}
     secrets: inherit
diff --git a/test/cpp_extensions/libtorch_agnostic_extension/libtorch_agnostic/__init__.py b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/__init__.py
similarity index 100%
rename from test/cpp_extensions/libtorch_agnostic_extension/libtorch_agnostic/__init__.py
rename to test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/__init__.py
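All of the new C++ sources below follow the same stable-ABI registration shape: declare the operator schema in a STABLE_TORCH_LIBRARY_FRAGMENT block for the libtorch_agnostic_2_10 namespace, then register a kernel wrapped in TORCH_BOX via STABLE_TORCH_LIBRARY_IMPL. A minimal sketch of that pattern follows; the my_identity op and the two header paths are illustrative assumptions, not part of this PR.

    #include <torch/csrc/stable/library.h>  // assumed header for the STABLE_TORCH_* macros
    #include <torch/csrc/stable/tensor.h>   // assumed header for torch::stable::Tensor

    using torch::stable::Tensor;

    // Trivial kernel: hands its input back unchanged.
    Tensor my_identity(Tensor t) {
      return t;
    }

    // Declare the schema in the extension's namespace.
    STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
      m.def("my_identity(Tensor t) -> Tensor");
    }

    // Register the boxed kernel for that schema.
    STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
      m.impl("my_identity", TORCH_BOX(&my_identity));
    }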
diff --git a/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/make_tensor_clones_and_call_foreach.cpp b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/make_tensor_clones_and_call_foreach.cpp
new file mode 100644
index 000000000000..d3dbab589139
--- /dev/null
+++ b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/make_tensor_clones_and_call_foreach.cpp
@@ -0,0 +1,41 @@
+#include 
+#include 
+#include 
+
+#include 
+
+using torch::stable::Tensor;
+
+// Declare my__foreach_mul (defined in my__foreach_mul.cpp)
+extern std::vector<Tensor> my__foreach_mul(
+    torch::headeronly::HeaderOnlyArrayRef<Tensor> self,
+    torch::headeronly::HeaderOnlyArrayRef<Tensor> other);
+
+// Helper function for cloning
+Tensor my_clone(Tensor t) {
+  return clone(t);
+}
+
+std::vector<Tensor> make_tensor_clones_and_call_foreach(Tensor t1, Tensor t2) {
+  // This function tests that my__foreach_mul can take in std::initializer_lists
+  // in addition to std::vectors.
+  Tensor t1_1 = my_clone(t1);
+  Tensor t1_2 = my_clone(t1);
+  Tensor t2_1 = my_clone(t2);
+  Tensor t2_2 = my_clone(t2);
+  return my__foreach_mul({t1_1, t2_1}, {t1_2, t2_2});
+}
+
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
+  m.def(
+      "make_tensor_clones_and_call_foreach(Tensor t1, Tensor t2) -> Tensor[]");
+}
+
+STABLE_TORCH_LIBRARY_IMPL(
+    libtorch_agnostic_2_10,
+    CompositeExplicitAutograd,
+    m) {
+  m.impl(
+      "make_tensor_clones_and_call_foreach",
+      TORCH_BOX(&make_tensor_clones_and_call_foreach));
+}
diff --git a/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/mv_tensor_accessor_cpu.cpp b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/mv_tensor_accessor_cpu.cpp
new file mode 100644
index 000000000000..705439efffe6
--- /dev/null
+++ b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/mv_tensor_accessor_cpu.cpp
@@ -0,0 +1,40 @@
+// This is duplicated from the libtorch_agnostic_2_9_extension
+// as a negative test for test_version_compatibility.py
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "tensor_accessor_kernel.h"
+
+using torch::stable::Tensor;
+
+Tensor mv_tensor_accessor_cpu(Tensor m, Tensor v) {
+  STD_TORCH_CHECK(m.dim() == 2, "m must be 2D");
+  STD_TORCH_CHECK(v.dim() == 1, "v must be 1D");
+  STD_TORCH_CHECK(m.size(1) == v.size(0), "m.shape[1] == v.shape[0] must hold");
+  STD_TORCH_CHECK(m.scalar_type() == v.scalar_type(), "m and v must have the same dtype");
+  STD_TORCH_CHECK(m.device() == v.device(), "m and v must be on the same device");
+  Tensor res = new_empty(m, {m.size(0)});
+  THO_DISPATCH_V2(m.scalar_type(), "mv_tensor_accessor_cpu",
+    AT_WRAP(([&]() {
+      auto resa = Accessor_cpu<scalar_t, 1>(reinterpret_cast<scalar_t*>(res.data_ptr()), res.sizes().data(), res.strides().data());
+      auto ma = Accessor_cpu<scalar_t, 2>(reinterpret_cast<scalar_t*>(m.data_ptr()), m.sizes().data(), m.strides().data());
+      auto va = Accessor_cpu<scalar_t, 1>(reinterpret_cast<scalar_t*>(v.data_ptr()), v.sizes().data(), v.strides().data());
+      mv_tensor_accessor_kernel(resa, ma, va);
+    })),
+    AT_FLOATING_TYPES);
+  return res;
+}
+
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
+  m.def("mv_tensor_accessor_cpu(Tensor m, Tensor v) -> Tensor");
+}
+
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
+  m.impl("mv_tensor_accessor_cpu", TORCH_BOX(&mv_tensor_accessor_cpu));
+}
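The dispatch lambda above builds accessors from data_ptr(), sizes(), and strides() so that a shared kernel can index the tensors generically. For reference, the strided matrix-vector product those accessors express is equivalent to the plain-C++ sketch below; it is illustrative only and not part of the extension.

    #include <cstdint>

    // res[i] = sum_j m[i, j] * v[j], spelled out with the explicit strides an
    // accessor would resolve.
    template <typename T>
    void mv_strided(T* res, const T* m, const T* v,
                    const int64_t* m_sizes, const int64_t* m_strides,
                    int64_t v_stride, int64_t res_stride) {
      for (int64_t i = 0; i < m_sizes[0]; ++i) {
        T acc = T(0);
        for (int64_t j = 0; j < m_sizes[1]; ++j) {
          acc += m[i * m_strides[0] + j * m_strides[1]] * v[j * v_stride];
        }
        res[i * res_stride] = acc;
      }
    }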
diff --git a/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/mv_tensor_accessor_cuda.cu b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/mv_tensor_accessor_cuda.cu
new file mode 100644
index 000000000000..7773210a089e
--- /dev/null
+++ b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/mv_tensor_accessor_cuda.cu
@@ -0,0 +1,47 @@
+// This is duplicated from the libtorch_agnostic_2_9_extension
+// as a negative test for test_version_compatibility.py
+
+#include "tensor_accessor_kernel.h"
+
+#include 
+#include 
+#include 
+#include 
+
+using torch::stable::Tensor;
+
+Tensor mv_tensor_accessor_cuda(Tensor m, Tensor v) {
+  STD_TORCH_CHECK(m.dim() == 2, "m must be 2D");
+  STD_TORCH_CHECK(v.dim() == 1, "v must be 1D");
+  STD_TORCH_CHECK(m.size(1) == v.size(0), "m.shape[1] == v.shape[0] must hold");
+  STD_TORCH_CHECK(
+      m.scalar_type() == v.scalar_type(), "m and v must have the same dtype");
+  STD_TORCH_CHECK(
+      m.device() == v.device(), "m and v must be on the same device");
+  Tensor res = new_empty(m, {m.size(0)});
+  THO_DISPATCH_V2(
+      m.scalar_type(),
+      "mv_tensor_accessor_cuda",
+      AT_WRAP(([&]() {
+        auto resa = Accessor_cuda<scalar_t, 1>(
+            reinterpret_cast<scalar_t*>(res.data_ptr()),
+            res.sizes().data(),
+            res.strides().data());
+        auto ma = Accessor_cuda<scalar_t, 2>(
+            reinterpret_cast<scalar_t*>(m.data_ptr()),
+            m.sizes().data(),
+            m.strides().data());
+        auto va = Accessor_cuda<scalar_t, 1>(
+            reinterpret_cast<scalar_t*>(v.data_ptr()),
+            v.sizes().data(),
+            v.strides().data());
+        mv_tensor_accessor_kernel
+            <<<1, 1, 0, 0>>>(resa, ma, va);
+      })),
+      AT_FLOATING_TYPES);
+  return res;
+}
+
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CUDA, m) {
+  m.impl("mv_tensor_accessor", TORCH_BOX(&mv_tensor_accessor_cuda));
+}
diff --git a/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my__foreach_mul.cpp b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my__foreach_mul.cpp
new file mode 100644
index 000000000000..834a63afea64
--- /dev/null
+++ b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my__foreach_mul.cpp
@@ -0,0 +1,20 @@
+#include 
+#include 
+#include 
+#include 
+
+using torch::stable::Tensor;
+
+std::vector<Tensor> my__foreach_mul(torch::headeronly::HeaderOnlyArrayRef<Tensor> self, torch::headeronly::HeaderOnlyArrayRef<Tensor> other) {
+  std::array<StableIValue, 2> stack = {torch::stable::detail::from(self), torch::stable::detail::from(other)};
+  aoti_torch_call_dispatcher("aten::_foreach_mul", "List", stack.data());
+  return torch::stable::detail::to<std::vector<Tensor>>(stack[0]);
+}
+
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
+  m.def("my__foreach_mul(Tensor[] self, Tensor[] other) -> Tensor[]");
+}
+
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
+  m.impl("my__foreach_mul", TORCH_BOX(&my__foreach_mul));
+}
diff --git a/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my__foreach_mul_.cpp b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my__foreach_mul_.cpp
new file mode 100644
index 000000000000..8409e6890bdd
--- /dev/null
+++ b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my__foreach_mul_.cpp
@@ -0,0 +1,19 @@
+#include 
+#include 
+#include 
+#include 
+
+using torch::stable::Tensor;
+
+void my__foreach_mul_(torch::headeronly::HeaderOnlyArrayRef<Tensor> self, torch::headeronly::HeaderOnlyArrayRef<Tensor> other) {
+  std::array<StableIValue, 2> stack = {torch::stable::detail::from(self), torch::stable::detail::from(other)};
+  aoti_torch_call_dispatcher("aten::_foreach_mul_", "List", stack.data());
+}
+
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
+  m.def("my__foreach_mul_(Tensor(a!)[] self, Tensor[] other) -> ()");
+}
+
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
+  m.impl("my__foreach_mul_", TORCH_BOX(&my__foreach_mul_));
+}
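my__foreach_mul and my__foreach_mul_ above reach ATen through the boxed convention: each input is converted to a StableIValue with torch::stable::detail::from, the values are placed on a small stack, aoti_torch_call_dispatcher invokes the named overload, and outputs are read back off the stack with torch::stable::detail::to. A sketch of the same pattern for a single-tensor op follows; the aten::mul "Tensor" overload is used purely for illustration, and the stable-ABI declarations are assumed to come from the same headers these files include.

    #include <array>

    using torch::stable::Tensor;

    Tensor my_mul(Tensor self, Tensor other) {
      // Box the two inputs onto a StableIValue stack.
      std::array<StableIValue, 2> stack = {
          torch::stable::detail::from(self),
          torch::stable::detail::from(other)};
      // Dispatch to aten::mul (overload "Tensor"); the output overwrites the stack.
      aoti_torch_call_dispatcher("aten::mul", "Tensor", stack.data());
      // Unbox the single output tensor.
      return torch::stable::detail::to<Tensor>(stack[0]);
    }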
diff --git a/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my_empty.cpp b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my_empty.cpp
new file mode 100644
index 000000000000..6278dca9f281
--- /dev/null
+++ b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my_empty.cpp
@@ -0,0 +1,25 @@
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+using torch::stable::Tensor;
+
+Tensor my_empty(
+    torch::headeronly::HeaderOnlyArrayRef<int64_t> size,
+    std::optional<torch::headeronly::ScalarType> dtype,
+    std::optional<torch::stable::Device> device,
+    std::optional<bool> pin_memory) {
+  return empty(size, dtype, device, pin_memory);
+}
+
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
+  m.def(
+      "my_empty(int[] size, ScalarType? dtype=None, Device? device=None, bool? pin_memory=None) -> Tensor");
+}
+
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
+  m.impl("my_empty", TORCH_BOX(&my_empty));
+}
diff --git a/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my_reshape.cpp b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my_reshape.cpp
new file mode 100644
index 000000000000..0a2b1f70f215
--- /dev/null
+++ b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my_reshape.cpp
@@ -0,0 +1,17 @@
+#include 
+#include 
+#include 
+
+using torch::stable::Tensor;
+
+Tensor my_reshape(Tensor t, torch::headeronly::HeaderOnlyArrayRef<int64_t> shape) {
+  return reshape(t, shape);
+}
+
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
+  m.def("my_reshape(Tensor t, int[] shape) -> Tensor");
+}
+
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
+  m.impl("my_reshape", TORCH_BOX(&my_reshape));
+}
diff --git a/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my_view.cpp b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my_view.cpp
new file mode 100644
index 000000000000..25d8c5458924
--- /dev/null
+++ b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my_view.cpp
@@ -0,0 +1,20 @@
+#include 
+#include 
+#include 
+
+using torch::stable::Tensor;
+
+Tensor my_view(Tensor t, torch::headeronly::HeaderOnlyArrayRef<int64_t> size) {
+  return view(t, size);
+}
+
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
+  m.def("my_view(Tensor t, int[] size) -> Tensor");
+}
+
+STABLE_TORCH_LIBRARY_IMPL(
+    libtorch_agnostic_2_10,
+    CompositeExplicitAutograd,
+    m) {
+  m.impl("my_view", TORCH_BOX(&my_view));
+}
diff --git a/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/tensor_accessor_kernel.h b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/tensor_accessor_kernel.h
new file mode 100644
index 000000000000..f1031f38060c
--- /dev/null
+++ b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/tensor_accessor_kernel.h
@@ -0,0 +1,28 @@
+#pragma once
+
+#include 
+#include 
+
+template <typename T, size_t N>
+using Accessor_cpu = torch::headeronly::HeaderOnlyTensorAccessor<T, N>;
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+#define MAYBE_GLOBAL __global__
+
+template <typename T, size_t N>
+using Accessor_cuda = torch::headeronly::HeaderOnlyGenericPackedTensorAccessor<T, N>;
+
+#else
+#define MAYBE_GLOBAL
+#endif
+
+template
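The tensor_accessor_kernel.h hunk is cut off at its final template declaration. Judging from the CPU and CUDA callers above, the missing piece is an accessor-based mv kernel roughly like the sketch below; this is a reconstruction under the assumption that the header-only accessors expose size() and operator[] like at::TensorAccessor, not the file's actual contents.

    #include <cstdint>
    #include <type_traits>

    // MAYBE_GLOBAL expands to __global__ under CUDA/HIP and to nothing on CPU, so the
    // same serial body serves both the direct CPU call and the <<<1, 1, 0, 0>>> launch.
    template <typename ResAcc, typename MAcc, typename VAcc>
    MAYBE_GLOBAL void mv_tensor_accessor_kernel(ResAcc res, MAcc m, VAcc v) {
      using scalar_t = std::decay_t<decltype(res[0])>;
      for (int64_t i = 0; i < res.size(0); ++i) {
        scalar_t acc = scalar_t(0);
        for (int64_t j = 0; j < v.size(0); ++j) {
          acc += m[i][j] * v[j];  // res[i] = sum_j m[i, j] * v[j]
        }
        res[i] = acc;
      }
    }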